Compare commits

..

1 commit

Author            SHA1        Message                          Date
Abhishek Kumawat  9e0c8a82cb  Litellm support in llama stack:  2025-02-03 06:15:09 -08:00
1129 changed files with 50998 additions and 480616 deletions

View file

@ -1,6 +0,0 @@
[run]
omit =
*/tests/*
*/llama_stack/providers/*
*/llama_stack/templates/*
.venv/*

.flake8 (new file, 31 lines)

@ -0,0 +1,31 @@
[flake8]
# Suggested config from pytorch that we can adapt
select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
# N812 ignored because import torch.nn.functional as F is PyTorch convention
# N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
# E731 allow usage of assigning lambda expressions
# E701 let black auto-format statements on one line
# E704 let black auto-format statements on one line
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# random naming hints don't need
N802,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B950
optional-ascii-coding = True
exclude =
./.git,
./docs/*,
./build,
./scripts,
./venv,
*.pyi,
.pre-commit-config.yaml,
*.md,
.flake8
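
For context (not part of this changeset), a minimal sketch of a CI step that would apply this config; the step name is illustrative and the plugin pins mirror the pre-commit configuration shown further below:

    - name: Run flake8
      run: |
        # plugins provide the B*, N*, and TOR* codes selected above
        pip install flake8 flake8-bugbear pep8-naming torchfix
        flake8 --config=.flake8 .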

.github/CODEOWNERS (vendored, 2 lines changed)

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721

View file

@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
labels: ["bug"]
body:
- type: markdown
attributes:

View file

@ -1,12 +0,0 @@
blank_issues_enabled: false
contact_links:
- name: Have you read the docs?
url: https://llama-stack.readthedocs.io/en/latest/index.html
about: Much help can be found in the docs
- name: Start a discussion
url: https://github.com/meta-llama/llama-stack/discussions/new
about: Start a discussion on a topic
- name: Chat on Discord
url: https://discord.gg/llama-stack
about: Maybe chatting with the community can help

View file

@ -1,6 +1,6 @@
name: 🚀 Feature request
description: Request a new llama-stack feature
labels: ["enhancement"]
body:
- type: textarea
id: feature-pitch

View file

@ -1,8 +1,27 @@
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->
In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
- [ ] Addresses issue (#issue)
## Test Plan
<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
Please describe:
- tests you ran to verify your changes with result summaries.
- provide instructions so it can be reproduced.
## Sources
Please link relevant resources if necessary.
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.

.github/TRIAGERS.md (vendored, 2 lines changed)

@ -1,2 +0,0 @@
# This file documents Triage members in the Llama Stack community
@bbrowning @booxter @franciscojavierarceo @leseb

View file

@ -1,26 +0,0 @@
name: Setup Ollama
description: Start Ollama and cache model
inputs:
models:
description: Comma-separated list of models to pull
default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
runs:
using: "composite"
steps:
- name: Install and start Ollama
shell: bash
run: |
# the ollama installer also starts the ollama service
curl -fsSL https://ollama.com/install.sh | sh
# Do NOT cache models - pulling the cache is actually slower than just pulling the model.
# It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
# pull them directly.
# Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
- name: Pull requested models
if: inputs.models != ''
shell: bash
run: |
for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
ollama pull "$model"
done
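
Not part of this diff: a workflow job would consume this composite action roughly as sketched below; the job name and model value are illustrative:

    jobs:
      integration:
        runs-on: ubuntu-latest
        steps:
          - name: Checkout repository
            uses: actions/checkout@v4
          - name: Setup ollama
            uses: ./.github/actions/setup-ollama
            with:
              models: "llama3.2:3b-instruct-fp16"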

View file

@ -1,22 +0,0 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
using: "composite"
steps:
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
version: 0.7.6
- name: Install dependencies
shell: bash
run: |
uv sync --all-groups
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .

View file

@ -1,23 +0,0 @@
# GitHub Dependabot configuration
version: 2
updates:
# Enable version updates for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/" # Will use the default workflow location of `.github/workflows`
schedule:
interval: "weekly"
day: "saturday"
commit-message:
prefix: chore(github-deps)
- package-ecosystem: "uv"
directory: "/"
schedule:
interval: "weekly"
day: "saturday"
# ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
open-pull-requests-limit: 0
labels:
- type/dependencies
- python
commit-message:
prefix: chore(python-deps)

View file

@ -1 +0,0 @@
FROM localhost:5000/distribution-kvant:dev

View file

@ -1,73 +0,0 @@
name: Build and Push playground container
run-name: Build and Push playground container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}-playground
jobs:
build-playground:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}
- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .
file: llama_stack/distribution/ui/Containerfile
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true

View file

@ -1,98 +0,0 @@
name: Build and Push container
run-name: Build and Push container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}
jobs:
build:
runs-on: ubuntu-latest
services:
registry:
image: registry:2
ports:
- 5000:5000
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver-opts: network=host
- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}
- name: Install uv
uses: https://github.com/astral-sh/setup-uv@v5
with:
# Install a specific version of uv.
version: "0.7.8"
- name: Build
env:
USE_COPY_NOT_MOUNT: true
LLAMA_STACK_DIR: .
run: |
uvx --from . llama stack build --template kvant --image-type container
# docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
# docker push ${{env.IMAGE}}:kvant
docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
docker push localhost:5000/distribution-kvant:dev
- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .github/workflows
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true

View file

@ -140,7 +140,7 @@ jobs:
#######################
- name: "Checkout 'meta-llama/llama-stack' repository"
id: checkout_repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@v4
with:
ref: ${{ inputs.branch }}
@ -302,7 +302,7 @@ jobs:
- name: "PR - Test Summary"
id: pr_test_summary_create
if: github.event_name == 'pull_request_target'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
uses: test-summary/action@v2
with:
paths: "${{ github.workspace }}/merged-test-results.xml"
output: test-summary.md
@ -310,7 +310,7 @@ jobs:
- name: "PR - Upload Test Summary"
id: pr_test_summary_upload
if: github.event_name == 'pull_request_target'
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@v3
with:
name: test-summary
path: test-summary.md
@ -320,7 +320,7 @@ jobs:
- name: "PR - Update comment"
id: pr_update_comment
if: github.event_name == 'pull_request_target'
uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
uses: thollander/actions-comment-pull-request@v2
with:
filePath: test-summary.md
@ -350,6 +350,6 @@ jobs:
- name: "Manual - Test Summary"
id: manual_test_summary
if: always() && github.event_name == 'workflow_dispatch'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
uses: test-summary/action@v2
with:
paths: "${{ github.workspace }}/merged-test-results.xml"

.github/workflows/pre-commit.yml (vendored, new file, 25 lines)

@ -0,0 +1,25 @@
name: Pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1

.github/workflows/publish-to-docker.yml (vendored, new file, 148 lines)

@ -0,0 +1,148 @@
name: Docker Build and Publish
on:
workflow_dispatch:
inputs:
version:
description: 'TestPyPI or PyPI version to build (e.g., 0.0.63.dev20250114)'
required: true
type: string
jobs:
build-and-push:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set version
id: version
run: |
if [ "${{ github.event_name }}" = "push" ]; then
echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
else
echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
fi
- name: Check package version availability
run: |
# Function to check if version exists in a repository
check_version() {
local repo=$1
local VERSION_TO_CHECK=${{ steps.version.outputs.version }}
echo "Checking version $VERSION_TO_CHECK in $repo"
result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
echo "Result: $result"
return $([ "$result" = "true" ])
}
# Check TestPyPI first, then PyPI
if check_version "test.pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in TestPyPI"
echo "PYPI_SOURCE=testpypi" >> $GITHUB_ENV
elif check_version "pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in PyPI"
echo "PYPI_SOURCE=pypi" >> $GITHUB_ENV
else
echo "Error: Version ${{ steps.version.outputs.version }} not found in either TestPyPI or PyPI"
exit 1
fi
- name: Install llama-stack
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
if [ "${{ github.event_name }}" = "push" ]; then
pip install -e .
else
if [ "$PYPI_SOURCE" = "testpypi" ]; then
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple llama-stack==${{ steps.version.outputs.version }}
else
pip install llama-stack==${{ steps.version.outputs.version }}
fi
fi
- name: Build docker image
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
else
PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
fi
done
- name: List docker images
run: |
docker images
# TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
- name: Start up built docker image
run: |
cd distributions/fireworks
if [ "$PYPI_SOURCE" = "testpypi" ]; then
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
else
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
fi
docker compose up -d
cd ..
# Wait for the container to start
timeout=300
while ! curl -s -f http://localhost:8321/v1/version > /dev/null && [ $timeout -gt 0 ]; do
echo "Waiting for endpoint to be available..."
sleep 5
timeout=$((timeout - 5))
done
if [ $timeout -le 0 ]; then
echo "Timeout waiting for endpoint to become available"
exit 1
fi
- name: Run simple models list test on docker server
run: |
curl http://localhost:8321/v1/models
# TODO (xiyan): figure out why client cannot find server but curl works
# - name: Run pytest on docker server
# run: |
# pip install pytest pytest-md-report
# export LLAMA_STACK_BASE_URL="http://localhost:8321"
# LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
- name: Push to dockerhub
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
docker tag distribution-$template:test-${{ steps.version.outputs.version }} llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
else
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:latest
fi
done
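
For orientation (not part of this changeset), the sed commands in the "Start up built docker image" step rewrite the image reference inside a compose file shaped roughly like the minimal sketch below; the service name is assumed for illustration and the real distributions/fireworks/compose.yaml carries additional configuration:

    services:
      llamastack:                               # service name illustrative
        image: llamastack/distribution-fireworks
        ports:
          - "8321:8321"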

View file

@ -0,0 +1,244 @@
name: Publish Python 🐍 distribution 📦 to TestPyPI
on:
workflow_dispatch: # Keep manual trigger
inputs:
version:
description: 'Version number (e.g. 0.0.63.dev20250111)'
required: true
type: string
schedule:
- cron: "0 0 * * *" # Run every day at midnight
jobs:
trigger-client-and-models-build:
name: Trigger llama-stack-client and llama-models build
runs-on: ubuntu-latest
outputs:
version: ${{ steps.version.outputs.version }}
client_run_id: ${{ steps.trigger-client.outputs.workflow_id }}
model_run_id: ${{ steps.trigger-models.outputs.workflow_id }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Compute version based on dispatch event
id: version
run: |
# Read base version from pyproject.toml
version=$(sed -n 's/.*version="\([^"]*\)".*/\1/p' setup.py)
if [ "${{ github.event_name }}" = "schedule" ]; then
echo "version=${version}.dev${{ steps.date.outputs.date }}" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT
else
echo "version=${version}.dev$(shuf -i 10000000-99999999 -n 1)" >> $GITHUB_OUTPUT
fi
- name: Trigger llama-stack-client workflow
id: trigger-client
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-stack-client-python/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-client-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger client workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
- name: Trigger llama-models workflow
id: trigger-models
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-models/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-models-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger models workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
wait-for-workflows:
name: Wait for triggered workflows
needs: trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- name: Wait for client workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.conclusion')
echo "llama-stack-client-python workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-stack-client-python workflow failed"
exit 1
fi
break
fi
sleep 10
done
- name: Wait for models workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.conclusion')
echo "llama-models workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-models workflow failed"
exit 1
fi
break
fi
sleep 10
done
build:
name: Build distribution 📦
needs:
- wait-for-workflows
- trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Update version for nightly
run: |
sed -i 's/version="\([^"]*\)"/version="${{ needs.trigger-client-and-models-build.outputs.version }}"/' setup.py
sed -i 's/llama-stack-client>=\([^"]*\)/llama-stack-client==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
sed -i 's/llama-models>=\([^"]*\)/llama-models==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
publish-to-testpypi:
name: Publish Python 🐍 distribution 📦 to TestPyPI
needs:
- build
runs-on: ubuntu-latest
environment:
name: testrelease
url: https://test.pypi.org/p/llama-stack
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Publish distribution 📦 to TestPyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
test-published-package:
name: Test published package
needs:
- publish-to-testpypi
- trigger-client-and-models-build
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install the package
run: |
max_attempts=6
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts to install package..."
if pip install --no-cache --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ llama-stack==${{ needs.trigger-client-and-models-build.outputs.version }}; then
echo "Package installed successfully"
break
fi
if [ $attempt -ge $max_attempts ]; then
echo "Failed to install package after $max_attempts attempts"
exit 1
fi
attempt=$((attempt + 1))
sleep 10
done
- name: Test the package versions
run: |
pip list | grep llama_
- name: Test CLI commands
run: |
llama model list
llama stack build --list-templates
llama model prompt-format -m Llama3.2-11B-Vision-Instruct
llama stack list-apis
llama stack list-providers inference
llama stack list-providers telemetry
- name: Test Notebook
run: |
pip install pytest nbval
llama stack build --template together --image-type venv
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
# TODO: add trigger for integration test workflow & docker builds
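
Not part of this diff: the curl calls above fire a repository_dispatch event at the client and models repositories. On the receiving side, a workflow would subscribe to that event roughly as follows; the job body is illustrative:

    on:
      repository_dispatch:
        types: [build-client-package]

    jobs:
      build:
        runs-on: ubuntu-latest
        steps:
          - name: Show requested version
            run: echo "Building ${{ github.event.client_payload.version }}"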

View file

@ -20,7 +20,7 @@ jobs:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_sha }}
@ -54,7 +54,7 @@ jobs:
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()

View file

@ -1,29 +0,0 @@
name: Update Changelog
on:
release:
types: [published, unpublished, created, edited, deleted, released]
permissions:
contents: read
jobs:
generate_changelog:
name: Generate changelog
permissions:
contents: write # for peter-evans/create-pull-request to create branch
pull-requests: write # for peter-evans/create-pull-request to create a PR
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: main
fetch-depth: 0
- run: |
python ./scripts/gen-changelog.py
- uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
with:
title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
branch: create-pull-request/changelog
signoff: true

View file

@ -1,26 +0,0 @@
name: Installer CI
on:
pull_request:
paths:
- 'install.sh'
push:
paths:
- 'install.sh'
schedule:
- cron: '0 2 * * *' # every day at 02:00 UTC
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run ShellCheck on install.sh
run: shellcheck install.sh
smoke-test:
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run installer end-to-end
run: ./install.sh

View file

@ -1,132 +0,0 @@
name: Integration Auth Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'distributions/**'
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-auth-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
auth-provider: [oauth2_token]
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv
- name: Install minikube
if: ${{ matrix.auth-provider == 'kubernetes' }}
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
- name: Start minikube
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
minikube start
kubectl get pods -A
- name: Configure Kube Auth
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
cat <<EOF | kubectl apply -f -
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: allow-anonymous-openid
rules:
- nonResourceURLs: ["/openid/v1/jwks"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: allow-anonymous-openid
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: allow-anonymous-openid
subjects:
- kind: User
name: system:anonymous
apiGroup: rbac.authorization.k8s.io
EOF
- name: Set Kubernetes Config
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
- name: Set Kube Auth Config and run server
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
run_dir=$(mktemp -d)
cat <<'EOF' > $run_dir/run.yaml
version: '2'
image_name: kube
apis: []
providers: {}
server:
port: 8321
EOF
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
cat $run_dir/run.yaml
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
exit 0
else
echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
cat server.log
exit 1
fi
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1
- name: Test auth
run: |
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
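
For clarity (not part of this changeset), after the yq edits above the generated run.yaml would look roughly like the sketch below, with the placeholders filled in from the KUBERNETES_* values exported earlier in the job:

    version: '2'
    image_name: kube
    apis: []
    providers: {}
    server:
      port: 8321
      auth:
        provider_type: oauth2_token
        config:
          tls_cafile: <KUBERNETES_CA_CERT_PATH>
          issuer: <KUBERNETES_ISSUER>
          audience: <KUBERNETES_AUDIENCE>
          jwks:
            uri: <KUBERNETES_API_SERVER_URL>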

View file

@ -1,116 +0,0 @@
name: Integration Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
client-type: [library, http]
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Setup ollama
uses: ./.github/actions/setup-ollama
- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv
- name: Start Llama Stack server in background
if: matrix.client-type == 'http'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
- name: Wait for Llama Stack server to be ready
if: matrix.client-type == 'http'
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
exit 0
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1
- name: Verify Ollama status is OK
if: matrix.client-type == 'http'
run: |
echo "Verifying Ollama status..."
ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
echo "Ollama status: $ollama_status"
if [ "$ollama_status" != "OK" ]; then
echo "Ollama health check failed"
exit 1
fi
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
run: |
free -h
df -h
- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="ollama"
else
stack_config="http://localhost:8321"
fi
uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
--embedding-model=all-MiniLM-L6-v2
- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
run: |
free -h
df -h
- name: Write ollama logs to file
if: ${{ always() }}
run: |
sudo journalctl -u ollama.service > ollama.log
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
path: |
*.log
retention-days: 1

View file

@ -1,45 +0,0 @@
name: Pre-commit
on:
pull_request:
push:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
- name: Verify if there are any new files after pre-commit
run: |
unstaged_files=$(git ls-files --others --exclude-standard)
if [ -n "$unstaged_files" ]; then
echo "There are uncommitted new files, run pre-commit locally and commit again"
echo "$unstaged_files"
exit 1
fi

View file

@ -1,147 +0,0 @@
name: Test Llama Stack Build
on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
pull_request:
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
templates: ${{ steps.set-matrix.outputs.templates }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate Template List
id: set-matrix
run: |
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "templates=$templates" >> "$GITHUB_OUTPUT"
build:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print build dependencies
run: |
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
- name: Run Llama Stack Build
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
build-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
build-custom-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
- name: Inspect the container image entrypoint
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
build-ubi9-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Pin template to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/templates/starter/build.yaml
- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack build --config llama_stack/templates/starter/build.yaml
- name: Inspect UBI9 image
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
echo "Checking /etc/os-release in $IMAGE_ID"
docker run --rm --entrypoint sh "$IMAGE_ID" -c \
'source /etc/os-release && echo "$ID"' \
| grep -qE '^(rhel|ubi)$' \
|| { echo "Base image is not UBI 9!"; exit 1; }

View file

@ -1,25 +0,0 @@
name: Check semantic PR titles
on:
pull_request_target:
types:
- opened
- edited
- reopened
- synchronize
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
title-check:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -1,45 +0,0 @@
name: Close stale issues and PRs
on:
schedule:
- cron: '0 0 * * *' # every day at midnight
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
stale:
permissions:
issues: write
pull-requests: write
runs-on: ubuntu-latest
steps:
- name: Stale Action
uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-issue-label: 'stale'
stale-issue-message: >
This issue has been automatically marked as stale because it has not had activity within 60 days.
It will be automatically closed if no further activity occurs within 30 days.
close-issue-message: >
This issue has been automatically closed due to inactivity.
Please feel free to reopen if you feel it is still relevant!
days-before-issue-stale: 60
days-before-issue-close: 30
stale-pr-label: 'stale'
stale-pr-message: >
This pull request has been automatically marked as stale because it has not had activity within 60 days.
It will be automatically closed if no further activity occurs within 30 days.
close-pr-message: >
This pull request has been automatically closed due to inactivity.
Please feel free to reopen if you intend to continue working on it!
days-before-pr-stale: 60
days-before-pr-close: 30
operations-per-run: 300

View file

@ -1,71 +0,0 @@
name: Test External Providers
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/test-external-providers.yml' # This workflow
jobs:
test-external-providers:
runs-on: ubuntu-latest
strategy:
matrix:
image-type: [venv]
# We don't do container yet, it's tricky to install a package from the host into the
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Apply image type to config file
run: |
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
- name: Setup directory for Ollama custom provider
run: |
mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
- name: Create provider configuration
run: |
mkdir -p /home/runner/.llama/providers.d/remote/inference
cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
- name: Build distro from config file
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
uv run pip list
nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |
for i in {1..30}; do
if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
echo "Waiting for Llama Stack server to load the provider..."
sleep 1
else
echo "Provider loaded"
exit 0
fi
done
echo "Provider failed to load"
cat server.log
exit 1

View file

@ -1,52 +0,0 @@
name: Unit Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/unit/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/unit-tests.yml' # This workflow
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python:
- "3.10"
- "3.11"
- "3.12"
- "3.13"
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Run unit tests
run: |
PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
- name: Upload test results
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: test-results-${{ matrix.python }}
path: |
.pytest_cache/
pytest-report-${{ matrix.python }}.xml
htmlcov-${{ matrix.python }}/
retention-days: 7

View file

@ -1,68 +0,0 @@
name: Update ReadTheDocs
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
tags:
- '*'
pull_request:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build HTML
run: |
cd docs
uv run make html
- name: Trigger ReadTheDocs build
if: github.event_name != 'pull_request'
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{
\"token\": \"$TOKEN\",
\"version\": \"$GITHUB_REF_NAME\"
}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

.gitignore (vendored, 6 lines changed)

@ -6,7 +6,6 @@ dev_requirements.txt
build
.DS_Store
llama_stack/configs/*
.cursor/
xcuserdata/
*.hmap
.DS_Store
@ -20,8 +19,3 @@ Package.resolved
_build
docs/src
pyrightconfig.json
venv/
pytest-report.xml
.coverage
.python-version
data

.gitmodules (vendored, new file, 3 lines)

@ -0,0 +1,3 @@
[submodule "llama_stack/providers/impls/ios/inference/executorch"]
path = llama_stack/providers/inline/ios/inference/executorch
url = https://github.com/pytorch/executorch

View file

@ -5,28 +5,19 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0 # Latest stable version
rev: 6306a48f7dae5861702d573c9c247e4e9498e867
hooks:
- id: check-merge-conflict
args: ['--assume-in-merge']
- id: trailing-whitespace
exclude: '\.py$' # Exclude Python files as Ruff already handles them
- id: check-ast
- id: check-merge-conflict
- id: check-added-large-files
args: ['--maxkb=1000']
- id: end-of-file-fixer
exclude: '^(.*\.svg)$'
- id: no-commit-to-branch
- id: check-yaml
args: ["--unsafe"]
- id: detect-private-key
- id: requirements-txt-fixer
- id: mixed-line-ending
args: [--fix=lf] # Forces to replace line ending by LF (line feed)
- id: check-executables-have-shebangs
- id: check-json
- id: check-shebang-scripts-are-executable
- id: check-symlinks
- id: check-toml
# Temporarily disabling this
# - id: no-commit-to-branch
# args: ['--branch=main']
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.4
@ -37,46 +28,29 @@ repos:
- --license-filepath
- docs/license_header.txt
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
- repo: https://github.com/pycqa/flake8
rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
hooks:
- id: ruff
args: [ --fix ]
exclude: ^llama_stack/strong_typing/.*$
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.19.0
hooks:
- id: blacken-docs
- id: flake8
additional_dependencies:
- black==24.3.0
- flake8-bugbear == 22.4.25
- pep8-naming == 0.12.1
- torchfix
args: ['--config=.flake8']
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.8
- repo: https://github.com/omnilib/ufmt
rev: v2.7.0
hooks:
- id: uv-lock
- id: uv-export
args: [
"--frozen",
"--no-hashes",
"--no-emit-project",
"--no-default-groups",
"--output-file=requirements.txt"
]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.15.0
hooks:
- id: mypy
- id: ufmt
additional_dependencies:
- uv==0.6.2
- mypy
- pytest
- rich
- types-requests
- pydantic
pass_filenames: false
- black == 24.4.2
- usort == 1.0.8
# - repo: https://github.com/jsh9/pydoclint
# rev: d88180a8632bb1602a4d81344085cf320f288c5a
# hooks:
# - id: pydoclint
# args: [--config=pyproject.toml]
# - repo: https://github.com/tcort/markdown-link-check
# rev: v3.11.2
@ -84,35 +58,16 @@ repos:
# - id: markdown-link-check
# args: ['--quiet']
- repo: local
hooks:
- id: distro-codegen
name: Distribution Template Codegen
additional_dependencies:
- uv==0.7.8
entry: uv run --group codegen ./scripts/distro_codegen.py
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
- id: openapi-codegen
name: API Spec Codegen
additional_dependencies:
- uv==0.7.8
entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/apis/|^docs/openapi_generator/
- id: check-workflows-use-hashes
name: Check GitHub Actions use SHA-pinned actions
entry: ./scripts/check-workflows-use-hashes.sh
language: system
pass_filenames: false
require_serial: true
always_run: true
files: ^\.github/workflows/.*\.ya?ml$
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
# - repo: local
# hooks:
# - id: distro-codegen
# name: Distribution Template Codegen
# additional_dependencies:
# - rich
# - pydantic
# entry: python -m llama_stack.scripts.distro_codegen
# language: python
# pass_filenames: false
# require_serial: true
# files: ^llama_stack/templates/.*$
# stages: [manual]
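
Not part of this diff: the check-workflows-use-hashes hook listed above enforces that workflow steps reference actions by a full commit SHA rather than a mutable tag, i.e. the second form below rather than the first (both forms appear in the workflows in this comparison):

    # flagged: mutable tag reference
    - uses: actions/checkout@v4

    # accepted: pinned to a commit SHA, tag kept as a trailing comment
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2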

View file

@ -5,21 +5,28 @@
# Required
version: 2
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
pre_create_environment:
- asdf plugin add uv
- asdf install uv latest
- asdf global uv latest
create_environment:
- uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
install:
- UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: docs/requirements.txt

View file

@ -1,450 +1,6 @@
# Changelog
# v0.2.7
Published on: 2025-05-16T20:38:10Z
## Highlights
This is a small update. But a couple highlights:
* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
---
# v0.2.6
Published on: 2025-05-12T18:06:52Z
---
# v0.2.5
Published on: 2025-05-04T20:16:49Z
---
# v0.2.4
Published on: 2025-04-29T17:26:01Z
## Highlights
* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
---
# v0.2.3
Published on: 2025-04-25T22:46:21Z
## Highlights
* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the nVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI
---
# v0.2.2
Published on: 2025-04-13T01:19:49Z
## Main changes
- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes
---
# v0.2.1
Published on: 2025-04-05T23:13:00Z
---
# v0.2.0
Published on: 2025-04-05T19:04:29Z
## Llama 4 Support
Checkout more at https://www.llama.com
---
# v0.1.9
Published on: 2025-03-29T00:52:23Z
### Build and Test Agents
* Agents: Entire document context with attachments
* RAG: Documentation with sqlite-vec faiss comparison
* Getting started: Fixes to getting started notebook.
### Agent Evals and Model Customization
* (**New**) Post-training: Add nemo customizer
### Better Engineering
* Moved sqlite-vec to non-blocking calls
* Don't return a payload on file delete
---
# v0.1.8
Published on: 2025-03-24T01:28:50Z
# v0.1.8 Release Notes
### Build and Test Agents
* Safety: Integrated NVIDIA as a safety provider.
* VectorDB: Added Qdrant as an inline provider.
* Agents: Added support for multiple tool groups in agents.
* Agents: Simplified imports for Agents in client package
### Agent Evals and Model Customization
* Introduced DocVQA and IfEval benchmarks.
### Deploying and Monitoring Agents
* Introduced a Containerfile and image workflow for the Playground.
* Implemented support for Bearer (API Key) authentication.
* Added attribute-based access control for resources.
* Fixes on docker deployments: use --pull always and standardized the default port to 8321
* Deprecated: /v1/inspect/providers use /v1/providers/ instead
### Better Engineering
* Consolidated scripts under the ./scripts directory.
* Addressed mypy violations in various modules.
* Added Dependabot scans for Python dependencies.
* Implemented a scheduled workflow to update the changelog automatically.
* Enforced concurrency to reduce CI loads.
### New Contributors
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
---
# v0.1.7
Published on: 2025-03-14T22:30:51Z
## 0.1.7 Release Notes
### Build and Test Agents
* Inference: ImageType is now refactored to LlamaStackImageType
* Inference: Added tests to measure TTFT
* Inference: Bring back usage metrics
* Agents: Added endpoint for get agent, list agents and list sessions
* Agents: Automated conversion of type hints in client tool for lite llm format
* Agents: Deprecated ToolResponseMessage in agent.resume API
* Added Provider API for listing and inspecting provider info
### Agent Evals and Model Customization
* Eval: Added new eval benchmarks Math 500 and BFCL v3
* Deploy and Monitoring of Agents
* Telemetry: Fix tracing to work across coroutines
### Better Engineering
* Display code coverage for unit tests
* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
* Unit tests also run on Python 3.11, 3.12, and 3.13
* Added ollama inference to Integration tests CI
* Improved documentation across examples, testing, and the CLI, and updated the providers table
---
# v0.1.6
Published on: 2025-03-08T04:35:08Z
## 0.1.6 Release Notes
### Build and Test Agents
* Inference: Fixed support for inline vllm provider
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
* Agent: Unify tools and Python SDK Agents API
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
* Agent: Support python functions without @client_tool decorator as client tools
* Agent: Deprecated the allow_resume_turn flag and removed the need to specify tool_prompt_format
* VectorIO: MilvusDB support added
### Agent Evals and Model Customization
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
* Eval: Documentation for eval, scoring, adding new benchmarks
* Eval: Distribution template to run benchmarks on llama & non-llama models
* Eval: Ability to register new custom LLM-as-judge scoring functions
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
### Deploy and Monitoring of Agents
* Better support for different log levels across all components for better monitoring
### Better Engineering
* Enhance OpenAPI spec to include Error types across all APIs
* Moved all tests to /tests and created unit tests to run on each PR
* Removed all dependencies on llama-models repo
---
# v0.1.5.1
Published on: 2025-02-28T22:37:44Z
## 0.1.5.1 Release Notes
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
---
# v0.1.5
Published on: 2025-02-28T18:14:01Z
## 0.1.5 Release Notes
### Build Agents
* Inference: Support more non-llama models (openai, anthropic, gemini)
* Inference: Can use the provider's model name in addition to the HF alias (see the sketch after this list)
* Inference: Fixed issues with calling tools that weren't specified in the prompt
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
* Embeddings: Added support for Nemo retriever embedding models
* Tools: Added support for MCP tools in Ollama Distribution
* Distributions: Added new Groq distribution
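As a rough illustration of the model-naming change above, here is a minimal sketch, assuming a running distribution at `http://localhost:8321` with both names registered (the Fireworks identifier is only an example, not a required value):
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Either the HF-style alias or the provider's own model name can be passed,
# as long as the model is registered with the running distribution.
for model_id in (
    "meta-llama/Llama-3.1-8B-Instruct",                   # HF alias
    "accounts/fireworks/models/llama-v3p1-8b-instruct",   # provider model name (illustrative)
):
    response = client.inference.chat_completion(
        model_id=model_id,
        messages=[{"role": "user", "content": "Say hello in one word."}],
    )
    print(response.completion_message.content)
```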
### Customize Models
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
### Monitor agents
* More comprehensive logging of agent steps including client tools
* Telemetry inputs/outputs are now structured and queryable
* Ability to retrieve agents session, turn, step by ids
### Better Engineering
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
* Move most logging to use logger instead of prints
* Completed text /chat-completion and /completion tests
---
# v0.1.4
Published on: 2025-02-25T00:02:43Z
## v0.1.4 Release Notes
Here are the key changes coming as part of this release:
### Build and Test Agents
* Inference: Added support for non-llama models
* Inference: Added option to list all downloaded models and remove models
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
* Agent: AgentConfig introduces a new variable "tool_config" that allows for better tool configuration and system prompt overrides
* Agent: Added logging for agent step start and completion times
* Agent: Added support for logging for tool execution metadata
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
* VectorIO: Improved performance of sqlite-vec using chunked writes
### Agent Evals and Model Customization
* Deprecated api /eval-tasks. Use /eval/benchmark instead
* Added CPU training support for TorchTune
### Deploy and Monitoring of Agents
* Consistent view of client and server tool calls in telemetry
### Better Engineering
* Made tests more data-driven for consistent evaluation
* Fixed documentation links and improved API reference generation
* Various small fixes for build scripts and system reliability
---
# v0.1.3
Published on: 2025-02-14T20:24:32Z
## v0.1.3 Release
Here are some key changes that are coming as part of this release.
### Build and Test Agents
Streamlined the initial development experience
- Added support for llama stack run --image-type venv
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
- vLLM improvements for tool calling and logprobs
- Better handling of sporadic code_interpreter tool calls
### Agent Evals
Better benchmarking and Agent performance assessment
- Renamed eval API /eval-task to /benchmarks
- Improved documentation and notebooks for RAG and evals
### Deploy and Monitoring of Agents
Improved production readiness
- Added usage metrics collection for chat completions
- CLI improvements for provider information
- Improved error handling and system reliability
- Better model endpoint handling and accessibility
- Improved signal handling on distro server
### Better Engineering
Infrastructure and code quality improvements
- Faster text-based chat completion tests
- Improved testing for non-streaming agent apis
- Standardized import formatting with ruff linter
- Added conventional commits standard
- Fixed documentation parsing issues
---
# v0.1.2
Published on: 2025-02-07T22:06:49Z
# TL;DR
- Several stabilizations to development flows after the switch to `uv`
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
- Added automated rebuilds for ReadTheDocs
- Llama Stack server supports HTTPS
- Added system prompt overrides support
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
---
# v0.1.1
Published on: 2025-02-02T02:29:24Z
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
---
# v0.1.0
Published on: 2025-01-24T17:47:47Z
We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate them with scoring functions.
## Context
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs from both AI developers and partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
With Llama Stack, you can easily build a RAG agent that can also search the web, do complex math, and call custom tools. You can use telemetry to inspect those traces and convert telemetry into evals datasets. And with Llama Stack's plugin architecture and prepackaged distributions, you can choose to run your agent anywhere: in the cloud with our partners, in your own environment using virtualenv, conda, or Docker, locally with Ollama, or even on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
## Release
After iterating on the APIs for the last 3 months, today we're launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests for providers, which make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
There are example standalone apps in llama-stack-apps.
## Key Features of this release
- **Unified API Layer**
- Inference: Run LLM models
- RAG: Store and retrieve knowledge for RAG
- Agents: Build multi-step agentic workflows
- Tools: Register tools that can be called by the agent
- Safety: Apply content filtering and safety policies
- Evaluation: Test model and agent quality
- Telemetry: Collect and analyze usage data and complex agentic traces
- Post Training ( Coming Soon ): Fine tune models for specific use cases
- **Rich Provider Ecosystem**
- Local Development: Meta's Reference, Ollama
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
- On-device: iOS and Android support
- **Built for Production**
- Pre-packaged distributions for common deployment scenarios
- Backwards compatibility across model versions
- Comprehensive evaluation capabilities
- Full observability and monitoring
- **Multiple developer interfaces**
- CLI: Command line interface
- Python SDK
- Swift iOS SDK
- Kotlin Android SDK
- **Sample llama stack applications**
- Python
- iOS
- Android
---
# v0.1.0rc12
Published on: 2025-01-22T22:24:01Z
---
# v0.0.63
Published on: 2024-12-18T07:17:43Z
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
---
# v0.0.62
Published on: 2024-12-18T02:39:43Z
---
# v0.0.61
Published on: 2024-12-10T20:50:33Z
---
# v0.0.55
Published on: 2024-11-23T17:14:07Z
---
# v0.0.54
Published on: 2024-11-22T00:36:09Z
---
# v0.0.53
Published on: 2024-11-20T22:18:00Z
🚀 Initial Release Notes for Llama Stack!
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
@ -477,6 +33,3 @@ Published on: 2024-11-20T22:18:00Z
### Removed
- `llama stack configure` command
---

View file

@ -40,7 +40,6 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
@ -57,88 +56,24 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
cd llama-stack
uv sync --extra dev
uv pip install -e .
source .venv/bin/activate
```
> [!NOTE]
> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory.
> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
Note that you can create a dotenv file `.env` that includes necessary environment variables:
```
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv run pre-commit install
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Running tests
You can find the Llama Stack testing documentation [here](tests/README.md).
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
uv add foo
uv sync
```
## Coding Style
* Comments should provide meaningful insights into the code. Avoid filler comments that simply
describe the next step, as they create unnecessary clutter; the same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
rather than explain what the next line of code does.
* When catching exceptions, prefer using a specific exception type rather than a broad catch-all like
`Exception`.
* Error messages should be prefixed with "Failed to ..." (see the sketch after this list).
* 4 spaces for indentation rather than tab
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
readability reasons.
* 2 spaces for indentation rather than tabs
* 80 character line length
* ...
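Purely as an illustration of the conventions above, here is a minimal, hypothetical sketch (the function and config path are made up and not part of the codebase):
```python
import json
from pathlib import Path


def load_run_config(path: Path) -> dict:
    """Load a distribution run config and fail with a clear, specific error."""
    try:
        raw = path.read_text()
    except FileNotFoundError as e:
        # Catch the specific exception type and prefix the message with "Failed to ..."
        raise RuntimeError(f"Failed to read run config at {path}") from e

    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Failed to parse run config at {path}") from e
```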
## Common Tasks
@ -146,43 +81,36 @@ Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Building a stack image (conda / docker) will use the production version of the `llama-stack`, `llama-models` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_MODELS_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash
cd work/
git clone https://github.com/meta-llama/llama-stack.git
git clone https://github.com/meta-llama/llama-stack-client-python.git
cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
$ cd work/
$ git clone https://github.com/meta-llama/llama-stack.git
$ git clone https://github.com/meta-llama/llama-models.git
$ cd llama-stack
$ LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template <...>
```
### Updating Provider Configurations
If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
### Building the Documentation
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
cd llama-stack/docs
pip install -r requirements.txt
pip install sphinx-autobuild
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
make html
sphinx-autobuild source build/html
```
### Update API Documentation
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
```
The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
## License
By contributing to Llama, you agree that your contributions will be licensed

View file

@ -1,9 +1,5 @@
include pyproject.toml
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/models/llama/llama4/tokenizer.model
include requirements.txt
include distributions/dependencies.json
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/templates/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg

README.md
View file

@ -2,91 +2,17 @@
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
<details>
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
\
*Note: you need an 8xH100 GPU host to run these models*
```bash
pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
llama-stack-client --endpoint http://localhost:8321 \
inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
ChatCompletionResponse(
completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
logprobs=None,
metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:8321")
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
response = client.inference.chat_completion(
model_id=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
print(f"Assistant> {response.completion_message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
</details>
### 🚀 One-Line Installer 🚀
To try Llama Stack locally, run:
```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
```
### Overview
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codified best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack
<div style="text-align: center;">
<img
@ -98,39 +24,31 @@ Llama Stack standardizes the core building blocks that simplify AI application d
</div>
### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choice.
- **Consistent Experience**: With its unified APIs Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | |
| SambaNova | Hosted | | ✅ | | ✅ | | |
| Cerebras | Hosted | | ✅ | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | |
| Groq | Hosted | | ✅ | | | | |
| Ollama | Single Node | | ✅ | | | | |
| TGI | Hosted and Single Node | | ✅ | | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | |
| Chroma | Single Node | | | ✅ | | | |
| PG Vector | Single Node | | | ✅ | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | |
| vLLM | Hosted and Single Node | | ✅ | | | | |
| OpenAI | Hosted | | ✅ | | | | |
| Anthropic | Hosted | | ✅ | | | | |
| Gemini | Hosted | | ✅ | | | | |
| watsonx | Hosted | | ✅ | | | | |
| HuggingFace | Single Node | | | | | | ✅ |
| TorchTune | Single Node | | | | | | ✅ |
| NVIDIA NEMO | Hosted | | | | | | ✅ |
Here is a list of the various API providers and available distributions to help developers get started easily,
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| SambaNova | Hosted | | :heavy_check_mark: | | | |
| Cerebras | Hosted | | :heavy_check_mark: | | | |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
| Groq | Hosted | | :heavy_check_mark: | | | |
| Ollama | Single Node | | :heavy_check_mark: | | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
| Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
### Distributions
@ -139,6 +57,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
@ -147,16 +66,39 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) |
| vLLM | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) |
### Installation
You have two ways to install this repository:
1. **Install as a package**:
You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
```bash
pip install llama-stack
```
2. **Install from source**:
If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable).
Then, follow these steps:
```bash
mkdir -p ~/local
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git
conda create -n stack python=3.10
conda activate stack
cd llama-stack
pip install -e .
```
### Documentation
Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
* CLI references
* [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
* [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html)
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server.
* [Jupyter notebook](./docs/getting_started.ipynb) to walk through how to use simple text and vision inference with the llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guides you through all the key components of Llama Stack with code samples.
@ -169,9 +111,9 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Typescript | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
Check out our client SDKs for connecting to a Llama Stack server in your preferred language; you can choose from the [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) SDKs to quickly build your applications.
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

View file

@ -0,0 +1,15 @@
services:
llamastack:
image: distribution-bedrock
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-bedrock.yaml
ports:
- "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/cerebras/build.yaml

View file

@ -0,0 +1,16 @@
services:
llamastack:
image: llamastack/distribution-cerebras
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-cerebras.yaml
ports:
- "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/cerebras/run.yaml

View file

@ -0,0 +1,50 @@
services:
text-generation-inference:
image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
ports:
- "5009:5009"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0,1,2,3,4
- NUM_SHARD=4
- MAX_BATCH_PREFILL_TOKENS=32768
- MAX_INPUT_TOKENS=8000
- MAX_TOTAL_TOKENS=8192
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: all
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
text-generation-inference:
condition: service_healthy
image: llamastack/distribution-tgi
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to TGI run.yaml file
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,44 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:80
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1,487 @@
{
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-serverless": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentence-transformers",
"sentencepiece",
"torch",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-quantized-gpu": [
"accelerate",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"fairscale",
"faiss-cpu",
"fastapi",
"fbgemm-gpu",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentence-transformers",
"sentencepiece",
"torch",
"torchao==0.5.0",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"ollama",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/build.yaml

View file

@ -0,0 +1,14 @@
services:
llamastack:
image: llamastack/distribution-fireworks
ports:
- "8321:8321"
environment:
- FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

View file

@ -0,0 +1,34 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

View file

@ -0,0 +1,35 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-quantized-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,58 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: meta0
provider_type: inline::meta-reference-quantized
config:
model: Llama3.2-3B-Instruct:int4-qlora-eo8
quantization:
type: int4
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
- provider_id: meta1
provider_type: inline::meta-reference-quantized
config:
# not a quantized model !
model: Llama-Guard-3-1B
quantization: null
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -0,0 +1,71 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.ollama:/root/.ollama
ports:
- "11434:11434"
environment:
OLLAMA_DEBUG: 1
command: []
deploy:
resources:
limits:
memory: 8G # Set maximum memory
reservations:
memory: 8G # Set minimum memory reservation
# healthcheck:
# # ugh, no CURL in ollama image
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
# interval: 10s
# timeout: 5s
# retries: 5
ollama-init:
image: ollama/ollama:latest
depends_on:
- ollama
# condition: service_healthy
network_mode: ${NETWORK_MODE:-bridge}
environment:
- OLLAMA_HOST=ollama
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
volumes:
- ~/.ollama:/root/.ollama
- ./pull-models.sh:/pull-models.sh
entrypoint: ["/pull-models.sh"]
llamastack:
depends_on:
ollama:
condition: service_started
ollama-init:
condition: service_started
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ~/local/llama-stack/:/app/llama-stack-source
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
- OLLAMA_URL=http://ollama:11434
entrypoint: >
python -m llama_stack.distribution.server.server /root/my-run.yaml \
--port ${LLAMA_STACK_PORT:-5001}
deploy:
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 60s
volumes:
ollama:
ollama-init:
llamastack:

View file

@ -0,0 +1,18 @@
#!/bin/sh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
echo "Preloading $model..."
if ! ollama run "$model"; then
echo "Failed to pull and run $model"
exit 1
fi
done
echo "All models pulled successfully"

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/nvidia/build.yaml

View file

@ -0,0 +1,19 @@
services:
llamastack:
image: distribution-nvidia:dev
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-nvidia.yaml
ports:
- "8321:8321"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
- NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/nvidia/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/build.yaml

View file

@ -0,0 +1,100 @@
services:
vllm-inference:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_INFERENCE_PORT:-5100}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
# A little trick:
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
vllm-${VLLM_SAFETY_MODEL:+safety}:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_SAFETY_MODEL}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_SAFETY_PORT:-5101}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- vllm-inference:
condition: service_healthy
- vllm-${VLLM_SAFETY_MODEL:+safety}:
condition: service_healthy
# image: llamastack/distribution-remote-vllm
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
volumes:
- ~/.llama:/root/.llama
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
network_mode: ${NETWORK_MODE:-bridged}
environment:
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- MAX_TOKENS=${MAX_TOKENS:-4096}
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
ports:
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
# Hack: wait for vLLM server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
vllm-inference:
vllm-safety:
llamastack:

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run.yaml

View file

@ -0,0 +1,9 @@
name: runpod
distribution_spec:
description: Use Runpod for running LLM inference
providers:
inference: remote::runpod
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1 @@
../../llama_stack/templates/sambanova/build.yaml

View file

@ -0,0 +1,16 @@
services:
llamastack:
image: llamastack/distribution-sambanova
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-sambanova.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/sambanova/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/build.yaml

View file

@ -0,0 +1,103 @@
services:
tgi-inference:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_INFERENCE_PORT:-8080}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
tgi-${TGI_SAFETY_MODEL:+safety}:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridge}
ports:
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
--port ${TGI_SAFETY_PORT:-8081}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
tgi-inference:
condition: service_healthy
tgi-${TGI_SAFETY_MODEL:+safety}:
condition: service_healthy
image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.llama:/root/.llama
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
    # Hack: wait for the TGI servers to start before launching the Llama Stack server
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
environment:
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
volumes:
tgi-inference:
tgi-safety:
llamastack:
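
Because the safety service and the `-with-safety` run file are only pulled in when `TGI_SAFETY_MODEL` is set, a typical launch looks roughly like the sketch below (model names and GPU indices are the defaults from the file above; `HF_TOKEN` must be set in the host environment):

```bash
# Inference-only stack
export HF_TOKEN=<your Hugging Face token>
docker compose up

# Inference plus a Llama Guard safety shield on a second GPU
TGI_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B TGI_SAFETY_GPU=1 docker compose up
```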


@ -0,0 +1 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

distributions/tgi/run.yaml Symbolic link

@ -0,0 +1 @@
../../llama_stack/templates/tgi/run.yaml


@ -0,0 +1,65 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint, `https://llama-stack.together.ai`, which serves a Llama Stack distribution.
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have a hosted endpoint at Together with an API key.
```
$ cd distributions/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure the inference provider in your `run.yaml` file points to the correct Together server endpoint, e.g.
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
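If you would rather not hard-code the key, you can sanity-check it against the Together endpoint before starting the stack; a quick sketch, assuming the URL above exposes the usual OpenAI-compatible `/v1/models` route:

```bash
export TOGETHER_API_KEY=<your key>
curl -s https://api.together.xyz/v1/models \
  -H "Authorization: Bearer ${TOGETHER_API_KEY}" | head -c 500
```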
### Conda: `llama stack run` (Single Node CPU)
```bash
llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by Together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
```
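Once a model appears in the list, you can exercise it through the same client; a minimal sketch, assuming the stack is reachable on `http://localhost:5000` (use whichever port you mapped). The inference subcommand spelling has varied between client versions, so check `llama-stack-client --help` if it differs:

```bash
llama-stack-client configure --endpoint http://localhost:5000
llama-stack-client inference chat-completion --message "hello, what model are you?"
```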


@ -0,0 +1 @@
../../llama_stack/templates/together/build.yaml


@ -0,0 +1,14 @@
services:
llamastack:
image: llamastack/distribution-together
ports:
- "8321:8321"
environment:
- TOGETHER_API_KEY=${TOGETHER_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
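
With this compose file, the only required input is the Together API key exported in the host environment; a minimal launch sketch:

```bash
cd distributions/together
export TOGETHER_API_KEY=<your key>
docker compose up -d
# Follow the logs until the server reports it is listening on port 8321
docker compose logs -f llamastack
```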


@ -0,0 +1 @@
../../llama_stack/templates/together/run.yaml


@ -0,0 +1 @@
../../llama_stack/templates/inline-vllm/build.yaml


@ -0,0 +1,35 @@
services:
llamastack:
image: llamastack/distribution-inline-vllm
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"


@ -0,0 +1,66 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: vllm-inference
provider_type: inline::vllm
config:
model: Llama3.2-3B-Instruct
tensor_parallel_size: 1
gpu_memory_utilization: 0.4
enforce_eager: true
max_tokens: 4096
- provider_id: vllm-inference-safety
provider_type: inline::vllm
config:
model: Llama-Guard-3-1B
tensor_parallel_size: 1
gpu_memory_utilization: 0.2
enforce_eager: true
max_tokens: 4096
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
# Uncomment to use prompt guard
# - provider_id: meta1
# provider_type: inline::prompt-guard
# config:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
# Uncomment to use pgvector
# - provider_id: pgvector
# provider_type: remote::pgvector
# config:
# host: 127.0.0.1
# port: 5432
# db: postgres
# user: postgres
# password: mysecretpassword
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
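
To run this configuration outside the container, the same file can be handed to the server module directly, mirroring the compose entrypoint above; a sketch, assuming an environment with llama-stack and vLLM installed and two visible GPUs as the config expects:

```bash
python -m llama_stack.distribution.server.server \
  --yaml_config ./run.yaml \
  --port 8321
```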


@ -12,24 +12,3 @@
.wy-side-nav-search {
background-color: transparent !important;
}
.hide-title h1 {
display: none;
}
h2, h3, h4 {
font-weight: normal;
}
html[data-theme="dark"] .rst-content div[class^="highlight"] {
background-color: #0b0b0b;
}
pre {
white-space: pre-wrap !important;
word-break: break-all;
}
[data-theme="dark"] .mermaid {
background-color: #f4f4f6 !important;
border-radius: 6px;
padding: 0.5em;
}


@ -1,32 +0,0 @@
document.addEventListener("DOMContentLoaded", function () {
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
const htmlElement = document.documentElement;
// Check if theme is saved in localStorage
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
if (savedTheme) {
// Use the saved theme preference
htmlElement.setAttribute("data-theme", savedTheme);
document.body.classList.toggle("dark", savedTheme === "dark");
} else {
// Fall back to system preference
const theme = prefersDark ? "dark" : "light";
htmlElement.setAttribute("data-theme", theme);
document.body.classList.toggle("dark", theme === "dark");
// Save initial preference
localStorage.setItem("sphinx-rtd-theme", theme);
}
// Listen for theme changes from the existing toggle
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.attributeName === "data-theme") {
const currentTheme = htmlElement.getAttribute("data-theme");
localStorage.setItem("sphinx-rtd-theme", currentTheme);
}
});
});
observer.observe(htmlElement, { attributes: true });
});

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary files not shown (three images removed: 33 KiB, 37 KiB, and 56 KiB).


@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import time
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')
def pytest_runtest_teardown(item):
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
if interval_seconds:
time.sleep(float(interval_seconds))
def pytest_configure(config):
config.option.tbstyle = "short"
config.option.disable_warnings = True

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1,35 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff.