2025-08-21 17:33:12 +00:00
1583 changed files with 43995 additions and 586722 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -1,12 +0,0 @@
-[run]
-omit =
-    */tests/*
-    */llama_stack/providers/*
-    */llama_stack/templates/*
-    .venv/*
-    */llama_stack/cli/scripts/*
-    */llama_stack/ui/*
-    */llama_stack/distribution/ui/*
-    */llama_stack/strong_typing/*
-    */llama_stack/env.py
-    */__init__.py
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,31 @@
+[flake8]
+# Suggested config from pytorch that we can adapt
+select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
+max-line-length = 120
+# C408 ignored because we like the dict keyword argument syntax
+# E501 is not flexible enough, we're using B950 instead
+# N812 ignored because import torch.nn.functional as F is PyTorch convention
+# N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
+# E731 allow usage of assigning lambda expressions
+# E701 let black auto-format statements on one line
+# E704 let black auto-format statements on one line
+ignore =
+    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
+    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
+    # to line this up with executable bit
+    EXE001,
+    # random naming hints don't need
+    N802,
+    # these ignores are from flake8-bugbear; please fix!
+    B007,B008,B950
+optional-ascii-coding = True
+exclude =
+    ./.git,
+    ./docs/*,
+    ./build,
+    ./scripts,
+    ./venv,
+    *.pyi,
+    .pre-commit-config.yaml,
+    *.md,
+    .flake8
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
+* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham
--- a/.github/ISSUE_TEMPLATE/bug.yml
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@ -1,6 +1,6 @@
 name: 🐛 Bug Report
 description: Create a report to help us reproduce and fix the bug
-labels: ["bug"]
+
 body:
  - type: markdown
    attributes:
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,12 +0,0 @@
-blank_issues_enabled: false
-
-contact_links:
-  - name: Have you read the docs?
-    url: https://llama-stack.readthedocs.io/en/latest/index.html
-    about: Much help can be found in the docs
-  - name: Start a discussion
-    url: https://github.com/meta-llama/llama-stack/discussions/new
-    about: Start a discussion on a topic
-  - name: Chat on Discord
-    url: https://discord.gg/llama-stack
-    about: Maybe chatting with the community can help
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@ -1,28 +1,31 @@
 name: 🚀 Feature request
-description: Request a new llama-stack feature
-labels: ["enhancement"]
+description: Submit a proposal/request for a new llama-stack feature
+
 body:
 - type: textarea
  id: feature-pitch
  attributes:
-    label: 🚀 Describe the new functionality needed
+    label: 🚀 The feature, motivation and pitch
    description: >
-      A clear and concise description of _what_ needs to be built.
+      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
  validations:
    required: true

 - type: textarea
-  id: feature-motivation
+  id: alternatives
  attributes:
-    label: 💡 Why is this needed? What if we don't build it?
+    label: Alternatives
    description: >
-      A clear and concise description of _why_ this functionality is needed.
-  validations:
-    required: true
+      A description of any alternative solutions or features you've considered, if any.

 - type: textarea
-  id: other-thoughts
+  id: additional-context
  attributes:
-    label: Other thoughts
+    label: Additional context
    description: >
-      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
+      Add any other context or screenshots about the feature request.
+
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/tech-debt.yml
+++ b/.github/ISSUE_TEMPLATE/tech-debt.yml
@ -1,30 +0,0 @@
-name: 🔧 Tech Debt
-description: Something that is functional but should be improved or optimizied
-labels: ["tech-debt"]
-body:
- type: textarea
-  id: tech-debt-explanation
-  attributes:
-    label: 🤔 What is the technical debt you think should be addressed?
-    description: >
-      A clear and concise description of _what_ needs to be addressed - ensure you are describing
-      constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is not a bug
-      or feature request.
-  validations:
-    required: true
-
- type: textarea
-  id: tech-debt-motivation
-  attributes:
-    label: 💡 What is the benefit of addressing this technical debt?
-    description: >
-      A clear and concise description of _why_ this work is needed.
-  validations:
-    required: true
-
- type: textarea
-  id: other-thoughts
-  attributes:
-    label: Other thoughts
-    description: >
-      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,8 +1,27 @@
 # What does this PR do?
-<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->

-<!-- If resolving an issue, uncomment and update the line below -->
-<!-- Closes #[issue-number] -->
+In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
+
+- [ ] Addresses issue (#issue)
+

 ## Test Plan
-<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
+
+Please describe:
+ - tests you ran to verify your changes with result summaries.
+ - provide instructions so it can be reproduced.
+
+
+## Sources
+
+Please link relevant resources if necessary.
+
+
+## Before submitting
+
+- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [ ] Ran pre-commit to handle lint / formatting issues.
+- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
+      Pull Request section?
+- [ ] Updated relevant documentation.
+- [ ] Wrote necessary unit or integration tests.
--- a/.github/TRIAGERS.md
+++ b/.github/TRIAGERS.md
@ -1,2 +0,0 @@
-# This file documents Triage members in the Llama Stack community
- @franciscojavierarceo
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@ -1,88 +0,0 @@
-name: 'Run and Record Tests'
-description: 'Run integration tests and handle recording/artifact upload'
-
-inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
-  stack-config:
-    description: 'Stack configuration to use'
-    required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Check Storage and Memory Available Before Tests
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        free -h
-        df -h
-
-    - name: Run Integration Tests
-      shell: bash
-      run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
-
-
-    - name: Commit and push recordings
-      if: ${{ inputs.inference-mode == 'record' }}
-      shell: bash
-      run: |
-        echo "Checking for recording changes"
-        git status --porcelain tests/integration/recordings/
-
-        if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
-          echo "New recordings detected, committing and pushing"
-          git add tests/integration/recordings/
-
-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
-
-          git fetch origin ${{ github.ref_name }}
-          git rebase origin/${{ github.ref_name }}
-          echo "Rebased successfully"
-          git push origin HEAD:${{ github.ref_name }}
-          echo "Pushed successfully"
-        else
-          echo "No recording changes"
-        fi
-
-    - name: Write inference logs to file
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
-
-    - name: Upload logs
-      if: ${{ always() }}
-      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-      with:
-        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
-        path: |
-          *.log
-        retention-days: 1
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -1,23 +0,0 @@
-name: Setup Ollama
-description: Start Ollama
-inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
-    required: false
-    default: 'false'
-runs:
-  using: "composite"
-  steps:
-    - name: Start Ollama
-      shell: bash
-      run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-          image="ollama-with-vision-model"
-        else
-          image="ollama-with-models"
-        fi
-
-        echo "Starting Ollama with image: $image"
-        docker run -d --name ollama -p 11434:11434 docker.io/llamastack/$image
-        echo "Verifying Ollama status..."
-        timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -1,43 +0,0 @@
-name: Setup runner
-description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
-inputs:
-  python-version:
-    description: The Python version to use
-    required: false
-    default: "3.12"
-  client-version:
-    description: The llama-stack-client-python version to test against (latest or published)
-    required: false
-    default: "latest"
-runs:
-  using: "composite"
-  steps:
-    - name: Install uv
-      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
-      with:
-        python-version: ${{ inputs.python-version }}
-        version: 0.7.6
-
-    - name: Install dependencies
-      shell: bash
-      run: |
-        echo "Updating project dependencies via uv sync"
-        uv sync --all-groups
-
-        echo "Installing ad-hoc dependencies"
-        uv pip install faiss-cpu
-
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          uv pip install llama-stack-client
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Installed llama packages"
-        uv pip list | grep llama
--- a/.github/actions/setup-test-environment/action.yml
+++ b/.github/actions/setup-test-environment/action.yml
@ -1,66 +0,0 @@
-name: 'Setup Test Environment'
-description: 'Common setup steps for integration tests including dependencies, providers, and build'
-
-inputs:
-  python-version:
-    description: 'Python version to use'
-    required: true
-  client-version:
-    description: 'Client version (latest or published)'
-    required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
-    required: false
-    default: 'false'
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install dependencies
-      uses: ./.github/actions/setup-runner
-      with:
-        python-version: ${{ inputs.python-version }}
-        client-version: ${{ inputs.client-version }}
-
-    - name: Setup ollama
-      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-ollama
-      with:
-        run-vision-tests: ${{ inputs.run-vision-tests }}
-
-    - name: Setup vllm
-      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-vllm
-
-    - name: Build Llama Stack
-      shell: bash
-      run: |
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          unset LLAMA_STACK_CLIENT_DIR
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Building Llama Stack"
-
-        LLAMA_STACK_DIR=. \
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
-
-    - name: Configure git for commits
-      shell: bash
-      run: |
-        git config --local user.email "github-actions[bot]@users.noreply.github.com"
-        git config --local user.name "github-actions[bot]"
--- a/.github/actions/setup-vllm/action.yml
+++ b/.github/actions/setup-vllm/action.yml
@ -1,27 +0,0 @@
-name: Setup VLLM
-description: Start VLLM
-runs:
-  using: "composite"
-  steps:
-    - name: Start VLLM
-      shell: bash
-      run: |
-        # Start vllm container
-        docker run -d \
-          --name vllm \
-          -p 8000:8000 \
-          --privileged=true \
-          quay.io/higginsd/vllm-cpu:65393ee064 \
-          --host 0.0.0.0 \
-          --port 8000 \
-          --enable-auto-tool-choice \
-          --tool-call-parser llama3_json \
-          --model /root/.cache/Llama-3.2-1B-Instruct \
-          --served-model-name meta-llama/Llama-3.2-1B-Instruct
-
-          # Wait for vllm to be ready
-          echo "Waiting for vllm to be ready..."
-          timeout 900 bash -c 'until curl -f http://localhost:8000/health; do
-            echo "Waiting for vllm..."
-            sleep 5
-          done'
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,33 +0,0 @@
-# GitHub Dependabot configuration
-version: 2
-updates:
-  # Enable version updates for GitHub Actions
-  - package-ecosystem: "github-actions"
-    directory: "/" # Will use the default workflow location of `.github/workflows`
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    commit-message:
-      prefix: chore(github-deps)
-
-  - package-ecosystem: "uv"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - python
-    commit-message:
-      prefix: chore(python-deps)
-
-  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - javascript
-    commit-message:
-      prefix: chore(ui-deps)
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -1,23 +0,0 @@
-# Llama Stack CI
-
-Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table detailing what CI the project includes and the purpose.
-
-| Name | File | Purpose |
-| ---- | ---- | ------- |
-| Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
-| Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
-| Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
-| SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
-| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
-| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
-| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
-| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
-| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
-| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
-| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
-| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
-| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
-| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
-| Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
-| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@ -1,31 +0,0 @@
-name: Update Changelog
-
-run-name: Creates PR for updating the CHANGELOG.md
-
-on:
-  release:
-    types: [published, unpublished, created, edited, deleted, released]
-
-permissions:
-  contents: read
-
-jobs:
-  generate_changelog:
-    name: Generate changelog
-    permissions:
-      contents: write  # for peter-evans/create-pull-request to create branch
-      pull-requests: write  # for peter-evans/create-pull-request to create a PR
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          ref: main
-          fetch-depth: 0
-      - run: |
-          python ./scripts/gen-changelog.py
-      - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
-        with:
-          title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
-          commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
-          branch: create-pull-request/changelog
-          signoff: true
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -1,39 +0,0 @@
-name: Installer CI
-
-run-name: Test the installation script
-
-on:
-  pull_request:
-    paths:
-      - 'scripts/install.sh'
-  push:
-    paths:
-      - 'scripts/install.sh'
-  schedule:
-    - cron: '0 2 * * *'  # every day at 02:00 UTC
-
-jobs:
-  lint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
-      - name: Run ShellCheck on install.sh
-        run: shellcheck scripts/install.sh
-  smoke-test-on-dev:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Build a single provider
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-            llama stack build --template starter --image-type container --image-name test
-
-      - name: Run installer end-to-end
-        run: |
-          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
-          ./scripts/install.sh --image $IMAGE_ID
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -1,112 +0,0 @@
-name: Integration Auth Tests
-
-run-name: Run the integration test suite with Kubernetes authentication
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'distributions/**'
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - '.github/workflows/integration-auth-tests.yml' # This workflow
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  test-matrix:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        auth-provider: [oauth2_token]
-      fail-fast: false # we want to run all tests regardless of failure
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Install minikube
-        if: ${{ matrix.auth-provider == 'kubernetes' }}
-        uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20
-
-      - name: Start minikube
-        if: ${{ matrix.auth-provider == 'oauth2_token' }}
-        run: |
-          minikube start
-          kubectl get pods -A
-
-      - name: Configure Kube Auth
-        if: ${{ matrix.auth-provider == 'oauth2_token' }}
-        run: |
-          kubectl create namespace llama-stack
-          kubectl create serviceaccount llama-stack-auth -n llama-stack
-          kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
-
-      - name: Set Kubernetes Config
-        if: ${{ matrix.auth-provider == 'oauth2_token' }}
-        run: |
-          echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
-          echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
-          echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
-          echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
-          echo "TOKEN=$(cat llama-stack-auth-token)" >> $GITHUB_ENV
-
-      - name: Set Kube Auth Config and run server
-        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-        if: ${{ matrix.auth-provider == 'oauth2_token' }}
-        run: |
-          run_dir=$(mktemp -d)
-          cat <<'EOF' > $run_dir/run.yaml
-          version: '2'
-          image_name: kube
-          apis: []
-          providers: {}
-          server:
-            port: 8321
-          EOF
-          yq eval '.server.auth.provider_config.type = "${{ matrix.auth-provider }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.tls_cafile = "${{ env.KUBERNETES_CA_CERT_PATH }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.issuer = "${{ env.KUBERNETES_ISSUER }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.audience = "${{ env.KUBERNETES_AUDIENCE }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.jwks.uri = "${{ env.KUBERNETES_API_SERVER_URL }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
-          cat $run_dir/run.yaml
-
-          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
-
-      - name: Wait for Llama Stack server to be ready
-        run: |
-          echo "Waiting for Llama Stack server..."
-          for i in {1..30}; do
-            if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
-              echo "Llama Stack server is up!"
-              if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
-                echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
-                exit 0
-              else
-                echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
-                cat server.log
-                exit 1
-              fi
-            fi
-            sleep 1
-          done
-          echo "Llama Stack server failed to start"
-          cat server.log
-          exit 1
-
-      - name: Test auth
-        run: |
-          curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
--- a/.github/workflows/integration-sql-store-tests.yml
+++ b/.github/workflows/integration-sql-store-tests.yml
@ -1,72 +0,0 @@
-name: SqlStore Integration Tests
-
-run-name: Run the integration test suite with SqlStore
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/providers/utils/sqlstore/**'
-      - 'tests/integration/sqlstore/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - '.github/workflows/integration-sql-store-tests.yml' # This workflow
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  test-postgres:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12", "3.13"]
-      fail-fast: false
-
-    services:
-      postgres:
-        image: postgres:15
-        env:
-          POSTGRES_USER: llamastack
-          POSTGRES_PASSWORD: llamastack
-          POSTGRES_DB: llamastack
-        ports:
-          - 5432:5432
-        options: >-
-          --health-cmd pg_isready
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Run SqlStore Integration Tests
-        env:
-          ENABLE_POSTGRES_TESTS: "true"
-          POSTGRES_HOST: localhost
-          POSTGRES_PORT: 5432
-          POSTGRES_DB: llamastack
-          POSTGRES_USER: llamastack
-          POSTGRES_PASSWORD: llamastack
-        run: |
-          uv run pytest -sv tests/integration/providers/utils/sqlstore/
-
-      - name: Upload test logs
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -1,87 +0,0 @@
-name: Integration Tests (Replay)
-
-run-name: Run the integration test suite from tests/integration in replay mode
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    types: [opened, synchronize, reopened]
-    paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - '.github/workflows/integration-tests.yml' # This workflow
-      - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
-  schedule:
-    # If changing the cron schedule, update the provider in the test-matrix job
-    - cron: '0 0 * * *'  # (test latest client) Daily at 12 AM UTC
-    - cron: '1 0 * * 0'  # (test vllm) Weekly on Sunday at 1 AM UTC
-  workflow_dispatch:
-    inputs:
-      test-all-client-versions:
-        description: 'Test against both the latest and published versions'
-        type: boolean
-        default: false
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
-
-concurrency:
-  # Skip concurrency for pushes to main - each commit should be tested independently
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-
-  run-replay-mode-tests:
-    runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
-
-    strategy:
-      fail-fast: false
-      matrix:
-        client-type: [library, server]
-        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
-        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
-        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
-        with:
-          python-version: ${{ matrix.python-version }}
-          client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
-          inference-mode: 'replay'
-
-      - name: Run tests
-        uses: ./.github/actions/run-and-record-tests
-        with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
-          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
-          inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -1,203 +0,0 @@
-name: Vector IO Integration Tests
-
-run-name: Run the integration test suite with various VectorIO providers
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/integration/vector_io/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - '.github/workflows/integration-vector-io-tests.yml' # This workflow
-  schedule:
-    - cron: '0 0 * * *'  # (test on python 3.13) Daily at 12 AM UTC
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  test-matrix:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
-      fail-fast: false # we want to run all tests regardless of failure
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Setup Chroma
-        if: matrix.vector-io-provider == 'remote::chromadb'
-        run: |
-          docker run --rm -d --pull always \
-            --name chromadb \
-            -p 8000:8000 \
-            -v ~/chroma:/chroma/chroma \
-            -e IS_PERSISTENT=TRUE \
-            -e ANONYMIZED_TELEMETRY=FALSE \
-            chromadb/chroma:latest
-
-      - name: Setup Weaviate
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          docker run --rm -d --pull always \
-          --name weaviate \
-          -p 8080:8080 -p 50051:50051 \
-          cr.weaviate.io/semitechnologies/weaviate:1.32.0
-
-      - name: Start PGVector DB
-        if: matrix.vector-io-provider == 'remote::pgvector'
-        run: |
-          docker run -d \
-            --name pgvector \
-            -e POSTGRES_USER=llamastack \
-            -e POSTGRES_PASSWORD=llamastack \
-            -e POSTGRES_DB=llamastack \
-            -p 5432:5432 \
-            pgvector/pgvector:pg17
-
-      # Poll pg_isready inside the pgvector container (up to 30 attempts, 1s apart).
-      # The step fails on timeout so a database that never came up is reported here,
-      # matching the other readiness checks in this job.
-      - name: Wait for PGVector to be ready
-        if: matrix.vector-io-provider == 'remote::pgvector'
-        run: |
-          echo "Waiting for Postgres to be ready..."
-          for i in {1..30}; do
-            if docker exec pgvector pg_isready -U llamastack > /dev/null 2>&1; then
-              echo "Postgres is ready!"
-              exit 0
-            fi
-            echo "Not ready yet... ($i)"
-            sleep 1
-          done
-          echo "Postgres failed to start"
-          docker logs pgvector
-          exit 1
-
-      - name: Enable pgvector extension
-        if: matrix.vector-io-provider == 'remote::pgvector'
-        run: |
-          PGPASSWORD=llamastack psql -h localhost -U llamastack -d llamastack \
-            -c "CREATE EXTENSION IF NOT EXISTS vector;"
-
-      - name: Setup Qdrant
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          docker run --rm -d --pull always \
-            --name qdrant \
-            -p 6333:6333 \
-            qdrant/qdrant
-
-      - name: Wait for Qdrant to be ready
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          echo "Waiting for Qdrant to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:6333/collections | grep -q '"status":"ok"'; then
-              echo "Qdrant is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Qdrant failed to start"
-          docker logs qdrant
-          exit 1
-
-      - name: Wait for ChromaDB to be ready
-        if: matrix.vector-io-provider == 'remote::chromadb'
-        run: |
-          echo "Waiting for ChromaDB to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8000/api/v2/heartbeat | grep -q "nanosecond heartbeat"; then
-              echo "ChromaDB is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "ChromaDB failed to start"
-          docker logs chromadb
-          exit 1
-
-      - name: Wait for Weaviate to be ready
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          echo "Waiting for Weaviate to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8080 | grep -q "https://weaviate.io/developers/weaviate/current/"; then
-              echo "Weaviate is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Weaviate failed to start"
-          docker logs weaviate
-          exit 1
-
-      - name: Build Llama Stack
-        run: |
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
-
-      - name: Check Storage and Memory Available Before Tests
-        if: ${{ always() }}
-        run: |
-          free -h
-          df -h
-
-      # Run tests/integration/vector_io against the provider selected by the matrix.
-      # Each provider-specific variable below gets its value only when the matrix
-      # provider matches; for every other provider it is set to an empty string.
-      - name: Run Vector IO Integration Tests
-        env:
-          ENABLE_CHROMADB: ${{ matrix.vector-io-provider == 'remote::chromadb' && 'true' || '' }}
-          CHROMADB_URL: ${{ matrix.vector-io-provider == 'remote::chromadb' && 'http://localhost:8000' || '' }}
-          ENABLE_PGVECTOR: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'true' || '' }}
-          PGVECTOR_HOST: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'localhost' || '' }}
-          PGVECTOR_PORT: ${{ matrix.vector-io-provider == 'remote::pgvector' && '5432' || '' }}
-          PGVECTOR_DB: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
-          PGVECTOR_USER: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
-          PGVECTOR_PASSWORD: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
-          ENABLE_QDRANT: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'true' || '' }}
-          QDRANT_URL: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'http://localhost:6333' || '' }}
-          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
-          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
-        run: |
-          uv run --no-sync \
-            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
-            tests/integration/vector_io \
-            --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
-
-      - name: Check Storage and Memory Available After Tests
-        if: ${{ always() }}
-        run: |
-          free -h
-          df -h
-
-      # Replace every ':' in the matrix provider name with '_' and append the result
-      # to $GITHUB_ENV as SANITIZED_PROVIDER; the artifact upload step reads it via
-      # env.SANITIZED_PROVIDER when composing the artifact name.
-      - name: Create sanitized provider name
-        if: ${{ always() }}
-        run: |
-          echo "SANITIZED_PROVIDER=$(echo "${{ matrix.vector-io-provider }}" | tr ':' '_')" >> $GITHUB_ENV
-
-      # `docker logs` relays the container's stderr on its own stderr, so both streams
-      # are redirected; otherwise error output never reaches the uploaded *.log file.
-      - name: Write ChromaDB logs to file
-        if: ${{ always() && matrix.vector-io-provider == 'remote::chromadb' }}
-        run: |
-          docker logs chromadb > chromadb.log 2>&1
-
-      # `docker logs` relays the container's stderr on its own stderr, so both streams
-      # are redirected; otherwise error output never reaches the uploaded *.log file.
-      - name: Write Qdrant logs to file
-        if: ${{ always() && matrix.vector-io-provider == 'remote::qdrant' }}
-        run: |
-          docker logs qdrant > qdrant.log 2>&1
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: vector-io-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ env.SANITIZED_PROVIDER }}-${{ matrix.python-version }}
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,100 +1,25 @@
 name: Pre-commit

-run-name: Run pre-commit checks
-
 on:
  pull_request:
  push:
    branches: [main]

-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write

    steps:
      - name: Checkout code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          # For dependabot PRs, we need to checkout with a token that can push changes
-          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
-          # Fetch full history for dependabot PRs to allow commits
-          fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}
+        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0

      - name: Set up Python
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
        with:
-          python-version: '3.12'
+          python-version: '3.11'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

-      # npm ci may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-
-      # - name: Set up Node.js
-      #   uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
-      #   with:
-      #     node-version: '20'
-      #     cache: 'npm'
-      #     cache-dependency-path: 'llama_stack/ui/'
-
-      # - name: Install npm dependencies
-      #   run: npm ci
-      #   working-directory: llama_stack/ui
-
-      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        continue-on-error: true
-        env:
-          SKIP: no-commit-to-branch
-          RUFF_OUTPUT_FORMAT: github
-
-      - name: Debug
-        run: |
-          echo "github.ref: ${{ github.ref }}"
-          echo "github.actor: ${{ github.actor }}"
-
-      # On dependabot PRs, commit and push whatever pre-commit modified or created.
-      # The PR branch name is handed to the script through the HEAD_REF environment
-      # variable and always quoted, instead of being interpolated into the script text,
-      # so a crafted branch name cannot inject shell commands or split into words.
-      - name: Commit changes for dependabot PRs
-        if: github.actor == 'dependabot[bot]'
-        env:
-          HEAD_REF: ${{ github.head_ref }}
-        run: |
-          if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
-            git config --local user.email "github-actions[bot]@users.noreply.github.com"
-            git config --local user.name "github-actions[bot]"
-
-            # Ensure we're on the correct branch
-            git checkout -B "$HEAD_REF"
-            git add -A
-            git commit -m "Apply pre-commit fixes"
-
-            # Pull latest changes from the PR branch and rebase our commit on top
-            git pull --rebase origin "$HEAD_REF"
-
-            # Push to the PR branch
-            git push origin "$HEAD_REF"
-            echo "Pre-commit fixes committed and pushed"
-          else
-            echo "No changes to commit"
-          fi
-
-      - name: Verify if there are any diff files after pre-commit
-        if: github.actor != 'dependabot[bot]'
-        run: |
-          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
-
-      - name: Verify if there are any new files after pre-commit
-        if: github.actor != 'dependabot[bot]'
-        run: |
-          unstaged_files=$(git ls-files --others --exclude-standard)
-          if [ -n "$unstaged_files" ]; then
-            echo "There are uncommitted new files, run pre-commit locally and commit again"
-            echo "$unstaged_files"
-            exit 1
-          fi
+      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -1,154 +0,0 @@
-name: Test Llama Stack Build
-
-run-name: Test llama stack build
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - 'llama_stack/cli/stack/build.py'
-      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
-      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
-      - 'pyproject.toml'
-
-  pull_request:
-    paths:
-      - 'llama_stack/cli/stack/build.py'
-      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
-      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
-      - 'pyproject.toml'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  # Produces the `distros` output: a compact JSON array holding the directory name of
-  # every llama_stack/distributions/<name>/*build.yaml file (awk prints the
-  # second-to-last path component; jq splits on newlines and drops the last element).
-  generate-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      distros: ${{ steps.set-matrix.outputs.distros }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Generate Distribution List
-        id: set-matrix
-        run: |
-          distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
-          echo "distros=$distros" >> "$GITHUB_OUTPUT"
-
-  # Builds every distribution listed by generate-matrix, once as a venv and once as a
-  # container image. Each combination first prints its dependencies, then runs the
-  # real build from the checked-out source tree.
-  build:
-    needs: generate-matrix
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
-        image-type: [venv, container]
-      fail-fast: false # We want to run all jobs even if some fail
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Print build dependencies
-        run: |
-          uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
-
-      - name: Run Llama Stack Build
-        run: |
-          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
-          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
-
-      # The package listing is only produced for venv builds.
-      - name: Print dependencies in the image
-        if: matrix.image-type == 'venv'
-        run: |
-          uv pip list
-
-  build-single-provider:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Build a single provider
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
-
-  build-custom-container-distribution:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      # Rewrite the ci-tests build config to produce a container image named "test",
-      # then build it from the checked-out source tree.
-      - name: Build a custom container distribution
-        run: |
-          yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
-          yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
-
-      # Check that the image built above starts the Llama Stack server.
-      # The image under test is taken to be the first entry `docker images` prints;
-      # the step fails early with a clear message when no image is listed at all.
-      - name: Inspect the container image entrypoint
-        run: |
-          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
-          if [ -z "$IMAGE_ID" ]; then
-            echo "No container image found"
-            exit 1
-          fi
-          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' "$IMAGE_ID")
-          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
-            echo "Entrypoint is not correct"
-            exit 1
-          fi
-
-  build-ubi9-container-distribution:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Pin distribution to UBI9 base
-        run: |
-          yq -i '
-            .image_type    = "container" |
-            .image_name    = "ubi9-test" |
-            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/distributions/ci-tests/build.yaml
-
-      - name: Build dev container (UBI9)
-        env:
-          USE_COPY_NOT_MOUNT: "true"
-          LLAMA_STACK_DIR: "."
-        run: |
-          uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
-
-      # Check the UBI9 build twice: the entrypoint must start the Llama Stack server,
-      # and the ID field of /etc/os-release inside the image must be `rhel` or `ubi`.
-      # The image under test is taken to be the first entry `docker images` prints;
-      # the step fails early with a clear message when no image is listed at all.
-      - name: Inspect UBI9 image
-        run: |
-          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
-          if [ -z "$IMAGE_ID" ]; then
-            echo "No container image found"
-            exit 1
-          fi
-          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' "$IMAGE_ID")
-          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
-            echo "Entrypoint is not correct"
-            exit 1
-          fi
-
-          echo "Checking /etc/os-release in $IMAGE_ID"
-          docker run --rm --entrypoint sh "$IMAGE_ID" -c \
-              'source /etc/os-release && echo "$ID"' \
-              | grep -qE '^(rhel|ubi)$' \
-              || { echo "Base image is not UBI 9!"; exit 1; }
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -1,49 +0,0 @@
-name: Python Package Build Test
-
-run-name: Test building the llama-stack PyPI project
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-    paths-ignore:
-        - 'llama_stack/ui/**'
-
-# Build the package with uv on Python 3.12 and 3.13, install the resulting wheel,
-# and smoke-test the installed `llama` CLI with a few read-only commands.
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['3.12', '3.13']
-
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-    - name: Install uv
-      uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0
-      with:
-        python-version: ${{ matrix.python-version }}
-        activate-environment: true
-        version: 0.7.6
-
-    - name: Build Llama Stack package
-      run: |
-        uv build
-
-    - name: Install Llama Stack package
-      run: |
-        uv pip install dist/*.whl
-
-    # `command -v llama` fails the step if the CLI entry point was not installed.
-    - name: Verify Llama Stack package
-      run: |
-        uv pip list
-        uv pip show llama-stack
-        command -v llama
-        llama model prompt-format -m Llama3.2-90B-Vision-Instruct
-        llama model list
-        llama stack list-apis
-        llama stack list-providers inference
--- a/.github/workflows/record-integration-tests.yml
+++ b/.github/workflows/record-integration-tests.yml
@ -1,70 +0,0 @@
-# This workflow should be run manually when needing to re-record tests. This happens when you have
-#  - added a new test
-#  - or changed an existing test such that a new inference call is made
-# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
-# tests and commit the recordings to the PR branch.
-name: Integration Tests (Record)
-
-run-name: Run the integration test suite from tests/integration
-
-on:
-  workflow_dispatch:
-    inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
-
-jobs:
-  record-tests:
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: write
-
-    steps:
-      # Print the dispatch inputs and branch inside a collapsible log group.
-      # The values reach the script as environment variables rather than being
-      # interpolated into its text, so quotes, `$` or backticks in an input (for
-      # example in the test-pattern regex) cannot break or alter the script.
-      - name: Echo workflow inputs
-        env:
-          TEST_SUBDIRS: ${{ inputs.test-subdirs }}
-          TEST_PROVIDER: ${{ inputs.test-provider }}
-          RUN_VISION_TESTS: ${{ inputs.run-vision-tests }}
-          TEST_PATTERN: ${{ inputs.test-pattern }}
-          BRANCH_NAME: ${{ github.ref_name }}
-        run: |
-          echo "::group::Workflow Inputs"
-          echo "test-subdirs: $TEST_SUBDIRS"
-          echo "test-provider: $TEST_PROVIDER"
-          echo "run-vision-tests: $RUN_VISION_TESTS"
-          echo "test-pattern: $TEST_PATTERN"
-          echo "branch: $BRANCH_NAME"
-          echo "::endgroup::"
-
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          fetch-depth: 0
-
-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
-        with:
-          python-version: "3.12"  # Use single Python version for recording
-          client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
-          inference-mode: 'record'
-
-      - name: Run and record tests
-        uses: ./.github/actions/run-and-record-tests
-        with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
-          stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
--- a/.github/workflows/semantic-pr.yml
+++ b/.github/workflows/semantic-pr.yml
@ -1,27 +0,0 @@
-name: Check semantic PR titles
-
-run-name: Ensure that PR titles follow the conventional commit spec
-
-on:
-  pull_request_target:
-    types:
-      - opened
-      - edited
-      - reopened
-      - synchronize
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-# Single job: run amannn/action-semantic-pull-request, which receives the workflow's
-# GITHUB_TOKEN through the environment.
-jobs:
-  title-check:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@ -1,47 +0,0 @@
-name: Close stale issues and PRs
-
-run-name: Run the Stale Bot action
-
-on:
-  schedule:
-    - cron: '0 0 * * *' # every day at midnight
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-# Single job: label issues and pull requests 'stale' after 60 days and close them
-# 30 days later, using the messages configured below. The job is granted write access
-# to issues and pull requests, on top of the workflow-level read-only contents scope.
-jobs:
-  stale:
-    permissions:
-      issues: write
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stale Action
-        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
-        with:
-          stale-issue-label: 'stale'
-          stale-issue-message: >
-            This issue has been automatically marked as stale because it has not had activity within 60 days.
-            It will be automatically closed if no further activity occurs within 30 days.
-          close-issue-message: >
-            This issue has been automatically closed due to inactivity.
-            Please feel free to reopen if you feel it is still relevant!
-          days-before-issue-stale: 60
-          days-before-issue-close: 30
-          stale-pr-label: 'stale'
-          stale-pr-message: >
-            This pull request has been automatically marked as stale because it has not had activity within 60 days.
-            It will be automatically closed if no further activity occurs within 30 days.
-          close-pr-message: >
-            This pull request has been automatically closed due to inactivity.
-            Please feel free to reopen if you intend to continue working on it!
-          days-before-pr-stale: 60
-          days-before-pr-close: 30
-          operations-per-run: 300
--- a/.github/workflows/test-external-provider-module.yml
+++ b/.github/workflows/test-external-provider-module.yml
@ -1,86 +0,0 @@
-name: Test External Providers Installed via Module
-
-run-name: Test External Provider installation via Python module
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'tests/external/*'
-      - '.github/workflows/test-external-provider-module.yml' # This workflow
-
-jobs:
-  test-external-providers-from-module:
-    # This workflow is disabled. See https://github.com/meta-llama/llama-stack/pull/2975#issuecomment-3138702984 for details
-    if: false
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        image-type: [venv]
-        # We don't do container yet, it's tricky to install a package from the host into the
-        # container and point 'uv pip install' to the correct path...
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Install Ramalama
-        shell: bash
-        run: |
-          uv pip install ramalama
-
-      - name: Run Ramalama
-        shell: bash
-        run: |
-          nohup ramalama serve llama3.2:3b-instruct-fp16  > ramalama_server.log 2>&1 &
-      - name: Apply image type to config file
-        run: |
-          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
-          cat tests/external/ramalama-stack/run.yaml
-
-      - name: Build distro from config file
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
-
-      # Start the server in the background, but only for venv builds. The whole
-      # comparison has to sit inside ${{ }}: with `== 'venv'` outside the braces the
-      # condition is a non-empty string, which is always true.
-      - name: Start Llama Stack server in background
-        if: ${{ matrix.image-type == 'venv' }}
-        env:
-          INFERENCE_MODEL: "llama3.2:3b-instruct-fp16"
-          LLAMA_STACK_LOG_FILE: "server.log"
-        run: |
-          # Use the virtual environment created by the build step (name comes from build config)
-          source ramalama-stack-test/bin/activate
-          uv pip list
-          nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
-
-      - name: Wait for Llama Stack server to be ready
-        run: |
-          for i in {1..30}; do
-            if ! grep -q "successfully connected to Ramalama" server.log; then
-              echo "Waiting for Llama Stack server to load the provider..."
-              sleep 1
-            else
-              echo "Provider loaded"
-              exit 0
-            fi
-          done
-          echo "Provider failed to load"
-          cat server.log
-          exit 1
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-provider-module-test
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/test-external.yml
+++ b/.github/workflows/test-external.yml
@ -1,89 +0,0 @@
-name: Test External API and Providers
-
-run-name: Test the External API and Provider mechanisms
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - 'tests/external/*'
-      - '.github/workflows/test-external.yml' # This workflow
-
-jobs:
-  test-external:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        image-type: [venv]
-        # We don't do container yet, it's tricky to install a package from the host into the
-        # container and point 'uv pip install' to the correct path...
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Create API configuration
-        run: |
-          mkdir -p /home/runner/.llama/apis.d
-          cp tests/external/weather.yaml /home/runner/.llama/apis.d/weather.yaml
-
-      - name: Create provider configuration
-        run: |
-          mkdir -p /home/runner/.llama/providers.d/remote/weather
-          cp tests/external/kaze.yaml /home/runner/.llama/providers.d/remote/weather/kaze.yaml
-
-      - name: Print distro dependencies
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
-
-      - name: Build distro from config file
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
-
-      # Start the server in the background, but only for venv builds. The whole
-      # comparison has to sit inside ${{ }}: with `== 'venv'` outside the braces the
-      # condition is a non-empty string, which is always true.
-      - name: Start Llama Stack server in background
-        if: ${{ matrix.image-type == 'venv' }}
-        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-          LLAMA_STACK_LOG_FILE: "server.log"
-        run: |
-          # Use the virtual environment created by the build step (name comes from build config)
-          source ci-test/bin/activate
-          uv pip list
-          nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
-
-      - name: Wait for Llama Stack server to be ready
-        run: |
-          echo "Waiting for Llama Stack server..."
-          for i in {1..30}; do
-            if curl -sSf http://localhost:8321/v1/health | grep -q "OK"; then
-              echo "Llama Stack server is up!"
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Llama Stack server failed to start"
-          cat server.log
-          exit 1
-
-      - name: Test external API
-        run: |
-          curl -sSf http://localhost:8321/v1/weather/locations
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-test
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/ui-unit-tests.yml
+++ b/.github/workflows/ui-unit-tests.yml
@ -1,55 +0,0 @@
-name: UI Tests
-
-run-name: Run the UI test suite
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/ui/**'
-      - '.github/workflows/ui-unit-tests.yml' # This workflow
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-# Single job on Node 22: install the UI's npm dependencies with `npm ci`, then run
-# lint, the format check and the unit tests, all inside llama_stack/ui.
-jobs:
-  ui-tests:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        node-version: [22]
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      # The npm cache is keyed on the UI's package-lock.json.
-      - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
-        with:
-          node-version: ${{ matrix.node-version }}
-          cache: 'npm'
-          cache-dependency-path: 'llama_stack/ui/package-lock.json'
-
-      - name: Install dependencies
-        working-directory: llama_stack/ui
-        run: npm ci
-
-      - name: Run linting
-        working-directory: llama_stack/ui
-        run: npm run lint
-
-      - name: Run format check
-        working-directory: llama_stack/ui
-        run: npm run format:check
-
-      # Tests run once (no watch mode) with coverage, and pass even if no tests exist.
-      - name: Run unit tests
-        working-directory: llama_stack/ui
-        env:
-          CI: true
-
-        run: npm test -- --coverage --watchAll=false --passWithNoTests
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -1,55 +0,0 @@
-name: Unit Tests
-
-run-name: Run the unit test suite
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/unit/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - '.github/workflows/unit-tests.yml' # This workflow
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  unit-tests:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python:
-          - "3.12"
-          - "3.13"
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-        with:
-          python-version: ${{ matrix.python }}
-
-      - name: Run unit tests
-        run: |
-          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml
-
-      - name: Upload test results
-        if: always()
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: test-results-${{ matrix.python }}
-          path: |
-            .pytest_cache/
-            pytest-report-${{ matrix.python }}.xml
-            htmlcov-${{ matrix.python }}/
-          retention-days: 7
--- a/.github/workflows/update-readthedocs.yml
+++ b/.github/workflows/update-readthedocs.yml
@ -1,70 +0,0 @@
-name: Update ReadTheDocs
-
-run-name: Update the Llama Stack ReadTheDocs site
-
-on:
-  workflow_dispatch:
-    inputs:
-      branch:
-        description: 'RTD version to update'
-        required: false
-        default: 'latest'
-  push:
-    branches:
-      - main
-    paths:
-      - 'docs/**'
-      - 'pyproject.toml'
-      - '.github/workflows/update-readthedocs.yml'
-    tags:
-      - '*'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'docs/**'
-      - 'pyproject.toml'
-      - '.github/workflows/update-readthedocs.yml'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  update-readthedocs:
-    runs-on: ubuntu-latest
-    env:
-      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Build HTML
-        run: |
-          cd docs
-          uv run make html
-
-      - name: Trigger ReadTheDocs build
-        if: github.event_name != 'pull_request'
-        run: |
-          if [ -z "$TOKEN" ]; then
-            echo "READTHEDOCS_TOKEN is not set"
-            exit 1
-          fi
-
-          response=$(curl -X POST \
-            -H "Content-Type: application/json" \
-            -d "{
-              \"token\": \"$TOKEN\",
-              \"version\": \"$GITHUB_REF_NAME\"
-            }" \
-            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
-
-          echo "Response: $response"
-          if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
-            echo "Failed to trigger ReadTheDocs build"
-            exit 1
-          fi
--- a/.gitignore
+++ b/.gitignore
@ -6,7 +6,6 @@ dev_requirements.txt
 build
 .DS_Store
 llama_stack/configs/*
-.cursor/
 xcuserdata/
 *.hmap
 .DS_Store
@ -18,13 +17,3 @@ Package.resolved
 .venv/
 .vscode
 _build
-docs/src
-# Sample tool-calling datasets generated by NVIDIA notebooks
-docs/notebooks/nvidia/tool_calling/sample_data/
-pyrightconfig.json
-venv/
-pytest-report.xml
-.coverage
-.python-version
-CLAUDE.md
-.claude/
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "llama_stack/providers/impls/ios/inference/executorch"]
+	path = llama_stack/providers/inline/ios/inference/executorch
+	url = https://github.com/pytorch/executorch
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,35 +1,26 @@
 exclude: 'build/'

 default_language_version:
-    python: python3.12
-    node: "22"
+    python: python3

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0  # Latest stable version
+    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
    hooks:
-    -   id: check-merge-conflict
-        args: ['--assume-in-merge']
    -   id: trailing-whitespace
-        exclude: '\.py$'  # Exclude Python files as Ruff already handles them
+    -   id: check-ast
+    -   id: check-merge-conflict
    -   id: check-added-large-files
        args: ['--maxkb=1000']
    -   id: end-of-file-fixer
-        exclude: '^(.*\.svg|.*\.md)$'
-    -   id: no-commit-to-branch
-    -   id: check-yaml
-        args: ["--unsafe"]
-    -   id: detect-private-key
-    -   id: mixed-line-ending
-        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
-    -   id: check-executables-have-shebangs
-    -   id: check-json
-    -   id: check-shebang-scripts-are-executable
-    -   id: check-symlinks
-    -   id: check-toml
+        exclude: '^(.*\.svg)$'
+
+# Temporarily disabling this
+#    -   id: no-commit-to-branch
+#        args: ['--branch=main']

 -   repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.5
+    rev: v1.5.4
    hooks:
    -   id: insert-license
        files: \.py$|\.sh$
@ -37,38 +28,29 @@ repos:
          - --license-filepath
          - docs/license_header.txt

-   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+-   repo: https://github.com/pycqa/flake8
+    rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
    hooks:
-    -   id: ruff
-        args: [ --fix ]
-        exclude: ^llama_stack/strong_typing/.*$
-    -   id: ruff-format
-
-   repo: https://github.com/adamchainz/blacken-docs
-    rev: 1.19.1
-    hooks:
-    -   id: blacken-docs
+    -   id: flake8
        additional_dependencies:
-        - black==24.3.0
+          - flake8-bugbear == 22.4.25
+          - pep8-naming == 0.12.1
+          - torchfix
+        args: ['--config=.flake8']

-   repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.7.20
+-   repo: https://github.com/omnilib/ufmt
+    rev: v2.7.0
    hooks:
-    -   id: uv-lock
-
-   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.1
-    hooks:
-    -   id: mypy
+    -   id: ufmt
        additional_dependencies:
-          - uv==0.6.2
-          - mypy
-          - pytest
-          - rich
-          - types-requests
-          - pydantic
-        pass_filenames: false
+          - black == 24.4.2
+          - usort == 1.0.8
+
+# - repo: https://github.com/jsh9/pydoclint
+#   rev: d88180a8632bb1602a4d81344085cf320f288c5a
+#   hooks:
+#     - id: pydoclint
+#       args: [--config=pyproject.toml]

 # - repo: https://github.com/tcort/markdown-link-check
 #   rev: v3.11.2
@ -76,126 +58,16 @@ repos:
 #     - id: markdown-link-check
 #       args: ['--quiet']

-   repo: local
-    hooks:
-      - id: distro-codegen
-        name: Distribution Template Codegen
-        additional_dependencies:
-          - uv==0.7.8
-        entry: uv run --group codegen ./scripts/distro_codegen.py
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
-      - id: provider-codegen
-        name: Provider Codegen
-        additional_dependencies:
-          - uv==0.7.8
-        entry: uv run --group codegen ./scripts/provider_codegen.py
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^llama_stack/providers/.*$
-      - id: openapi-codegen
-        name: API Spec Codegen
-        additional_dependencies:
-          - uv==0.7.8
-        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^llama_stack/apis/|^docs/openapi_generator/
-      - id: check-workflows-use-hashes
-        name: Check GitHub Actions use SHA-pinned actions
-        entry: ./scripts/check-workflows-use-hashes.sh
-        language: system
-        pass_filenames: false
-        require_serial: true
-        always_run: true
-        files: ^\.github/workflows/.*\.ya?ml$
-      - id: check-init-py
-        name: Check for missing __init__.py files
-        entry: ./scripts/check-init-py.sh
-        language: system
-        pass_filenames: false
-        require_serial: true
-        always_run: true
-        files: ^llama_stack/.*$
-      - id: forbid-pytest-asyncio
-        name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            grep -EnH '^[^#]*@pytest\.mark\.asyncio|@pytest_asyncio\.fixture' "$@" && {
-              echo;
-              echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
-              echo "   pytest is already configured with async-mode=auto."
-              echo;
-              exit 1;
-            } || true
-      - id: generate-ci-docs
-        name: Generate CI documentation
-        additional_dependencies:
-          - uv==0.7.8
-        entry: uv run ./scripts/gen-ci-docs.py
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^.github/workflows/.*$
-      # ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-      # and until we have infra for installing prettier and next via npm -
-      #   Lint UI code with ESLint.....................................................Failed
-      #   - hook id: ui-eslint
-      #   - exit code: 127
-      #   > ui@0.1.0 lint
-      #   > next lint --fix --quiet
-      #   sh: line 1: next: command not found
-      #
-      # - id: ui-prettier
-      #   name: Format UI code with Prettier
-      #   entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-      # - id: ui-eslint
-      #   name: Lint UI code with ESLint
-      #   entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-
-      - id: check-log-usage
-        name: Ensure 'llama_stack.log' usage for logging
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
-            if [ -n "$matches" ]; then
-              # GitHub Actions annotation format
-              while IFS=: read -r file line_num rest; do
-                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
-              done <<< "$matches"
-              exit 1
-            fi
-            exit 0
-
-ci:
-    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
-    autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
-    autofix_prs: true
-    autoupdate_branch: ''
-    autoupdate_schedule: weekly
-    skip: []
-    submodules: false
+# -   repo: local
+#     hooks:
+#       - id: distro-codegen
+#         name: Distribution Template Codegen
+#         additional_dependencies:
+#           - rich
+#           - pydantic
+#         entry: python -m llama_stack.scripts.distro_codegen
+#         language: python
+#         pass_filenames: false
+#         require_serial: true
+#         files: ^llama_stack/templates/.*$
+#         stages: [manual]
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -5,21 +5,28 @@
 # Required
 version: 2

-# Build documentation in the "docs/" directory with Sphinx
-sphinx:
-  configuration: docs/source/conf.py
-
 # Set the OS, Python version and other tools you might need
 build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
-  jobs:
-    pre_create_environment:
-      - asdf plugin add uv
-      - asdf install uv latest
-      - asdf global uv latest
-    create_environment:
-      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
   install:
-      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
+   - requirements: docs/requirements.txt
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,516 +1,35 @@
 # Changelog

-# v0.2.15
-Published on: 2025-07-16T03:30:01Z
-
-
-
---
-
-# v0.2.14
-Published on: 2025-07-04T16:06:48Z
-
-## Highlights
-
-* Support for Llama Guard 4
-* Added Milvus  support to vector-stores API
-* Documentation and zero-to-hero updates for latest APIs
-
-
---
-
-# v0.2.13
-Published on: 2025-06-28T04:28:11Z
-
-## Highlights
-* search_mode support in OpenAI vector store API
-* Security fixes
-
-
---
-
-# v0.2.12
-Published on: 2025-06-20T22:52:12Z
-
-## Highlights
-* Filter support in file search
-* Support auth attributes in inference and response stores
-
-
---
-
-# v0.2.11
-Published on: 2025-06-17T20:26:26Z
-
-## Highlights
-* OpenAI-compatible vector store APIs
-* Hybrid Search in Sqlite-vec
-* File search tool in Responses API
-* Pagination in inference and response stores
-* Added `suffix` to completions API for fill-in-the-middle tasks
-
-
---
-
-# v0.2.10.1
-Published on: 2025-06-06T20:11:02Z
-
-## Highlights
-* ChromaDB provider fix
-
-
---
-
-# v0.2.10
-Published on: 2025-06-05T23:21:45Z
-
-## Highlights
-
-* OpenAI-compatible embeddings API
-* OpenAI-compatible Files API
-* Postgres support in starter distro
-* Enable ingestion of precomputed embeddings
-* Full multi-turn support in Responses API
-* Fine-grained access control policy
-
-
---
-
-# v0.2.9
-Published on: 2025-05-30T20:01:56Z
-
-## Highlights
-* Added initial streaming support in Responses API
-* UI view for Responses
-* Postgres inference store support
-
-
---
-
-# v0.2.8
-Published on: 2025-05-27T21:03:47Z
-
-# Release v0.2.8
-
-## Highlights
-
-* Server-side MCP with auth firewalls now works in the Stack - both for Agents and Responses
-* Get chat completions APIs and UI to show chat completions
-* Enable keyword search for sqlite-vec
-
-
---
-
-# v0.2.7
-Published on: 2025-05-16T20:38:10Z
-
-## Highlights
-
-This is a small update. But a couple highlights:
-
-* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
-* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
-* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
-
-
---
-
-# v0.2.6
-Published on: 2025-05-12T18:06:52Z
-
-
-
---
-
-# v0.2.5
-Published on: 2025-05-04T20:16:49Z
-
-
-
---
-
-# v0.2.4
-Published on: 2025-04-29T17:26:01Z
-
-## Highlights
-
-* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
-* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
-* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
-* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
-* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
-
-
---
-
-# v0.2.3
-Published on: 2025-04-25T22:46:21Z
-
-## Highlights
-
-* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
-* significant improvements and functionality added to the nVIDIA distribution
-* many improvements to the test verification suite.
-* new inference providers: Ramalama, IBM WatsonX
-* many improvements to the Playground UI
-
-
---
-
-# v0.2.2
-Published on: 2025-04-13T01:19:49Z
-
-## Main changes
-
- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes
-
-
---
-
-# v0.2.1
-Published on: 2025-04-05T23:13:00Z
-
-
-
---
-
-# v0.2.0
-Published on: 2025-04-05T19:04:29Z
-
-## Llama 4 Support
-
-Checkout more at https://www.llama.com
-
-
-
---
-
-# v0.1.9
-Published on: 2025-03-29T00:52:23Z
-
-### Build and Test Agents
-* Agents: Entire document context with attachments
-* RAG: Documentation with sqlite-vec faiss comparison
-* Getting started: Fixes to getting started notebook.
-
-### Agent Evals and Model Customization
-* (**New**) Post-training: Add nemo customizer
-
-### Better Engineering
-* Moved sqlite-vec to non-blocking calls
-* Don't return a payload on file delete
-
-
-
---
-
-# v0.1.8
-Published on: 2025-03-24T01:28:50Z
-
-# v0.1.8 Release Notes
-
-### Build and Test Agents
-* Safety: Integrated NVIDIA as a safety provider.
-* VectorDB: Added Qdrant as an inline provider.
-* Agents: Added support for multiple tool groups in agents.
-* Agents: Simplified imports for Agents in client package
-
-
-### Agent Evals and Model Customization
-* Introduced DocVQA and IfEval benchmarks.
-
-### Deploying and Monitoring Agents
-* Introduced a Containerfile and image workflow for the Playground.
-* Implemented support for Bearer (API Key) authentication.
-* Added attribute-based access control for resources.
-* Fixes on docker deployments: use --pull always and standardized the default port to 8321
-* Deprecated: /v1/inspect/providers use /v1/providers/ instead
-
-### Better Engineering
-* Consolidated scripts under the ./scripts directory.
-* Addressed mypy violations in various modules.
-* Added Dependabot scans for Python dependencies.
-* Implemented a scheduled workflow to update the changelog automatically.
-* Enforced concurrency to reduce CI loads.
-
-
-### New Contributors
-* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
-* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
-* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
-* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
-
---
-
-# v0.1.7
-Published on: 2025-03-14T22:30:51Z
-
-## 0.1.7 Release Notes
-
-###  Build and Test Agents
-* Inference: ImageType is now refactored to LlamaStackImageType
-* Inference: Added tests to measure TTFT
-* Inference: Bring back usage metrics
-* Agents: Added endpoint for get agent, list agents and list sessions
-* Agents: Automated conversion of type hints in client tool for lite llm format
-* Agents: Deprecated ToolResponseMessage in agent.resume API
-* Added Provider API for listing and inspecting provider info
-
-### Agent Evals and Model Customization
-* Eval: Added new eval benchmarks Math 500 and BFCL v3
-
-### Deploy and Monitoring of Agents
-* Telemetry: Fix tracing to work across coroutines
-
-###  Better Engineering
-* Display code coverage for unit tests
-* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
-* Unit tests also run on Python 3.11, 3.12, and 3.13
-* Added ollama inference to Integration tests CI
-* Improved documentation across examples, testing, and CLI; updated providers table
-
-
-
-
---
-
-# v0.1.6
-Published on: 2025-03-08T04:35:08Z
-
-## 0.1.6 Release Notes
-
-### Build and Test Agents
-* Inference: Fixed support for inline vllm provider
-* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
-* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
-* Agent: Unify tools and Python SDK Agents API
-* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
-* Agent: Support python functions without @client_tool decorator as client tools
-* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
-* VectorIO: MilvusDB support added
-
-### Agent Evals and Model Customization
-* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
-* Eval: Documentation for eval, scoring, adding new benchmarks
-* Eval: Distribution template to run benchmarks on llama & non-llama models
-* Eval: Ability to register new custom LLM-as-judge scoring functions
-* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
-
-### Deploy and Monitoring of Agents
-* Better support for different log levels across all components for better monitoring
-
-### Better Engineering
-* Enhance OpenAPI spec to include Error types across all APIs
-* Moved all tests to /tests and created unit tests to run on each PR
-* Removed all dependencies on llama-models repo
-
-
---
-
-# v0.1.5.1
-Published on: 2025-02-28T22:37:44Z
-
-## 0.1.5.1 Release Notes
-* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
-
---
-
-# v0.1.5
-Published on: 2025-02-28T18:14:01Z
-
-## 0.1.5 Release Notes
-###  Build Agents
-* Inference: Support more non-llama models (openai, anthropic, gemini)
-* Inference: Can use the provider's model name in addition to the HF alias
-* Inference: Fixed issues with calling tools that weren't specified in the prompt
-* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
-* Embeddings: Added support for Nemo retriever embedding models
-* Tools: Added support for MCP tools in Ollama Distribution
-* Distributions: Added new Groq distribution
-
-### Customize Models
-* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
-
-### Monitor agents
-* More comprehensive logging of agent steps including client tools
-* Telemetry inputs/outputs are now structured and queryable
-* Ability to retrieve agents session, turn, step by ids
-
-### Better Engineering
-* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
-* Move most logging to use logger instead of prints
-* Completed text /chat-completion and /completion tests
-
-
---
-
-# v0.1.4
-Published on: 2025-02-25T00:02:43Z
-
-## v0.1.4 Release Notes
-Here are the key changes coming as part of this release:
-
-### Build and Test Agents
-* Inference: Added support for non-llama models
-* Inference: Added option to list all downloaded models and remove models
-* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
-* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
-* Agent: Added logging for agent step start and completion times
-* Agent: Added support for logging for tool execution metadata
-* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
-* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
-* VectorIO: Improved performance of sqlite-vec using chunked writes
-### Agent Evals and Model Customization
-* Deprecated api /eval-tasks. Use /eval/benchmark  instead
-* Added CPU training support for TorchTune
-### Deploy and Monitoring of Agents
-* Consistent view of client and server tool calls in telemetry
-### Better Engineering
-* Made tests more data-driven for consistent evaluation
-* Fixed documentation links and improved API reference generation
-* Various small fixes for build scripts and system reliability
-
-
-
---
-
-# v0.1.3
-Published on: 2025-02-14T20:24:32Z
-
-## v0.1.3 Release
-
-Here are some key changes that are coming as part of this release.
-
-### Build and Test Agents
-Streamlined the initial development experience
- Added support for  llama stack run --image-type venv
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
- vLLM improvements for tool calling and logprobs
- Better handling of sporadic code_interpreter tool calls
-
-### Agent Evals
-Better benchmarking and Agent performance assessment
- Renamed eval API /eval-task to /benchmarks
- Improved documentation and notebooks for RAG and evals
-
-### Deploy and Monitoring of Agents
-Improved production readiness
- Added usage metrics collection for chat completions
- CLI improvements for provider information
- Improved error handling and system reliability
- Better model endpoint handling and accessibility
- Improved signal handling on distro server
-
-### Better Engineering
-Infrastructure and code quality improvements
- Faster text-based chat completion tests
- Improved testing for non-streaming agent apis
- Standardized import formatting with ruff linter
- Added conventional commits standard
- Fixed documentation parsing issues
-
-
---
-
-# v0.1.2
-Published on: 2025-02-07T22:06:49Z
-
-# TL;DR
- Several stabilizations to development flows after the switch to `uv`
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
- Added automated rebuilds for ReadTheDocs
- Llama Stack server supports HTTPS
- Added system prompt overrides support
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
-
-
---
-
-# v0.1.1
-Published on: 2025-02-02T02:29:24Z
-
-A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
-
-
---
-
-# v0.1.0
-Published on: 2025-01-24T17:47:47Z
-
-We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate the agent with scoring functions.
-
-## Context
-GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
-
-Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
-
-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and perform custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackaged distributions, you can choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
-
-## Release
-After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
-
-There are example standalone apps in llama-stack-apps.
-
-
-## Key Features of this release
-
- **Unified API Layer**
-  - Inference: Run LLM models
-  - RAG: Store and retrieve knowledge for RAG
-  - Agents: Build multi-step agentic workflows
-  - Tools: Register tools that can be called by the agent
-  - Safety: Apply content filtering and safety policies
-  - Evaluation: Test model and agent quality
-  - Telemetry: Collect and analyze usage data and complex agentic traces
-  - Post Training ( Coming Soon ): Fine tune models for specific use cases
-
- **Rich Provider Ecosystem**
-  - Local Development: Meta's Reference, Ollama
-  - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
-  - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
-  - On-device: iOS and Android support
-
- **Built for Production**
-  - Pre-packaged distributions for common deployment scenarios
-  - Backwards compatibility across model versions
-  - Comprehensive evaluation capabilities
-  - Full observability and monitoring
-
- **Multiple developer interfaces**
-  - CLI: Command line interface
-  - Python SDK
-  - Swift iOS SDK
-  - Kotlin Android SDK
-
- **Sample llama stack applications**
-  - Python
-  - iOS
-  - Android
-
-
-
---
-
-# v0.1.0rc12
-Published on: 2025-01-22T22:24:01Z
-
-
-
---
-
-# v0.0.63
-Published on: 2024-12-18T07:17:43Z
-
-A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
-
---
-
+## 0.0.53
+
+### Added
+- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
+- Persistence for registered objects with distribution
+- Ability to persist memory banks created for FAISS
+- PostgreSQL KVStore implementation
+- Environment variable placeholder support in run.yaml files
+- Comprehensive Zero-to-Hero notebooks and quickstart guides
+- Support for quantized models in Ollama
+- Vision models support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
+- Bedrock distribution with safety shields support
+- Evals API with task registration and scoring functions
+- MMLU and SimpleQA benchmark scoring functions
+- Huggingface dataset provider integration for benchmarks
+- Support for custom dataset registration from local paths
+- Benchmark evaluation CLI tools with visualization tables
+- RAG evaluation scoring functions and metrics
+- Local persistence for datasets and eval tasks
+
+### Changed
+- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
+- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
+- Updated API signatures for dataset and eval task registration
+- Restructured folder organization for providers
+- Enhanced Docker build configuration
+- Added version prefixing for REST API routes
+- Enhanced evaluation task registration workflow
+- Improved benchmark evaluation output formatting
+- Restructured evals folder organization for better modularity
+
+### Removed
+- `llama stack configure` command
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,69 +1,56 @@
-# Contributing to Llama Stack
+# Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.

-## Set up your development environment
+## Pull Requests
+We actively welcome your pull requests.

-We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
-You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").

-You can install the dependencies by running:
+
+### Updating Provider Configurations
+
+If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
+
+### Building the Documentation
+
+If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

 ```bash
-cd llama-stack
-uv sync --group dev
-uv pip install -e .
-source .venv/bin/activate
+cd llama-stack/docs
+pip install -r requirements.txt
+pip install sphinx-autobuild
+
+# Build the HTML once, then start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
+make html
+sphinx-autobuild source build/html
 ```

-```{note}
-You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
-Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
-For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
-```
-
-Note that you can create a dotenv file `.env` that includes necessary environment variables:
-```
-LLAMA_STACK_BASE_URL=http://localhost:8321
-LLAMA_STACK_CLIENT_LOG=debug
-LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=<provider-name>
-TAVILY_SEARCH_API_KEY=
-BRAVE_SEARCH_API_KEY=
-```
-
-And then use this dotenv file when running client SDK tests via the following:
-```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
-```
-
-### Pre-commit Hooks
+## Pre-commit Hooks

 We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:

 ```bash
-uv run pre-commit install
+$ cd llama-stack
+$ conda activate <your-environment>
+$ pip install pre-commit
+$ pre-commit install
 ```

 After that, pre-commit hooks will run automatically before each commit.

-Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.

-```bash
-uv run pre-commit run --all-files
-```
+Complete your CLA here: <https://code.facebook.com/cla>

-```{caution}
-Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
-```
-
-## Discussions -> Issues -> Pull Requests
-
-We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
-
-If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
-
-### Issues
+## Issues
 We use GitHub issues to track public bugs. Please ensure your description is
 clear and has sufficient instructions to be able to reproduce the issue.

@ -71,138 +58,14 @@ Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
 disclosure of security bugs. In those cases, please go through the process
 outlined on that page and do not file a public issue.

-### Contributor License Agreement ("CLA")
-In order to accept your pull request, we need you to submit a CLA. You only need
-to do this once to work on any of Meta's open source projects.
+## Coding Style
+* 2 spaces for indentation rather than tabs
+* 80 character line length
+* ...

-Complete your CLA here: <https://code.facebook.com/cla>
+## Tips
+* If you are developing with a llama-stack repository checked out and need your distribution to reflect changes from there, set `LLAMA_STACK_DIR` to that dir when running any of the `llama` CLI commands.

-**I'd like to contribute!**
-
-If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
-leave a comment on the issue and a triager will assign it to you.
-
-Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1–2 issues at a time, especially if you’re still getting familiar with the codebase.
- Before taking an issue, check if it’s already assigned or being actively discussed.
- If you’re blocked or can’t continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
-
-**I have a bug!**
-
-1. Search the issue tracker and discussions for similar issues.
-2. If you don't have steps to reproduce, open a discussion.
-3. If you have steps to reproduce, open an issue.
-
-**I have an idea for a feature!**
-
-1. Open a discussion.
-
-**I've implemented a feature!**
-
-1. If there is an issue for the feature, open a pull request.
-2. If there is no issue, open a discussion and link to your branch.
-
-**I have a question!**
-
-1. Open a discussion or use [Discord](https://discord.gg/llama-stack).
-
-
-**Opening a Pull Request**
-
-1. Fork the repo and create your branch from `main`.
-2. If you've changed APIs, update the documentation.
-3. Ensure the test suite passes.
-4. Make sure your code lints using `pre-commit`.
-5. If you haven't already, complete the Contributor License Agreement ("CLA").
-6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
-7. Ensure your pull request follows the [coding style](#coding-style).
-
-
-Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
-
-```{tip}
-As a general guideline:
- Experienced contributors should try to keep no more than 5 open PRs at a time.
- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
-```
-
-## Repository guidelines
-
-### Coding Style
-
-* Comments should provide meaningful insights into the code. Avoid filler comments that simply
-  describe the next step, as they create unnecessary clutter, same goes for docstrings.
-* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
-  rather than explain what the next line of code does.
-* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
-  `Exception`.
-* Error messages should be prefixed with "Failed to ..."
-* 4 spaces for indentation rather than tab
-* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
-  justification for bypassing the check.
-* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
-  justification for bypassing the check.
-* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
-  readability reasons.
-* Providers configuration class should be Pydantic Field class. It should have a `description` field
-  that describes the configuration. These descriptions will be used to generate the provider
-  documentation.
-* When possible, use keyword arguments only when calling functions.
-* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
-
-### License
+## License
 By contributing to Llama, you agree that your contributions will be licensed
 under the LICENSE file in the root directory of this source tree.
-
-## Common Tasks
-
-Some tips about common tasks you work on while contributing to Llama Stack:
-
-### Using `llama stack build`
-
-Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
-
-Example:
-```bash
-cd work/
-git clone https://github.com/meta-llama/llama-stack.git
-git clone https://github.com/meta-llama/llama-stack-client-python.git
-cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
-```
-
-### Updating distribution configurations
-
-If you have made changes to a provider's configuration in any form (introducing a new config key, or
-changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML
-files as well as the documentation. You should not change `docs/source/.../distributions/` files
-manually as they are auto-generated.
-
-### Updating the provider documentation
-
-If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
-to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
-as they are auto-generated.
-Note that the provider "description" field will be used to generate the provider documentation.
-
-### Building the Documentation
-
-If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
-
-```bash
-# This rebuilds the documentation pages.
-uv run --group docs make -C docs/ html
-
-# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
-uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
-```
-
-### Update API Documentation
-
-If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
-
-```bash
-uv run ./docs/openapi_generator/run_openapi_generator.sh
-```
-
-The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +1,5 @@
-include pyproject.toml
-include llama_stack/models/llama/llama3/tokenizer.model
-include llama_stack/models/llama/llama4/tokenizer.model
-include llama_stack/core/*.sh
+include requirements.txt
+include distributions/dependencies.json
+include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
-include llama_stack/distributions/*/*.yaml
-include llama_stack/providers/tests/test_cases/inference/*.json
-include llama_stack/models/llama/*/*.md
-include llama_stack/tests/integration/*.jpg
+include llama_stack/templates/*/*.yaml
--- a/README.md
+++ b/README.md
@ -1,196 +1,120 @@
+<img src="https://github.com/user-attachments/assets/2fedfe0f-6df7-4441-98b2-87a1fd95ee1c" width="300" title="Llama Stack Logo" alt="Llama Stack Logo"/>
+
 # Llama Stack

 [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
-[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
-[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
-[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
+[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)

-[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html)
+
+This repository contains the Llama Stack API specifications as well as API Providers and Llama Stack Distributions.
+
+The Llama Stack defines and standardizes the building blocks needed to bring generative AI applications to market. These blocks span the entire development lifecycle: from model training and fine-tuning, through product evaluation, to building and running AI agents in production. Beyond definition, we are building providers for the Llama Stack APIs. We are developing open-source versions and partnering with providers, ensuring developers can assemble AI solutions using consistent, interlocking pieces across platforms. The ultimate goal is to accelerate innovation in the AI space.
+
+The Stack APIs are rapidly improving, but are still very much a work in progress, and we invite feedback as well as direct contributions.


-### ✨🎉 Llama 4 Support  🎉✨
-We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
+## APIs

-<details>
+The Llama Stack consists of the following set of APIs:

-<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
+- Inference
+- Safety
+- Memory
+- Agentic System
+- Evaluation
+- Post Training
+- Synthetic Data Generation
+- Reward Scoring

-\
-*Note you need 8xH100 GPU-host to run these models*
-
-```bash
-pip install -U llama_stack
-
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-# get meta url from llama.com
-llama model download --source meta --model-id $MODEL --meta-url <META_URL>
-
-# start a llama stack server
-INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
-
-# install client to interact with the server
-pip install llama-stack-client
-```
-### CLI
-```bash
-# Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-
-llama-stack-client --endpoint http://localhost:8321 \
-inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
-
-ChatCompletionResponse(
-    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
-    logprobs=None,
-    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
-)
-```
-### Python SDK
-```python
-from llama_stack_client import LlamaStackClient
-
-client = LlamaStackClient(base_url=f"http://localhost:8321")
-
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-prompt = "Write a haiku about coding"
-
-print(f"User> {prompt}")
-response = client.inference.chat_completion(
-    model_id=model_id,
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ],
-)
-print(f"Assistant> {response.completion_message.content}")
-```
-As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
+Each of the APIs is itself a collection of REST endpoints.


-</details>
+## API Providers

-### 🚀 One-Line Installer 🚀
+A Provider is what makes the API real -- it provides the actual implementation backing the API.

-To try Llama Stack locally, run:
+As an example, for Inference, we could have the implementation be backed by open source libraries like `[ torch | vLLM | TensorRT ]` as possible options.

-```bash
-curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
-```
+A provider can also be just a pointer to a remote REST service -- for example, cloud providers or dedicated inference providers could serve these APIs.

-### Overview

-Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
+## Llama Stack Distribution

- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
-
-<div style="text-align: center;">
-  <img
-    src="https://github.com/user-attachments/assets/33d9576d-95ea-468d-95e2-8fa233205a50"
-    width="480"
-    title="Llama Stack"
-    alt="Llama Stack"
-  />
-</div>
-
-### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
-
-By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
+A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally, but can choose a cloud provider for a large model. Regardless, the higher level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary while always using the same uniform set of APIs for developing Generative AI applications.

+## Supported Llama Stack Implementations
 ### API Providers
-Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
-Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
-
-| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
-|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
-|    Meta Reference    | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-|      SambaNova       | Hosted | | ✅ | | ✅ | | | | |
-|       Cerebras       | Hosted | | ✅ | | | | | | |
-|      Fireworks       | Hosted | ✅ | ✅ | ✅ | | | | | |
-|     AWS Bedrock      | Hosted | | ✅ | | ✅ | | | | |
-|       Together       | Hosted | ✅ | ✅ | | ✅ | | | | |
-|         Groq         | Hosted | | ✅ | | | | | | |
-|        Ollama        | Single Node | | ✅ | | | | | | |
-|         TGI          | Hosted/Single Node | | ✅ | | | | | | |
-|      NVIDIA NIM      | Hosted/Single Node | | ✅ | | ✅ | | | | |
-|       ChromaDB       | Hosted/Single Node | | | ✅ | | | | | |
-|        Milvus        | Hosted/Single Node | | | ✅ | | | | | |
-|        Qdrant        | Hosted/Single Node | | | ✅ | | | | | |
-|       Weaviate       | Hosted/Single Node | | | ✅ | | | | | |
-|      SQLite-vec      | Single Node | | | ✅ | | | | | |
-|      PG Vector       | Single Node | | | ✅ | | | | | |
-|  PyTorch ExecuTorch  | On-device iOS | ✅ | ✅ | | | | | | |
-|         vLLM         | Single Node | | ✅ | | | | | | |
-|        OpenAI        | Hosted | | ✅ | | | | | | |
-|      Anthropic       | Hosted | | ✅ | | | | | | |
-|        Gemini        | Hosted | | ✅ | | | | | | |
-|       WatsonX        | Hosted | | ✅ | | | | | | |
-|     HuggingFace      | Single Node | | | | | | ✅ | | ✅ |
-|      TorchTune       | Single Node | | | | | | ✅ | | |
-|     NVIDIA NEMO      | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
-|        NVIDIA        | Hosted | | | | | | ✅ | ✅ | ✅ |
-
-> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
+|  **API Provider Builder** |  **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
+| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
+|  Meta Reference  |  Single Node | :heavy_check_mark:  |  :heavy_check_mark:  |  :heavy_check_mark:  |  :heavy_check_mark:  |  :heavy_check_mark:  |
+|  Fireworks  |  Hosted  | :heavy_check_mark:  | :heavy_check_mark:  |  :heavy_check_mark:  |    |   |
+|  AWS Bedrock  |  Hosted  |    |  :heavy_check_mark:  |    | :heavy_check_mark:  | |
+|  Together  |  Hosted  |  :heavy_check_mark:  |  :heavy_check_mark:  |   | :heavy_check_mark:  |  |
+|  Ollama  | Single Node   |    |  :heavy_check_mark:  |    |   |   |
+|  TGI  |  Hosted and Single Node  |    |  :heavy_check_mark:  |    |   |   |
+| Chroma | Single Node |  |  | :heavy_check_mark: |  |  |
+| PG Vector | Single Node |  |  | :heavy_check_mark: |  |  |
+| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark:  | :heavy_check_mark:  |  |  |  |

 ### Distributions

-A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (eg. ollama) and seamlessly transition to production (eg. Fireworks) without changing your application code.
-Here are some of the distributions we support:
-
 | **Distribution** 	|           **Llama Stack Docker**           	| Start This Distribution 	|
-|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
-|                Starter Distribution                 |           [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html)      |
-|                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
-|                   PostgreSQL                  |                [llamastack/distribution-postgres-demo](https://hub.docker.com/repository/docker/llamastack/distribution-postgres-demo/general)                |                  |
+|:----------------:	|:------------------------------------------:	|:-----------------------:	|
+|  Meta Reference  	| [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) 	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)       	|
+|  Meta Reference Quantized  	| [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) 	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html)       	|
+|      Ollama      	|       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)       	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)       	|
+|        TGI       	|         [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general)        	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html)       	|
+|        Together       	|         [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general)        	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html)       	|
+|        Fireworks       	|         [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)        	|       [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html)       	|

-### Documentation
+## Installation

-Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
+You have two ways to install this repository:

-* CLI references
-    * [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
-    * [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
-* Getting Started
-    * [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
+1. **Install as a package**:
+   You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
+   ```bash
+   pip install llama-stack
+   ```
+
+2. **Install from source**:
+   If you prefer to install from the source code, follow these steps:
+   ```bash
+    mkdir -p ~/local
+    cd ~/local
+    git clone git@github.com:meta-llama/llama-stack.git
+
+    conda create -n stack python=3.10
+    conda activate stack
+
+    cd llama-stack
+    $CONDA_PREFIX/bin/pip install -e .
+   ```
+
+## Documentation
+
+Please check out our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
+
+* [CLI reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
+    * Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
+* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
+    * Quick guide to start a Llama Stack server.
    * [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
    * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
-    * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
 * [Contributing](CONTRIBUTING.md)
-    * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.
+    * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) walks through how to add a new API provider.

-### Llama Stack Client SDKs
+## Llama Stack Client SDK

 |  **Language** |  **Client SDK** | **Package** |
 | :----: | :----: | :----: |
 | Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
 | Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
-| Typescript   | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
+| Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
 | Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)

-Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
+Check out our client SDKs for connecting to a Llama Stack server in your preferred language. You can choose from the [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
-
-
-## 🌟 GitHub Star History
-## Star History
-
-[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date)
-
-## ✨ Contributors
-
-Thanks to all of our amazing contributors!
-
-<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
-</a>
--- a/coverage.svg
+++ b/coverage.svg
@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
-    <linearGradient id="b" x2="0" y2="100%">
-        <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
-        <stop offset="1" stop-opacity=".1"/>
-    </linearGradient>
-    <mask id="a">
-        <rect width="99" height="20" rx="3" fill="#fff"/>
-    </mask>
-    <g mask="url(#a)">
-        <path fill="#555" d="M0 0h63v20H0z"/>
-        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
-        <path fill="url(#b)" d="M0 0h99v20H0z"/>
-    </g>
-    <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
-        <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
-        <text x="80" y="14">44%</text>
-    </g>
-</svg>
--- a/distributions/bedrock/build.yaml
+++ b/distributions/bedrock/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/bedrock/build.yaml
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@ -0,0 +1,15 @@
+services:
+  llamastack:
+    image: distribution-bedrock
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-bedrock.yaml
+    ports:
+      - "5000:5000"
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/bedrock/run.yaml
+++ b/distributions/bedrock/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/bedrock/run.yaml
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@ -0,0 +1,50 @@
+services:
+  text-generation-inference:
+    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
+    network_mode: "host"
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    ports:
+      - "5009:5009"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
+      - NUM_SHARD=4
+      - MAX_BATCH_PREFILL_TOKENS=32768
+      - MAX_INPUT_TOKENS=8000
+      - MAX_TOTAL_TOKENS=8192
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            # that's the closest analogue to --gpus; provide
+            # an integer amount of devices or 'all'
+            count: all
+            # Devices are reserved using a list of capabilities, making
+            # capabilities the only required field. A device MUST
+            # satisfy all the requested capabilities for a successful
+            # reservation.
+            capabilities: [gpu]
+    runtime: nvidia
+  # Llama Stack server; started after the TGI service and fed ./run.yaml.
+  llamastack:
+    # NOTE(review): `condition: service_healthy` needs the dependency to report
+    # a health status; no healthcheck is declared for text-generation-inference
+    # in this file - confirm the image ships one.
+    depends_on:
+      text-generation-inference:
+        condition: service_healthy
+    image: llamastack/distribution-tgi
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to TGI run.yaml file
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    # Hack: wait for TGI server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    # restart_policy is not a service-level attribute; it belongs under `deploy`.
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -0,0 +1,44 @@
+version: '2'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: tgi0
+    provider_type: remote::tgi
+    config:
+      url: http://127.0.0.1:80
+  safety:
+  - provider_id: meta0
+    provider_type: inline::llama-guard
+    config:
+      model: Llama-Guard-3-1B
+      excluded_categories: []
+  - provider_id: meta1
+    provider_type: inline::prompt-guard
+    config:
+      model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: inline::faiss
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@ -0,0 +1,315 @@
+{
+  "hf-serverless": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "together": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "together",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "vllm-gpu": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "vllm",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "remote-vllm": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "fireworks": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "tgi": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "bedrock": [
+    "aiosqlite",
+    "blobfile",
+    "boto3",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "meta-reference-gpu": [
+    "accelerate",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "fairscale",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "lm-format-enforcer",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "torch",
+    "torchvision",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "zmq",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "meta-reference-quantized-gpu": [
+    "accelerate",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "fairscale",
+    "faiss-cpu",
+    "fastapi",
+    "fbgemm-gpu",
+    "fire",
+    "httpx",
+    "lm-format-enforcer",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "torch",
+    "torchao==0.5.0",
+    "torchvision",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "zmq",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "ollama": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "ollama",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "hf-endpoint": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ]
+}
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/fireworks/build.yaml
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@ -0,0 +1,16 @@
+services:
+  llamastack:
+    image: llamastack/distribution-fireworks
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-fireworks.yaml
+    ports:
+      - "5000:5000"
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/fireworks/run.yaml
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-gpu/build.yaml
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@ -0,0 +1,34 @@
+services:
+  llamastack:
+    image: llamastack/distribution-meta-reference-gpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            # that's the closest analogue to --gpus; provide
+            # an integer amount of devices or 'all'
+            count: 1
+            # Devices are reserved using a list of capabilities, making
+            # capabilities the only required field. A device MUST
+            # satisfy all the requested capabilities for a successful
+            # reservation.
+            capabilities: [gpu]
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-gpu/run.yaml
--- a/distributions/meta-reference-quantized-gpu/build.yaml
+++ b/distributions/meta-reference-quantized-gpu/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@ -0,0 +1,35 @@
+# Runs the Llama Stack server from the meta-reference-quantized-gpu image with
+# one NVIDIA GPU reserved and ./run.yaml mounted as the server config.
+services:
+  llamastack:
+    image: llamastack/distribution-meta-reference-quantized-gpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    # Keep a single `deploy` mapping: a second `deploy:` key on the same service
+    # is a duplicate YAML key and would override (or reject) the GPU reservation.
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            # that's the closest analogue to --gpus; provide
+            # an integer amount of devices or 'all'
+            count: 1
+            # Devices are reserved using a list of capabilities, making
+            # capabilities the only required field. A device MUST
+            # satisfy all the requested capabilities for a successful
+            # reservation.
+            capabilities: [gpu]
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -0,0 +1,58 @@
+version: '2'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: meta0
+    provider_type: inline::meta-reference-quantized
+    config:
+      model: Llama3.2-3B-Instruct:int4-qlora-eo8
+      quantization:
+        type: int4
+      torch_seed: null
+      max_seq_len: 2048
+      max_batch_size: 1
+  - provider_id: meta1
+    provider_type: inline::meta-reference-quantized
+    config:
+      # not a quantized model !
+      model: Llama-Guard-3-1B
+      quantization: null
+      torch_seed: null
+      max_seq_len: 2048
+      max_batch_size: 1
+  safety:
+  - provider_id: meta0
+    provider_type: inline::llama-guard
+    config:
+      model: Llama-Guard-3-1B
+      excluded_categories: []
+  - provider_id: meta1
+    provider_type: inline::prompt-guard
+    config:
+      model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/ollama/build.yaml
--- a/distributions/ollama/compose.yaml
+++ b/distributions/ollama/compose.yaml
@ -0,0 +1,71 @@
+services:
+  ollama:
+    image: ollama/ollama:latest
+    network_mode: ${NETWORK_MODE:-bridge}
+    volumes:
+      - ~/.ollama:/root/.ollama
+    ports:
+      - "11434:11434"
+    environment:
+      OLLAMA_DEBUG: 1
+    command: []
+    deploy:
+      resources:
+        limits:
+          memory: 8G    # Set maximum memory
+        reservations:
+          memory: 8G    # Set minimum memory reservation
+    # healthcheck:
+    #   # ugh, no CURL in ollama image
+    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
+    #   interval: 10s
+    #   timeout: 5s
+    #   retries: 5
+
+  ollama-init:
+    image: ollama/ollama:latest
+    depends_on:
+      - ollama
+        # condition: service_healthy
+    network_mode: ${NETWORK_MODE:-bridge}
+    environment:
+      - OLLAMA_HOST=ollama
+      - INFERENCE_MODEL=${INFERENCE_MODEL}
+      - SAFETY_MODEL=${SAFETY_MODEL:-}
+    volumes:
+      - ~/.ollama:/root/.ollama
+      - ./pull-models.sh:/pull-models.sh
+    entrypoint: ["/pull-models.sh"]
+
+  llamastack:
+    depends_on:
+      ollama:
+        condition: service_started
+      ollama-init:
+        condition: service_started
+    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
+    network_mode: ${NETWORK_MODE:-bridge}
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to ollama run.yaml file
+      - ~/local/llama-stack/:/app/llama-stack-source
+      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
+    ports:
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL}
+      - SAFETY_MODEL=${SAFETY_MODEL:-}
+      - OLLAMA_URL=http://ollama:11434
+    entrypoint: >
+        python -m llama_stack.distribution.server.server /root/my-run.yaml \
+        --port ${LLAMA_STACK_PORT:-5001}
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 10s
+        max_attempts: 3
+        window: 60s
+volumes:
+  ollama:
+  ollama-init:
+  llamastack:
--- a/distributions/ollama/pull-models.sh
+++ b/distributions/ollama/pull-models.sh
@ -0,0 +1,18 @@
+#!/bin/sh
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Run each configured model once through `ollama run`, stopping at the first
+# failure. Empty or unset variables contribute no loop iteration.
+echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
+# Expansions are deliberately unquoted so that word splitting drops empty values.
+for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}
+do
+  echo "Preloading $model..."
+  # A non-zero exit from ollama aborts the whole script with status 1.
+  ollama run "$model" || {
+    echo "Failed to pull and run $model"
+    exit 1
+  }
+done
+
+echo "All models pulled successfully"
--- a/distributions/ollama/run-with-safety.yaml
+++ b/distributions/ollama/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/ollama/run-with-safety.yaml
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/ollama/run.yaml
--- a/distributions/remote-vllm/build.yaml
+++ b/distributions/remote-vllm/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/build.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -0,0 +1,100 @@
+services:
+  # vLLM OpenAI-compatible server for the inference model.
+  vllm-inference:
+    image: vllm/vllm-openai:latest
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    ports:
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
+    image: vllm/vllm-openai:latest
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    ports:
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_SAFETY_MODEL}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+  # Llama Stack server that talks to the vLLM services declared in this file.
+  llamastack:
+    # Long-form depends_on must be a mapping of service name -> options,
+    # not a list of single-key mappings.
+    depends_on:
+      vllm-inference:
+        condition: service_healthy
+      vllm-${VLLM_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
+    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    environment:
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      - MAX_TOKENS=${MAX_TOKENS:-4096}
+      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+    ports:
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
+    # Hack: wait for vLLM server to start before starting docker
+    # The server port follows LLAMASTACK_PORT so it matches the published port.
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port ${LLAMASTACK_PORT:-5001}"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+volumes:
+  vllm-inference:
+  vllm-safety:
+  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run-with-safety.yaml
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run.yaml
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/tgi/build.yaml
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@ -0,0 +1,103 @@
+services:
+  # Text Generation Inference server for the inference model.
+  tgi-inference:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    ports:
+      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
+      - HF_TOKEN=$HF_TOKEN
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_INFERENCE_PORT:-8080}
+      --cuda-memory-fraction 0.75
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+
+  # Text Generation Inference server for the safety model; the service name
+  # ends in `safety` only when TGI_SAFETY_MODEL is set.
+  tgi-${TGI_SAFETY_MODEL:+safety}:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    ports:
+      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
+      - HF_TOKEN=$HF_TOKEN
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+      --port ${TGI_SAFETY_PORT:-8081}
+      --cuda-memory-fraction 0.75
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+
+  # Llama Stack server; waits for the TGI services declared in this file.
+  llamastack:
+    depends_on:
+      tgi-inference:
+        condition: service_healthy
+      tgi-${TGI_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    image: llamastack/distribution-tgi:test-0.0.52rc3
+    # Docker's default network mode is named `bridge` (`bridged` is not a mode).
+    network_mode: ${NETWORK_MODE:-bridge}
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
+    ports:
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+    # Hack: wait for TGI server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    # restart_policy is not a service-level attribute; it belongs under `deploy`.
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+    environment:
+      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
+      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+
+volumes:
+  tgi-inference:
+  tgi-safety:
+  llamastack:
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run-with-safety.yaml
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run.yaml
--- a/distributions/together/README.md
+++ b/distributions/together/README.md
@ -0,0 +1,65 @@
+# Together Distribution
+
+### Connect to a Llama Stack Together Endpoint
+- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
+
+The `llamastack/distribution-together` distribution consists of the following provider configurations.
+
+
+| **API**         	| **Inference** 	| **Agents**     	| **Memory**                                       	| **Safety**     	| **Telemetry**  	|
+|-----------------	|---------------	|----------------	|--------------------------------------------------	|----------------	|----------------	|
+| **Provider(s)** 	| remote::together   	| meta-reference 	| meta-reference, remote::weaviate 	| meta-reference 	| meta-reference 	|
+
+
+### Docker: Start the Distribution (Single Node CPU)
+
+> [!NOTE]
+> This assumes you have a hosted endpoint at Together with an API key.
+
+```
+$ cd distributions/together
+$ ls
+compose.yaml  run.yaml
+$ docker compose up
+```
+
+Make sure that, in your `run.yaml` file, your inference provider points to the correct Together server endpoint URL. For example:
+```
+inference:
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key: <optional api key>
+```
+
+### Conda llama stack run (Single Node CPU)
+
+```bash
+llama stack build --template together --image-type conda
+# -- modify run.yaml to a valid Together server endpoint
+llama stack run ./run.yaml
+```
+
+### (Optional) Update Model Serving Configuration
+
+Use `llama-stack-client models list` to check the available models served by Together.
+
+```
+$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
+| identifier                   | llama_model                  | provider_id   | metadata   |
+==============================+==============================+===============+============+
+| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0     | {}         |
+------------------------------+------------------------------+---------------+------------+
+```
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/together/build.yaml
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@ -0,0 +1,16 @@
+services:
+  llamastack:
+    image: llamastack/distribution-together
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-together.yaml
+    ports:
+      - "5000:5000"
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/together/run.yaml
--- a/distributions/vllm-gpu/build.yaml
+++ b/distributions/vllm-gpu/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/inline-vllm/build.yaml
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@ -0,0 +1,35 @@
+# Runs the Llama Stack server from the inline-vllm image with one NVIDIA GPU
+# reserved and ./run.yaml mounted as the server config.
+services:
+  llamastack:
+    image: llamastack/distribution-inline-vllm
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    # Keep a single `deploy` mapping: a second `deploy:` key on the same service
+    # is a duplicate YAML key and would override (or reject) the GPU reservation.
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            # that's the closest analogue to --gpus; provide
+            # an integer amount of devices or 'all'
+            count: 1
+            # Devices are reserved using a list of capabilities, making
+            # capabilities the only required field. A device MUST
+            # satisfy all the requested capabilities for a successful
+            # reservation.
+            capabilities: [gpu]
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/vllm-gpu/run.yaml
+++ b/distributions/vllm-gpu/run.yaml
@ -0,0 +1,66 @@
+version: '2'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: inline::vllm
+    config:
+      model: Llama3.2-3B-Instruct
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.4
+      enforce_eager: true
+      max_tokens: 4096
+  - provider_id: vllm-inference-safety  # second inline vLLM engine; serves the Llama Guard model configured below
+    provider_type: inline::vllm
+    config:
+      model: Llama-Guard-3-1B
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.2  # NOTE(review): with 0.4 above, presumably sized so both engines fit on one GPU - confirm
+      enforce_eager: true
+      max_tokens: 4096
+  safety:
+  - provider_id: meta0
+    provider_type: inline::llama-guard
+    config:
+      model: Llama-Guard-3-1B
+      excluded_categories: []
+  # Uncomment to add Prompt Guard as an additional safety provider
+  # - provider_id: meta1
+  #   provider_type: inline::prompt-guard
+  #   config:
+  #     model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
+  # Uncomment to add a remote pgvector memory provider (replace the sample connection values)
+  # - provider_id: pgvector
+  #   provider_type: remote::pgvector
+  #   config:
+  #     host: 127.0.0.1
+  #     port: 5432
+  #     db: postgres
+  #     user: postgres
+  #     password: mysecretpassword
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/agents_store.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
--- a/docs/.gitignore
+++ b/docs/.gitignore
@ -0,0 +1 @@
+src
--- a/docs/README.md
+++ b/docs/README.md
@ -1,19 +0,0 @@
-# Llama Stack Documentation
-
-Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
-
-## Render locally
-
-From the llama-stack root directory, run the following command to render the docs locally:
-```bash
-uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
-```
-You can open up the docs in your browser at http://localhost:8000
-
-## Content
-
-Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
-
-* [Building AI Applications Notebook](./getting_started.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
-* [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
-* [Zero-to-Hero Guide](./zero_to_hero_guide) - Step-by-step guide for getting started with Llama Stack
--- a/docs/_deprecating_soon.ipynb
+++ b/docs/_deprecating_soon.ipynb
@ -0,0 +1,796 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's explore how to have a conversation about images using the Inference API! This section will show you how to:\n",
+    "1. Load and prepare images for the API\n",
+    "2. Send image-based queries\n",
+    "3. Create an interactive chat loop with images\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import asyncio\n",
+    "import base64\n",
+    "import mimetypes\n",
+    "from pathlib import Path\n",
+    "from typing import Optional, Union\n",
+    "\n",
+    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack_client.types import UserMessage\n",
+    "from llama_stack_client.lib.inference.event_logger import EventLogger\n",
+    "from termcolor import cprint\n",
+    "\n",
+    "# Helper function to convert image to data URL\n",
+    "def image_to_data_url(file_path: Union[str, Path]) -> str:\n",
+    "    \"\"\"Convert an image file to a data URL format.\n",
+    "\n",
+    "    Args:\n",
+    "        file_path: Path to the image file\n",
+    "\n",
+    "    Returns:\n",
+    "        str: Data URL containing the encoded image\n",
+    "    \"\"\"\n",
+    "    file_path = Path(file_path)\n",
+    "    if not file_path.exists():\n",
+    "        raise FileNotFoundError(f\"Image not found: {file_path}\")\n",
+    "\n",
+    "    mime_type, _ = mimetypes.guess_type(str(file_path))\n",
+    "    if mime_type is None:\n",
+    "        raise ValueError(\"Could not determine MIME type of the image\")\n",
+    "\n",
+    "    with open(file_path, \"rb\") as image_file:\n",
+    "        encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+    "\n",
+    "    return f\"data:{mime_type};base64,{encoded_string}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Create an Interactive Image Chat\n",
+    "\n",
+    "Let's create a function that enables back-and-forth conversation about an image:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Image, display\n",
+    "import ipywidgets as widgets\n",
+    "\n",
+    "# Display the image we'll be chatting about\n",
+    "image_path = \"your_image.jpg\"  # Replace with your image path\n",
+    "display(Image(filename=image_path))\n",
+    "\n",
+    "# Initialize the client\n",
+    "client = LlamaStackClient(\n",
+    "    base_url=f\"http://localhost:8000\",  # Adjust host/port as needed\n",
+    ")\n",
+    "\n",
+    "# Create chat interface\n",
+    "output = widgets.Output()\n",
+    "text_input = widgets.Text(\n",
+    "    value='',\n",
+    "    placeholder='Type your question about the image...',\n",
+    "    description='Ask:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "# Display interface\n",
+    "display(text_input, output)\n",
+    "\n",
+    "# Handle chat interaction\n",
+    "async def on_submit(change):\n",
+    "    with output:\n",
+    "        question = text_input.value\n",
+    "        if question.lower() == 'exit':\n",
+    "            print(\"Chat ended.\")\n",
+    "            return\n",
+    "\n",
+    "        message = UserMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"image\": {\"uri\": image_to_data_url(image_path)}},\n",
+    "                question,\n",
+    "            ],\n",
+    "        )\n",
+    "\n",
+    "        print(f\"\\nUser> {question}\")\n",
+    "        response = client.inference.chat_completion(\n",
+    "            messages=[message],\n",
+    "            model=\"Llama3.2-11B-Vision-Instruct\",\n",
+    "            stream=True,\n",
+    "        )\n",
+    "\n",
+    "        print(\"Assistant> \", end='')\n",
+    "        async for log in EventLogger().log(response):\n",
+    "            log.print()\n",
+    "\n",
+    "        text_input.value = ''  # Clear input after sending\n",
+    "\n",
+    "text_input.on_submit(lambda x: asyncio.create_task(on_submit(x)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tool Calling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this section, we'll explore how to enhance your applications with tool calling capabilities. We'll cover:\n",
+    "1. Setting up and using the Brave Search API\n",
+    "2. Creating custom tools\n",
+    "3. Configuring tool prompts and safety settings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import asyncio\n",
+    "import os\n",
+    "from typing import Dict, List, Optional\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack_client.lib.agents.agent import Agent\n",
+    "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
+    "from llama_stack_client.types.agent_create_params import (\n",
+    "    AgentConfig,\n",
+    "    AgentConfigToolSearchToolDefinition,\n",
+    ")\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Helper function to create an agent with tools\n",
+    "async def create_tool_agent(\n",
+    "    client: LlamaStackClient,\n",
+    "    tools: List[Dict],\n",
+    "    instructions: str = \"You are a helpful assistant\",\n",
+    "    model: str = \"Llama3.1-8B-Instruct\",\n",
+    ") -> Agent:\n",
+    "    \"\"\"Create an agent with specified tools.\"\"\"\n",
+    "    agent_config = AgentConfig(\n",
+    "        model=model,\n",
+    "        instructions=instructions,\n",
+    "        sampling_params={\n",
+    "            \"strategy\": \"greedy\",\n",
+    "            \"temperature\": 1.0,\n",
+    "            \"top_p\": 0.9,\n",
+    "        },\n",
+    "        tools=tools,\n",
+    "        tool_choice=\"auto\",\n",
+    "        tool_prompt_format=\"json\",\n",
+    "        input_shields=[\"Llama-Guard-3-1B\"],\n",
+    "        output_shields=[\"Llama-Guard-3-1B\"],\n",
+    "        enable_session_persistence=True,\n",
+    "    )\n",
+    "\n",
+    "    return Agent(client, agent_config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, create a `.env` file in your notebook directory with your Brave Search API key:\n",
+    "\n",
+    "```\n",
+    "BRAVE_SEARCH_API_KEY=your_key_here\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def create_search_agent(client: LlamaStackClient) -> Agent:\n",
+    "    \"\"\"Create an agent with Brave Search capability.\"\"\"\n",
+    "    search_tool = AgentConfigToolSearchToolDefinition(\n",
+    "        type=\"brave_search\",\n",
+    "        engine=\"brave\",\n",
+    "        api_key=os.getenv(\"BRAVE_SEARCH_API_KEY\"),\n",
+    "    )\n",
+    "\n",
+    "    return await create_tool_agent(\n",
+    "        client=client,\n",
+    "        tools=[search_tool],\n",
+    "        instructions=\"\"\"\n",
+    "        You are a research assistant that can search the web.\n",
+    "        Always cite your sources with URLs when providing information.\n",
+    "        Format your responses as:\n",
+    "\n",
+    "        FINDINGS:\n",
+    "        [Your summary here]\n",
+    "\n",
+    "        SOURCES:\n",
+    "        - [Source title](URL)\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "# Example usage\n",
+    "async def search_example():\n",
+    "    client = LlamaStackClient(base_url=\"http://localhost:8000\")\n",
+    "    agent = await create_search_agent(client)\n",
+    "\n",
+    "    # Create a session\n",
+    "    session_id = agent.create_session(\"search-session\")\n",
+    "\n",
+    "    # Example queries\n",
+    "    queries = [\n",
+    "        \"What are the latest developments in quantum computing?\",\n",
+    "        \"Who won the most recent Super Bowl?\",\n",
+    "    ]\n",
+    "\n",
+    "    for query in queries:\n",
+    "        print(f\"\\nQuery: {query}\")\n",
+    "        print(\"-\" * 50)\n",
+    "\n",
+    "        response = agent.create_turn(\n",
+    "            messages=[{\"role\": \"user\", \"content\": query}],\n",
+    "            session_id=session_id,\n",
+    "        )\n",
+    "\n",
+    "        async for log in EventLogger().log(response):\n",
+    "            log.print()\n",
+    "\n",
+    "# Run the example (Jupyter supports top-level await; in a plain script use asyncio.run(search_example()))\n",
+    "await search_example()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Custom Tool Creation\n",
+    "\n",
+    "Let's create a custom weather tool:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import TypedDict, Optional\n",
+    "from datetime import datetime\n",
+    "\n",
+    "# Define tool types\n",
+    "class WeatherInput(TypedDict):\n",
+    "    location: str\n",
+    "    date: Optional[str]\n",
+    "\n",
+    "class WeatherOutput(TypedDict):\n",
+    "    temperature: float\n",
+    "    conditions: str\n",
+    "    humidity: float\n",
+    "\n",
+    "class WeatherTool:\n",
+    "    \"\"\"Example custom tool for weather information.\"\"\"\n",
+    "\n",
+    "    def __init__(self, api_key: Optional[str] = None):\n",
+    "        self.api_key = api_key\n",
+    "\n",
+    "    async def get_weather(self, location: str, date: Optional[str] = None) -> WeatherOutput:\n",
+    "        \"\"\"Simulate getting weather data (replace with actual API call).\"\"\"\n",
+    "        # Mock implementation\n",
+    "        return {\n",
+    "            \"temperature\": 72.5,\n",
+    "            \"conditions\": \"partly cloudy\",\n",
+    "            \"humidity\": 65.0\n",
+    "        }\n",
+    "\n",
+    "    async def __call__(self, input_data: WeatherInput) -> WeatherOutput:\n",
+    "        \"\"\"Make the tool callable with structured input.\"\"\"\n",
+    "        return await self.get_weather(\n",
+    "            location=input_data[\"location\"],\n",
+    "            date=input_data.get(\"date\")\n",
+    "        )\n",
+    "\n",
+    "async def create_weather_agent(client: LlamaStackClient) -> Agent:\n",
+    "    \"\"\"Create an agent with weather tool capability.\"\"\"\n",
+    "    weather_tool = {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_weather\",\n",
+    "            \"description\": \"Get weather information for a location\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"location\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"City or location name\"\n",
+    "                    },\n",
+    "                    \"date\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"Optional date (YYYY-MM-DD)\",\n",
+    "                        \"format\": \"date\"\n",
+    "                    }\n",
+    "                },\n",
+    "                \"required\": [\"location\"]\n",
+    "            }\n",
+    "        },\n",
+    "        \"implementation\": WeatherTool()\n",
+    "    }\n",
+    "\n",
+    "    return await create_tool_agent(\n",
+    "        client=client,\n",
+    "        tools=[weather_tool],\n",
+    "        instructions=\"\"\"\n",
+    "        You are a weather assistant that can provide weather information.\n",
+    "        Always specify the location clearly in your responses.\n",
+    "        Include both temperature and conditions in your summaries.\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "# Example usage\n",
+    "async def weather_example():\n",
+    "    client = LlamaStackClient(base_url=\"http://localhost:8000\")\n",
+    "    agent = await create_weather_agent(client)\n",
+    "\n",
+    "    session_id = agent.create_session(\"weather-session\")\n",
+    "\n",
+    "    queries = [\n",
+    "        \"What's the weather like in San Francisco?\",\n",
+    "        \"Tell me the weather in Tokyo tomorrow\",\n",
+    "    ]\n",
+    "\n",
+    "    for query in queries:\n",
+    "        print(f\"\\nQuery: {query}\")\n",
+    "        print(\"-\" * 50)\n",
+    "\n",
+    "        response = agent.create_turn(\n",
+    "            messages=[{\"role\": \"user\", \"content\": query}],\n",
+    "            session_id=session_id,\n",
+    "        )\n",
+    "\n",
+    "        async for log in EventLogger().log(response):\n",
+    "            log.print()\n",
+    "\n",
+    "# Run the example\n",
+    "await weather_example()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi-Tool Agent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def create_multi_tool_agent(client: LlamaStackClient) -> Agent:\n",
+    "    \"\"\"Create an agent with multiple tools.\"\"\"\n",
+    "    tools = [\n",
+    "        # Brave Search tool\n",
+    "        AgentConfigToolSearchToolDefinition(\n",
+    "            type=\"brave_search\",\n",
+    "            engine=\"brave\",\n",
+    "            api_key=os.getenv(\"BRAVE_SEARCH_API_KEY\"),\n",
+    "        ),\n",
+    "        # Weather tool\n",
+    "        {\n",
+    "            \"type\": \"function\",\n",
+    "            \"function\": {\n",
+    "                \"name\": \"get_weather\",\n",
+    "                \"description\": \"Get weather information for a location\",\n",
+    "                \"parameters\": {\n",
+    "                    \"type\": \"object\",\n",
+    "                    \"properties\": {\n",
+    "                        \"location\": {\"type\": \"string\"},\n",
+    "                        \"date\": {\"type\": \"string\", \"format\": \"date\"}\n",
+    "                    },\n",
+    "                    \"required\": [\"location\"]\n",
+    "                }\n",
+    "            },\n",
+    "            \"implementation\": WeatherTool()\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    return await create_tool_agent(\n",
+    "        client=client,\n",
+    "        tools=tools,\n",
+    "        instructions=\"\"\"\n",
+    "        You are an assistant that can search the web and check weather information.\n",
+    "        Use the appropriate tool based on the user's question.\n",
+    "        For weather queries, always specify location and conditions.\n",
+    "        For web searches, always cite your sources.\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "# Interactive example with multi-tool agent\n",
+    "async def interactive_multi_tool():\n",
+    "    client = LlamaStackClient(base_url=\"http://localhost:8000\")\n",
+    "    agent = await create_multi_tool_agent(client)\n",
+    "    session_id = agent.create_session(\"interactive-session\")\n",
+    "\n",
+    "    print(\"🤖 Multi-tool Agent Ready! (type 'exit' to quit)\")\n",
+    "    print(\"Example questions:\")\n",
+    "    print(\"- What's the weather in Paris and what events are happening there?\")\n",
+    "    print(\"- Tell me about recent space discoveries and the weather on Mars\")\n",
+    "\n",
+    "    while True:\n",
+    "        query = input(\"\\nYour question: \")\n",
+    "        if query.lower() == 'exit':\n",
+    "            break\n",
+    "\n",
+    "        print(\"\\nThinking...\")\n",
+    "        try:\n",
+    "            response = agent.create_turn(\n",
+    "                messages=[{\"role\": \"user\", \"content\": query}],\n",
+    "                session_id=session_id,\n",
+    "            )\n",
+    "\n",
+    "            async for log in EventLogger().log(response):\n",
+    "                log.print()\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error: {e}\")\n",
+    "\n",
+    "# Run interactive example\n",
+    "await interactive_multi_tool()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Memory "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting Started with Memory API Tutorial 🚀\n",
+    "Welcome! This interactive tutorial will guide you through using the Memory API, a powerful tool for document storage and retrieval. Whether you're new to vector databases or an experienced developer, this notebook will help you understand the basics and get up and running quickly.\n",
+    "#### What you'll learn:\n",
+    "\n",
+    "- How to set up and configure the Memory API client\n",
+    "- Creating and managing memory banks (vector stores)\n",
+    "- Different ways to insert documents into the system\n",
+    "- How to perform intelligent queries on your documents\n",
+    "\n",
+    "#### Prerequisites:\n",
+    "\n",
+    "- Basic Python knowledge\n",
+    "- A running instance of the Memory API server (we'll use localhost in this tutorial)\n",
+    "\n",
+    "Let's start by installing the required packages:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the client library and a helper package for colored output\n",
+    "!pip install llama-stack-client termcolor\n",
+    "\n",
+    "# 💡 Note: If you're running this in a new environment, you might need to restart\n",
+    "# your kernel after installation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Initial Setup\n",
+    "First, we'll import the necessary libraries and set up some helper functions. Let's break down what each import does:\n",
+    "\n",
+    "- **llama_stack_client**: Our main interface to the Memory API\n",
+    "- **base64**: Helps us encode files for transmission\n",
+    "- **mimetypes**: Determines file types automatically\n",
+    "- **termcolor**: Makes our output prettier with colors\n",
+    "\n",
+    "❓ Question: Why do we need to convert files to data URLs?\n",
+    "Answer: Data URLs allow us to embed file contents directly in our requests, making it easier to transmit files to the API without needing separate file uploads."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "import json\n",
+    "import mimetypes\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack_client.types.memory_insert_params import Document\n",
+    "from termcolor import cprint\n",
+    "\n",
+    "# Helper function to convert files to data URLs\n",
+    "def data_url_from_file(file_path: str) -> str:\n",
+    "    \"\"\"Convert a file to a data URL for API transmission\n",
+    "\n",
+    "    Args:\n",
+    "        file_path (str): Path to the file to convert\n",
+    "\n",
+    "    Returns:\n",
+    "        str: Data URL containing the file's contents\n",
+    "\n",
+    "    Example:\n",
+    "        >>> url = data_url_from_file('example.txt')\n",
+    "        >>> print(url[:30])  # Preview the start of the URL\n",
+    "        'data:text/plain;base64,SGVsbG8='\n",
+    "    \"\"\"\n",
+    "    if not os.path.exists(file_path):\n",
+    "        raise FileNotFoundError(f\"File not found: {file_path}\")\n",
+    "\n",
+    "    with open(file_path, \"rb\") as file:\n",
+    "        file_content = file.read()\n",
+    "\n",
+    "    base64_content = base64.b64encode(file_content).decode(\"utf-8\")\n",
+    "    mime_type, _ = mimetypes.guess_type(file_path)\n",
+    "\n",
+    "    data_url = f\"data:{mime_type};base64,{base64_content}\"\n",
+    "    return data_url"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Initialize Client and Create Memory Bank\n",
+    "Now we'll set up our connection to the Memory API and create our first memory bank. A memory bank is like a specialized database that stores document embeddings for semantic search.\n",
+    "#### ❓ Key Concepts:\n",
+    "\n",
+    "- **embedding_model**: The model used to convert text into vector representations\n",
+    "- **chunk_size**: How large each piece of text should be when splitting documents\n",
+    "- **overlap_size**: How much overlap between chunks (helps maintain context)\n",
+    "\n",
+    "✨ Pro Tip: Choose your chunk size based on your use case. Smaller chunks (256-512 tokens) are better for precise retrieval, while larger chunks (1024+ tokens) maintain more context."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configure connection parameters\n",
+    "HOST = \"localhost\"  # Replace with your host if using a remote server\n",
+    "PORT = 8000        # Replace with your port if different\n",
+    "\n",
+    "# Initialize client\n",
+    "client = LlamaStackClient(\n",
+    "    base_url=f\"http://{HOST}:{PORT}\",\n",
+    ")\n",
+    "\n",
+    "# Let's see what providers are available\n",
+    "# Providers determine where and how your data is stored\n",
+    "providers = client.providers.list()\n",
+    "print(\"Available providers:\")\n",
+    "print(json.dumps(providers, indent=2))\n",
+    "\n",
+    "# Create a memory bank with optimized settings for general use\n",
+    "client.memory_banks.register(\n",
+    "    memory_bank={\n",
+    "        \"identifier\": \"tutorial_bank\",  # A unique name for your memory bank\n",
+    "        \"embedding_model\": \"all-MiniLM-L6-v2\",  # A lightweight but effective model\n",
+    "        \"chunk_size_in_tokens\": 512,  # Good balance between precision and context\n",
+    "        \"overlap_size_in_tokens\": 64,  # Helps maintain context between chunks\n",
+    "        \"provider_id\": providers[\"memory\"][0].provider_id,  # Use the first available provider\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# Let's verify our memory bank was created\n",
+    "memory_banks = client.memory_banks.list()\n",
+    "print(\"\\nRegistered memory banks:\")\n",
+    "print(json.dumps(memory_banks, indent=2))\n",
+    "\n",
+    "# 🎯 Exercise: Try creating another memory bank with different settings!\n",
+    "# What happens if you try to create a bank with the same identifier?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Insert Documents\n",
+    "The Memory API supports multiple ways to add documents. We'll demonstrate two common approaches:\n",
+    "\n",
+    "- Loading documents from URLs\n",
+    "- Loading documents from local files\n",
+    "\n",
+    "#### ❓ Important Concepts:\n",
+    "\n",
+    "- Each document needs a unique document_id\n",
+    "- Metadata helps organize and filter documents later\n",
+    "- The API automatically processes and chunks documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example URLs to documentation\n",
+    "# 💡 Replace these with your own URLs or use the examples\n",
+    "urls = [\n",
+    "    \"memory_optimizations.rst\",\n",
+    "    \"chat.rst\",\n",
+    "    \"llama3.rst\",\n",
+    "]\n",
+    "\n",
+    "# Create documents from URLs\n",
+    "# We add metadata to help organize our documents\n",
+    "url_documents = [\n",
+    "    Document(\n",
+    "        document_id=f\"url-doc-{i}\",  # Unique ID for each document\n",
+    "        content=f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
+    "        mime_type=\"text/plain\",\n",
+    "        metadata={\"source\": \"url\", \"filename\": url},  # Metadata helps with organization\n",
+    "    )\n",
+    "    for i, url in enumerate(urls)\n",
+    "]\n",
+    "\n",
+    "# Example with local files\n",
+    "# 💡 Replace these with your actual files\n",
+    "local_files = [\"example.txt\", \"readme.md\"]\n",
+    "file_documents = [\n",
+    "    Document(\n",
+    "        document_id=f\"file-doc-{i}\",\n",
+    "        content=data_url_from_file(path),\n",
+    "        metadata={\"source\": \"local\", \"filename\": path},\n",
+    "    )\n",
+    "    for i, path in enumerate(local_files)\n",
+    "    if os.path.exists(path)\n",
+    "]\n",
+    "\n",
+    "# Combine all documents\n",
+    "all_documents = url_documents + file_documents\n",
+    "\n",
+    "# Insert documents into memory bank\n",
+    "response = client.memory.insert(\n",
+    "    bank_id=\"tutorial_bank\",\n",
+    "    documents=all_documents,\n",
+    ")\n",
+    "\n",
+    "print(\"Documents inserted successfully!\")\n",
+    "\n",
+    "# 🎯 Exercise: Try adding your own documents!\n",
+    "# - What happens if you try to insert a document with an existing ID?\n",
+    "# - What other metadata might be useful to add?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4. Query the Memory Bank\n",
+    "Now for the exciting part - querying our documents! The Memory API uses semantic search to find relevant content based on meaning, not just keywords.\n",
+    "#### ❓ Understanding Scores:\n",
+    "\n",
+    "- Scores range from 0 to 1, with 1 being the most relevant\n",
+    "- Generally, scores above 0.7 indicate strong relevance\n",
+    "- Consider your use case when deciding on score thresholds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_query_results(query: str):\n",
+    "    \"\"\"Helper function to print query results in a readable format\n",
+    "\n",
+    "    Args:\n",
+    "        query (str): The search query to execute\n",
+    "    \"\"\"\n",
+    "    print(f\"\\nQuery: {query}\")\n",
+    "    print(\"-\" * 50)\n",
+    "\n",
+    "    response = client.memory.query(\n",
+    "        bank_id=\"tutorial_bank\",\n",
+    "        query=[query],  # The API accepts multiple queries at once!\n",
+    "    )\n",
+    "\n",
+    "    for i, (chunk, score) in enumerate(zip(response.chunks, response.scores)):\n",
+    "        print(f\"\\nResult {i+1} (Score: {score:.3f})\")\n",
+    "        print(\"=\" * 40)\n",
+    "        print(chunk)\n",
+    "        print(\"=\" * 40)\n",
+    "\n",
+    "# Let's try some example queries\n",
+    "queries = [\n",
+    "    \"How do I use LoRA?\",  # Technical question\n",
+    "    \"Tell me about memory optimizations\",  # General topic\n",
+    "    \"What are the key features of Llama 3?\"  # Product-specific\n",
+    "]\n",
+    "\n",
+    "for query in queries:\n",
+    "    print_query_results(query)\n",
+    "\n",
+    "# 🎯 Exercises:\n",
+    "# 1. Try writing your own queries! What works well? What doesn't?\n",
+    "# 2. How do different phrasings of the same question affect results?\n",
+    "# 3. What happens if you query for content that isn't in your documents?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5. Advanced Usage: Query with Metadata Filtering\n",
+    "One powerful feature is the ability to filter results based on metadata. This helps when you want to search within specific subsets of your documents.\n",
+    "#### ❓ Use Cases for Metadata Filtering:\n",
+    "\n",
+    "- Search within specific document types\n",
+    "- Filter by date ranges\n",
+    "- Limit results to certain authors or sources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query with metadata filter\n",
+    "response = client.memory.query(\n",
+    "    bank_id=\"tutorial_bank\",\n",
+    "    query=[\"Tell me about optimization\"],\n",
+    "    metadata_filter={\"source\": \"url\"}  # Only search in URL documents\n",
+    ")\n",
+    "\n",
+    "print(\"\\nFiltered Query Results:\")\n",
+    "print(\"-\" * 50)\n",
+    "for chunk, score in zip(response.chunks, response.scores):\n",
+    "    print(f\"Score: {score:.3f}\")\n",
+    "    print(f\"Chunk:\\n{chunk}\\n\")\n",
+    "\n",
+    "# 🎯 Advanced Exercises:\n",
+    "# 1. Try combining multiple metadata filters\n",
+    "# 2. Compare results with and without filters\n",
+    "# 3. What happens with non-existent metadata fields?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/_static/css/my_theme.css
+++ b/docs/_static/css/my_theme.css
@ -4,32 +4,6 @@
    max-width: 90%;
 }

-.wy-nav-side {
-    /* background: linear-gradient(45deg, #2980B9, #16A085); */
-    background: linear-gradient(90deg, #332735, #1b263c);
+.wy-side-nav-search, .wy-nav-top {
+    background: #666666;
 }
-
-.wy-side-nav-search {
-    background-color: transparent !important;
-}
-
-.hide-title h1 {
-    display: none;
-}
-
-h2, h3, h4 {
-    font-weight: normal;
-}
-html[data-theme="dark"] .rst-content div[class^="highlight"] {
-  background-color: #0b0b0b;
-}
-pre {
-    white-space: pre-wrap !important;
-    word-break: break-all;
-}
-
-[data-theme="dark"] .mermaid {
-    background-color: #f4f4f6 !important;
-    border-radius: 6px;
-    padding: 0.5em;
-  }
--- a/docs/_static/js/detect_theme.js
+++ b/docs/_static/js/detect_theme.js
@ -1,32 +0,0 @@
-document.addEventListener("DOMContentLoaded", function () {
-  const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
-  const htmlElement = document.documentElement;
-
-  // Check if theme is saved in localStorage
-  const savedTheme = localStorage.getItem("sphinx-rtd-theme");
-
-  if (savedTheme) {
-    // Use the saved theme preference
-    htmlElement.setAttribute("data-theme", savedTheme);
-    document.body.classList.toggle("dark", savedTheme === "dark");
-  } else {
-    // Fall back to system preference
-    const theme = prefersDark ? "dark" : "light";
-    htmlElement.setAttribute("data-theme", theme);
-    document.body.classList.toggle("dark", theme === "dark");
-    // Save initial preference
-    localStorage.setItem("sphinx-rtd-theme", theme);
-  }
-
-  // Listen for theme changes from the existing toggle
-  const observer = new MutationObserver(function(mutations) {
-    mutations.forEach(function(mutation) {
-      if (mutation.attributeName === "data-theme") {
-        const currentTheme = htmlElement.getAttribute("data-theme");
-        localStorage.setItem("sphinx-rtd-theme", currentTheme);
-      }
-    });
-  });
-
-  observer.observe(htmlElement, { attributes: true });
-});
--- a/docs/_static/js/keyboard_shortcuts.js
+++ b/docs/_static/js/keyboard_shortcuts.js
@ -1,14 +0,0 @@
-document.addEventListener('keydown', function(event) {
-  // command+K or ctrl+K
-  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-
-  // forward slash
-  if (event.key === '/' &&
-      !event.target.matches('input, textarea, select')) {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-});
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/_static/llama-stack.png
+++ b/docs/_static/llama-stack.png
--- a/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
--- a/docs/conftest.py
+++ b/docs/conftest.py
@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-import time
-
-
-def pytest_collection_modifyitems(items):
-    for item in items:
-        item.name = item.name.replace(' ', '_') 
-
-
-def pytest_runtest_teardown(item):
-    interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
-    if interval_seconds:
-        time.sleep(float(interval_seconds))
-
-
-def pytest_configure(config):
-    config.option.tbstyle = "short"
-    config.option.disable_warnings = True
--- a/docs/contbuild.sh
+++ b/docs/contbuild.sh
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-sphinx-autobuild --write-all source build/html --watch source/
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
--- a/Show more
+++ b/Show more
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/bedrock/build.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/bedrock/run.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/fireworks/build.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/fireworks/run.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/meta-reference-gpu/build.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/ollama/build.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/ollama/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/remote-vllm/build.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/tgi/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/together/build.yaml`