build: Bump version to 0.2.13

Release candidate 0.2.13rc2
2025-08-21 09:23:13 +00:00 · 2025-06-27 23:55:31 +00:00 · 2025-06-27 23:12:35 +00:00
1108 changed files with 40171 additions and 102182 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -4,9 +4,3 @@ omit =
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*
-    */llama_stack/cli/scripts/*
-    */llama_stack/ui/*
-    */llama_stack/distribution/ui/*
-    */llama_stack/strong_typing/*
-    */llama_stack/env.py
-    */__init__.py
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist
--- a/.github/ISSUE_TEMPLATE/tech-debt.yml
+++ b/.github/ISSUE_TEMPLATE/tech-debt.yml
@ -1,30 +0,0 @@
-name: 🔧 Tech Debt
-description: Something that is functional but should be improved or optimizied
-labels: ["tech-debt"]
-body:
- type: textarea
-  id: tech-debt-explanation
-  attributes:
-    label: 🤔 What is the technical debt you think should be addressed?
-    description: >
-      A clear and concise description of _what_ needs to be addressed - ensure you are describing
-      constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is not a bug
-      or feature request.
-  validations:
-    required: true
-
- type: textarea
-  id: tech-debt-motivation
-  attributes:
-    label: 💡 What is the benefit of addressing this technical debt?
-    description: >
-      A clear and concise description of _why_ this work is needed.
-  validations:
-    required: true
-
- type: textarea
-  id: other-thoughts
-  attributes:
-    label: Other thoughts
-    description: >
-      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/TRIAGERS.md
+++ b/.github/TRIAGERS.md
@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
- @franciscojavierarceo
+ @bbrowning @franciscojavierarceo @leseb
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@ -1,88 +0,0 @@
-name: 'Run and Record Tests'
-description: 'Run integration tests and handle recording/artifact upload'
-
-inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
-  stack-config:
-    description: 'Stack configuration to use'
-    required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Check Storage and Memory Available Before Tests
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        free -h
-        df -h
-
-    - name: Run Integration Tests
-      shell: bash
-      run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
-
-
-    - name: Commit and push recordings
-      if: ${{ inputs.inference-mode == 'record' }}
-      shell: bash
-      run: |
-        echo "Checking for recording changes"
-        git status --porcelain tests/integration/recordings/
-
-        if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
-          echo "New recordings detected, committing and pushing"
-          git add tests/integration/recordings/
-
-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
-
-          git fetch origin ${{ github.ref_name }}
-          git rebase origin/${{ github.ref_name }}
-          echo "Rebased successfully"
-          git push origin HEAD:${{ github.ref_name }}
-          echo "Pushed successfully"
-        else
-          echo "No recording changes"
-        fi
-
-    - name: Write inference logs to file
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
-
-    - name: Upload logs
-      if: ${{ always() }}
-      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-      with:
-        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
-        path: |
-          *.log
-        retention-days: 1
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -1,23 +1,9 @@
 name: Setup Ollama
 description: Start Ollama
-inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
-    required: false
-    default: 'false'
 runs:
  using: "composite"
  steps:
    - name: Start Ollama
      shell: bash
      run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-          image="ollama-with-vision-model"
-        else
-          image="ollama-with-models"
-        fi
-
-        echo "Starting Ollama with image: $image"
-        docker run -d --name ollama -p 11434:11434 docker.io/llamastack/$image
-        echo "Verifying Ollama status..."
-        timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -5,10 +5,6 @@ inputs:
    description: The Python version to use
    required: false
    default: "3.12"
-  client-version:
-    description: The llama-stack-client-python version to test against (latest or published)
-    required: false
-    default: "latest"
 runs:
  using: "composite"
  steps:
@ -16,28 +12,16 @@ runs:
      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
      with:
        python-version: ${{ inputs.python-version }}
+        activate-environment: true
        version: 0.7.6

    - name: Install dependencies
      shell: bash
      run: |
-        echo "Updating project dependencies via uv sync"
        uv sync --all-groups
-
-        echo "Installing ad-hoc dependencies"
-        uv pip install faiss-cpu
-
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          uv pip install llama-stack-client
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Installed llama packages"
-        uv pip list | grep llama
+        uv pip install ollama faiss-cpu
+        # always test against the latest version of the client
+        # TODO: this is not necessarily a good idea. we need to test against both published and latest
+        # to find out backwards compatibility issues.
+        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+        uv pip install -e .
--- a/.github/actions/setup-test-environment/action.yml
+++ b/.github/actions/setup-test-environment/action.yml
@ -1,66 +0,0 @@
-name: 'Setup Test Environment'
-description: 'Common setup steps for integration tests including dependencies, providers, and build'
-
-inputs:
-  python-version:
-    description: 'Python version to use'
-    required: true
-  client-version:
-    description: 'Client version (latest or published)'
-    required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
-    required: false
-    default: 'false'
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install dependencies
-      uses: ./.github/actions/setup-runner
-      with:
-        python-version: ${{ inputs.python-version }}
-        client-version: ${{ inputs.client-version }}
-
-    - name: Setup ollama
-      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-ollama
-      with:
-        run-vision-tests: ${{ inputs.run-vision-tests }}
-
-    - name: Setup vllm
-      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-vllm
-
-    - name: Build Llama Stack
-      shell: bash
-      run: |
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          unset LLAMA_STACK_CLIENT_DIR
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Building Llama Stack"
-
-        LLAMA_STACK_DIR=. \
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
-
-    - name: Configure git for commits
-      shell: bash
-      run: |
-        git config --local user.email "github-actions[bot]@users.noreply.github.com"
-        git config --local user.name "github-actions[bot]"
--- a/.github/actions/setup-vllm/action.yml
+++ b/.github/actions/setup-vllm/action.yml
@ -1,27 +0,0 @@
-name: Setup VLLM
-description: Start VLLM
-runs:
-  using: "composite"
-  steps:
-    - name: Start VLLM
-      shell: bash
-      run: |
-        # Start vllm container
-        docker run -d \
-          --name vllm \
-          -p 8000:8000 \
-          --privileged=true \
-          quay.io/higginsd/vllm-cpu:65393ee064 \
-          --host 0.0.0.0 \
-          --port 8000 \
-          --enable-auto-tool-choice \
-          --tool-call-parser llama3_json \
-          --model /root/.cache/Llama-3.2-1B-Instruct \
-          --served-model-name meta-llama/Llama-3.2-1B-Instruct
-
-          # Wait for vllm to be ready
-          echo "Waiting for vllm to be ready..."
-          timeout 900 bash -c 'until curl -f http://localhost:8000/health; do
-            echo "Waiting for vllm..."
-            sleep 5
-          done'
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -9,25 +9,15 @@ updates:
      day: "saturday"
    commit-message:
      prefix: chore(github-deps)
-
  - package-ecosystem: "uv"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "saturday"
+    # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
+    open-pull-requests-limit: 0
    labels:
      - type/dependencies
      - python
    commit-message:
      prefix: chore(python-deps)
-
-  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - javascript
-    commit-message:
-      prefix: chore(ui-deps)
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -1,23 +0,0 @@
-# Llama Stack CI
-
-Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table detailing what CI the project includes and the purpose.
-
-| Name | File | Purpose |
-| ---- | ---- | ------- |
-| Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
-| Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
-| Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
-| SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
-| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
-| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
-| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
-| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
-| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
-| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
-| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
-| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
-| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
-| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
-| Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
-| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@ -1,7 +1,5 @@
 name: Update Changelog

-run-name: Creates PR for updating the CHANGELOG.md
-
 on:
  release:
    types: [published, unpublished, created, edited, deleted, released]
@ -17,7 +15,7 @@ jobs:
      pull-requests: write  # for peter-evans/create-pull-request to create a PR
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: main
          fetch-depth: 0
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@ -0,0 +1,355 @@
+name: "Run Llama-stack Tests"
+
+on:
+  #### Temporarily disable PR runs until tests run as intended within mainline.
+  #TODO Add this back.
+  #pull_request_target:
+  #  types: ["opened"]
+  #  branches:
+  #    - 'main'
+  #  paths:
+  #    - 'llama_stack/**/*.py'
+  #    - 'tests/**/*.py'
+
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: 'GHA Runner Scale Set label to run workflow on.'
+        required: true
+        default: "llama-stack-gha-runner-gpu"
+
+      checkout_reference:
+        description: "The branch, tag, or SHA to checkout"
+        required: true
+        default: "main"
+
+      debug:
+        description: 'Run debugging steps?'
+        required: false
+        default: "true"
+
+      sleep_time:
+        description: '[DEBUG] sleep time for debugging'
+        required: true
+        default: "0"
+
+      provider_id:
+        description: 'ID of your provider'
+        required: true
+        default: "meta_reference"
+
+      model_id:
+        description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
+        required: true
+        default: "llama_3b"
+
+      model_override_3b:
+        description: 'Specify shorthand model for <llama_3b> '
+        required: false
+        default: "Llama3.2-3B-Instruct"
+
+      model_override_8b:
+        description: 'Specify shorthand model for <llama_8b> '
+        required: false
+        default: "Llama3.1-8B-Instruct"
+
+env:
+  # ID used for each test's provider config
+  PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
+
+  # Path to model checkpoints within EFS volume
+  MODEL_CHECKPOINT_DIR: "/data/llama"
+
+  # Path to directory to run tests from
+  TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
+
+  # Keep track of a list of model IDs that are valid to use within pytest fixture marks
+  AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
+
+  # Shorthand name for model ID, used in pytest fixture marks
+  MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
+
+  # Override the `llama_3b` / `llama_8b` models, else use the default.
+  LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
+  LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
+
+  # Defines which directories in TESTS_PATH to exclude from the test loop
+  EXCLUDED_DIRS: "__pycache__"
+
+  # Defines the output xml reports generated after a test is run
+  REPORTS_GEN: ""
+
+jobs:
+  execute_workflow:
+    name: Execute workload on Self-Hosted GPU k8s runner
+    permissions:
+      pull-requests: write
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
+    if: always()
+    steps:
+
+      ##############################
+      #### INITIAL DEBUG CHECKS ####
+      ##############################
+      - name: "[DEBUG] Check content of the EFS mount"
+        id: debug_efs_volume
+        continue-on-error: true
+        if: inputs.debug == 'true'
+        run: |
+            echo "========= Content of the EFS mount ============="
+            ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
+
+      - name: "[DEBUG] Get runner container OS information"
+        id: debug_os_info
+        if: ${{ inputs.debug == 'true' }}
+        run: |
+            cat /etc/os-release
+
+      - name: "[DEBUG] Print environment variables"
+        id: debug_env_vars
+        if: ${{ inputs.debug == 'true' }}
+        run: |
+            echo "PROVIDER_ID = ${PROVIDER_ID}"
+            echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
+            echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
+            echo "MODEL_ID = ${MODEL_ID}"
+            echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
+            echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
+            echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
+            echo "REPORTS_GEN = ${REPORTS_GEN}"
+
+      ############################
+      #### MODEL INPUT CHECKS ####
+      ############################
+
+      - name: "Check if env.model_id is valid"
+        id: check_model_id
+        run: |
+          if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
+            echo "Model ID '${MODEL_ID}' is valid."
+          else
+            echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
+            exit 1
+          fi
+
+      #######################
+      #### CODE CHECKOUT ####
+      #######################
+      - name: "Checkout 'meta-llama/llama-stack' repository"
+        id: checkout_repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          ref: ${{ inputs.checkout_reference }}  # dispatch input declared above; empty on other events, so checkout falls back to its default ref
+
+      - name: "[DEBUG] Content of the repository after checkout"
+        id: debug_content_after_checkout
+        if: ${{ inputs.debug == 'true' }}
+        run: |
+            ls -la ${GITHUB_WORKSPACE}
+
+      ##########################################################
+      ####              OPTIONAL SLEEP DEBUG                ####
+      #                                                        #
+      # Use to "exec" into the test k8s POD and run tests      #
+      # manually to identify what dependencies are being used. #
+      #                                                        #
+      ##########################################################
+      - name: "[DEBUG] sleep"
+        id: debug_sleep
+        if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
+        run: |
+            sleep ${{ inputs.sleep_time }}
+
+      ############################
+      #### UPDATE SYSTEM PATH ####
+      ############################
+      - name: "Update path: execute"
+        id: path_update_exec
+        run: |
+          # .local/bin is needed for certain libraries installed below to be recognized
+          # when calling their executable to install sub-dependencies
+          mkdir -p ${HOME}/.local/bin
+          echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
+
+      #####################################
+      #### UPDATE CHECKPOINT DIRECTORY ####
+      #####################################
+      - name: "Update checkpoint directory"
+        id: checkpoint_update
+        run: |
+          echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
+          if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
+            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
+          elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
+            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
+          else
+            echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
+            exit 1
+          fi
+
+      - name: "[DEBUG] Checkpoint update check"
+        id: debug_checkpoint_update
+        if: ${{ inputs.debug == 'true' }}
+        run: |
+          echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
+
+      ##################################
+      #### DEPENDENCY INSTALLATIONS ####
+      ##################################
+      - name: "Installing 'apt' required packages"
+        id: install_apt
+        run: |
+          echo "[STEP] Installing 'apt' required packages"
+          sudo apt update -y
+          sudo apt install -y python3 python3-pip npm wget
+
+      - name: "Installing packages with 'curl'"
+        id: install_curl
+        run: |
+          curl -fsSL https://ollama.com/install.sh | sh
+
+      - name: "Installing packages with 'wget'"
+        id: install_wget
+        run: |
+          wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+          chmod +x Miniconda3-latest-Linux-x86_64.sh
+          ./Miniconda3-latest-Linux-x86_64.sh -b -p "${HOME}/miniconda3"  # batch install only; faiss-gpu is installed by the later conda step
+          # Add miniconda3 bin to system path (must match the -p prefix above)
+          echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
+
+      - name: "Installing packages with 'npm'"
+        id: install_npm_generic
+        run: |
+          sudo npm install -g junit-merge
+
+      - name: "Installing pip dependencies"
+        id: install_pip_generic
+        run: |
+          echo "[STEP] Installing 'llama-stack' models"
+          pip install -U pip setuptools
+          pip install -r requirements.txt
+          pip install -e .
+          pip install -U \
+            torch torchvision \
+            pytest pytest_asyncio \
+            fairscale lm-format-enforcer \
+            zmq chardet pypdf \
+            pandas sentence_transformers together \
+            aiosqlite
+      - name: "Installing packages with conda"
+        id: install_conda_generic
+        run: |
+          conda install -y -q -c pytorch -c nvidia faiss-gpu=1.9.0  # -y: CI is non-interactive, never wait on the confirmation prompt
+
+      #############################################################
+      #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
+      #############################################################
+      - name: "Run Tests: Loop"
+        id: run_tests_loop
+        working-directory: "${{ github.workspace }}"
+        run: |
+          pattern=""
+          for dir in llama_stack/providers/tests/*; do
+            if [ -d "$dir" ]; then
+              dir_name=$(basename "$dir")
+              if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
+                for file in "$dir"/test_*.py; do
+                  test_name=$(basename "$file")
+                  new_file="result-${dir_name}-${test_name}.xml"
+                  if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
+                     --junitxml="${{ github.workspace }}/${new_file}"; then
+                    echo "Ran test: ${test_name}"
+                  else
+                    echo "Did NOT run test: ${test_name}"
+                  fi
+                  pattern+="${new_file} "
+                done
+              fi
+            fi
+          done
+          echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
+
+      - name: "Test Summary: Merge"
+        id: test_summary_merge
+        working-directory: "${{ github.workspace }}"
+        run: |
+          echo "Merging the following test result files: ${REPORTS_GEN}"
+          # Defaults to merging them into 'merged-test-results.xml'
+          junit-merge ${{ env.REPORTS_GEN }}
+
+      ############################################
+      #### AUTOMATIC TESTING ON PULL REQUESTS ####
+      ############################################
+
+      #### Run tests ####
+
+      - name: "PR - Run Tests"
+        id: pr_run_tests
+        working-directory: "${{ github.workspace }}"
+        if: github.event_name == 'pull_request_target'
+        run: |
+          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
+          # (Optional) Add more tests here.
+
+          # Merge test results with 'merged-test-results.xml' from above.
+          # junit-merge <new-test-results> merged-test-results.xml
+
+      #### Create test summary ####
+
+      - name: "PR - Test Summary"
+        id: pr_test_summary_create
+        if: github.event_name == 'pull_request_target'
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
+        with:
+          paths: "${{ github.workspace }}/merged-test-results.xml"
+          output: test-summary.md
+
+      - name: "PR - Upload Test Summary"
+        id: pr_test_summary_upload
+        if: github.event_name == 'pull_request_target'
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: test-summary
+          path: test-summary.md
+
+      #### Update PR request ####
+
+      - name: "PR - Update comment"
+        id: pr_update_comment
+        if: github.event_name == 'pull_request_target'
+        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
+        with:
+          filePath: test-summary.md
+
+      ########################
+      #### MANUAL TESTING ####
+      ########################
+
+      #### Run tests ####
+
+      - name: "Manual - Run Tests: Prep"
+        id: manual_run_tests
+        working-directory: "${{ github.workspace }}"
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
+
+          #TODO Use this when collection errors are resolved
+          # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
+
+          # (Optional) Add more tests here.
+
+          # Merge test results with 'merged-test-results.xml' from above.
+          # junit-merge <new-test-results> merged-test-results.xml
+
+      #### Create test summary ####
+
+      - name: "Manual - Test Summary"
+        id: manual_test_summary
+        if: always() && github.event_name == 'workflow_dispatch'
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
+        with:
+          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -1,14 +1,12 @@
 name: Installer CI

-run-name: Test the installation script
-
 on:
  pull_request:
    paths:
-      - 'scripts/install.sh'
+      - 'install.sh'
  push:
    paths:
-      - 'scripts/install.sh'
+      - 'install.sh'
  schedule:
    - cron: '0 2 * * *'  # every day at 02:00 UTC

@ -16,24 +14,13 @@ jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
-        run: shellcheck scripts/install.sh
-  smoke-test-on-dev:
+        run: shellcheck install.sh
+  smoke-test:
+    needs: lint
    runs-on: ubuntu-latest
    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Build a single provider
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-            llama stack build --template starter --image-type container --image-name test
-
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run installer end-to-end
-        run: |
-          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
-          ./scripts/install.sh --image $IMAGE_ID
+        run: ./install.sh
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -1,7 +1,5 @@
 name: Integration Auth Tests

-run-name: Run the integration test suite with Kubernetes authentication
-
 on:
  push:
    branches: [ main ]
@ -10,7 +8,6 @@ on:
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -31,14 +28,14 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Install minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
-        uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20
+        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19

      - name: Start minikube
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
@ -76,12 +73,9 @@ jobs:
          server:
            port: 8321
          EOF
-          yq eval '.server.auth.provider_config.type = "${{ matrix.auth-provider }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.tls_cafile = "${{ env.KUBERNETES_CA_CERT_PATH }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.issuer = "${{ env.KUBERNETES_ISSUER }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.audience = "${{ env.KUBERNETES_AUDIENCE }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.jwks.uri = "${{ env.KUBERNETES_API_SERVER_URL }}"' -i $run_dir/run.yaml
-          yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
+          yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
+          yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}", "token": "${{ env.TOKEN }}"}' -i $run_dir/run.yaml
          cat $run_dir/run.yaml

          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
--- a/.github/workflows/integration-sql-store-tests.yml
+++ b/.github/workflows/integration-sql-store-tests.yml
@ -1,72 +0,0 @@
-name: SqlStore Integration Tests
-
-run-name: Run the integration test suite with SqlStore
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/providers/utils/sqlstore/**'
-      - 'tests/integration/sqlstore/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - '.github/workflows/integration-sql-store-tests.yml' # This workflow
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  test-postgres:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12", "3.13"]
-      fail-fast: false
-
-    services:
-      postgres:
-        image: postgres:15
-        env:
-          POSTGRES_USER: llamastack
-          POSTGRES_PASSWORD: llamastack
-          POSTGRES_DB: llamastack
-        ports:
-          - 5432:5432
-        options: >-
-          --health-cmd pg_isready
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Run SqlStore Integration Tests
-        env:
-          ENABLE_POSTGRES_TESTS: "true"
-          POSTGRES_HOST: localhost
-          POSTGRES_PORT: 5432
-          POSTGRES_DB: llamastack
-          POSTGRES_USER: llamastack
-          POSTGRES_PASSWORD: llamastack
-        run: |
-          uv run pytest -sv tests/integration/providers/utils/sqlstore/
-
-      - name: Upload test logs
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -1,87 +1,120 @@
-name: Integration Tests (Replay)
-
-run-name: Run the integration test suite from tests/integration in replay mode
+name: Integration Tests

 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
-    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/**'
+      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
+      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow
-      - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
-  schedule:
-    # If changing the cron schedule, update the provider in the test-matrix job
-    - cron: '0 0 * * *'  # (test latest client) Daily at 12 AM UTC
-    - cron: '1 0 * * 0'  # (test vllm) Weekly on Sunday at 1 AM UTC
-  workflow_dispatch:
-    inputs:
-      test-all-client-versions:
-        description: 'Test against both the latest and published versions'
-        type: boolean
-        default: false
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''

 concurrency:
-  # Skip concurrency for pushes to main - each commit should be tested independently
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

 jobs:
-
-  run-replay-mode-tests:
+  test-matrix:
    runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
-
    strategy:
-      fail-fast: false
      matrix:
-        client-type: [library, server]
-        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
-        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
-        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        # Listing tests manually since some of them currently fail
+        # TODO: generate matrix list from tests/integration when fixed
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
+        client-type: [library, http]
+        python-version: ["3.12", "3.13"]
+      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
        with:
          python-version: ${{ matrix.python-version }}
-          client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
-          inference-mode: 'replay'

-      - name: Run tests
-        uses: ./.github/actions/run-and-record-tests
+      - name: Setup ollama
+        uses: ./.github/actions/setup-ollama
+
+      - name: Build Llama Stack
+        run: |
+          uv run llama stack build --template ollama --image-type venv
+
+      - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        run: |
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
+
+      - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
+        run: |
+          echo "Waiting for Llama Stack server..."
+          for i in {1..30}; do
+            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
+              echo "Llama Stack server is up!"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "Llama Stack server failed to start"
+          cat server.log
+          exit 1
+
+      - name: Verify Ollama status is OK
+        if: matrix.client-type == 'http'
+        run: |
+          echo "Verifying Ollama status..."
+          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
+          echo "Ollama status: $ollama_status"
+          if [ "$ollama_status" != "OK" ]; then
+            echo "Ollama health check failed"
+            exit 1
+          fi
+
+      - name: Check Storage and Memory Available Before Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Run Integration Tests
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
+        run: |
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
+
+      - name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Write ollama logs to file
+        if: ${{ always() }}
+        run: |
+          sudo docker logs ollama > ollama.log 2>&1  # docker logs replays container stderr on stderr; capture both streams
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
-          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
-          inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          path: |
+            *.log
+          retention-days: 1
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -1,7 +1,5 @@
 name: Vector IO Integration Tests

-run-name: Run the integration test suite with various VectorIO providers
-
 on:
  push:
    branches: [ main ]
@ -9,17 +7,14 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/integration/vector_io/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-vector-io-tests.yml' # This workflow
-  schedule:
-    - cron: '0 0 * * *'  # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

 jobs:
@ -27,13 +22,13 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
+        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "remote::chromadb", "remote::pgvector"]
+        python-version: ["3.12", "3.13"]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -51,14 +46,6 @@ jobs:
            -e ANONYMIZED_TELEMETRY=FALSE \
            chromadb/chroma:latest

-      - name: Setup Weaviate
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          docker run --rm -d --pull always \
-          --name weaviate \
-          -p 8080:8080 -p 50051:50051 \
-          cr.weaviate.io/semitechnologies/weaviate:1.32.0
-
      - name: Start PGVector DB
        if: matrix.vector-io-provider == 'remote::pgvector'
        run: |
@ -89,29 +76,6 @@ jobs:
          PGPASSWORD=llamastack psql -h localhost -U llamastack -d llamastack \
            -c "CREATE EXTENSION IF NOT EXISTS vector;"

-      - name: Setup Qdrant
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          docker run --rm -d --pull always \
-            --name qdrant \
-            -p 6333:6333 \
-            qdrant/qdrant
-
-      - name: Wait for Qdrant to be ready
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          echo "Waiting for Qdrant to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:6333/collections | grep -q '"status":"ok"'; then
-              echo "Qdrant is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Qdrant failed to start"
-          docker logs qdrant
-          exit 1
-
      - name: Wait for ChromaDB to be ready
        if: matrix.vector-io-provider == 'remote::chromadb'
        run: |
@ -127,24 +91,9 @@ jobs:
          docker logs chromadb
          exit 1

-      - name: Wait for Weaviate to be ready
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          echo "Waiting for Weaviate to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8080 | grep -q "https://weaviate.io/developers/weaviate/current/"; then
-              echo "Weaviate is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Weaviate failed to start"
-          docker logs weaviate
-          exit 1
-
      - name: Build Llama Stack
        run: |
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
+          uv run llama stack build --template starter --image-type venv

      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
@ -162,15 +111,10 @@ jobs:
          PGVECTOR_DB: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
          PGVECTOR_USER: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
          PGVECTOR_PASSWORD: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
-          ENABLE_QDRANT: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'true' || '' }}
-          QDRANT_URL: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'http://localhost:6333' || '' }}
-          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
-          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
        run: |
-          uv run --no-sync \
-            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+          uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
            tests/integration/vector_io \
-            --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
+            --embedding-model all-MiniLM-L6-v2

      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
@ -188,11 +132,6 @@ jobs:
        run: |
          docker logs chromadb > chromadb.log

-      - name: Write Qdrant logs to file
-        if: ${{ always() && matrix.vector-io-provider == 'remote::qdrant' }}
-        run: |
-          docker logs qdrant > qdrant.log
-
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,7 +1,5 @@
 name: Pre-commit

-run-name: Run pre-commit checks
-
 on:
  pull_request:
  push:
@ -14,18 +12,10 @@ concurrency:
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write

    steps:
      - name: Checkout code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          # For dependabot PRs, we need to checkout with a token that can push changes
-          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
-          # Fetch full history for dependabot PRs to allow commits
-          fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
@ -36,61 +26,16 @@ jobs:
            **/requirements*.txt
            .pre-commit-config.yaml

-      # npm ci may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-
-      # - name: Set up Node.js
-      #   uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
-      #   with:
-      #     node-version: '20'
-      #     cache: 'npm'
-      #     cache-dependency-path: 'llama_stack/ui/'
-
-      # - name: Install npm dependencies
-      #   run: npm ci
-      #   working-directory: llama_stack/ui
-
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

-      - name: Debug
-        run: |
-          echo "github.ref: ${{ github.ref }}"
-          echo "github.actor: ${{ github.actor }}"
-
-      - name: Commit changes for dependabot PRs
-        if: github.actor == 'dependabot[bot]'
-        run: |
-          if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
-            git config --local user.email "github-actions[bot]@users.noreply.github.com"
-            git config --local user.name "github-actions[bot]"
-
-            # Ensure we're on the correct branch
-            git checkout -B ${{ github.head_ref }}
-            git add -A
-            git commit -m "Apply pre-commit fixes"
-
-            # Pull latest changes from the PR branch and rebase our commit on top
-            git pull --rebase origin ${{ github.head_ref }}
-
-            # Push to the PR branch
-            git push origin ${{ github.head_ref }}
-            echo "Pre-commit fixes committed and pushed"
-          else
-            echo "No changes to commit"
-          fi
-
      - name: Verify if there are any diff files after pre-commit
-        if: github.actor != 'dependabot[bot]'
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

      - name: Verify if there are any new files after pre-commit
-        if: github.actor != 'dependabot[bot]'
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -1,7 +1,5 @@
 name: Test Llama Stack Build

-run-name: Test llama stack build
-
 on:
  push:
    branches:
@ -9,20 +7,20 @@ on:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
+      - 'llama_stack/templates/**'
      - 'pyproject.toml'

  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
+      - 'llama_stack/templates/**'
      - 'pyproject.toml'

 concurrency:
@ -33,42 +31,42 @@ jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
-      distros: ${{ steps.set-matrix.outputs.distros }}
+      templates: ${{ steps.set-matrix.outputs.templates }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Generate Distribution List
+      - name: Generate Template List
        id: set-matrix
        run: |
-          distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
-          echo "distros=$distros" >> "$GITHUB_OUTPUT"
+          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+          echo "templates=$templates" >> "$GITHUB_OUTPUT"

  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
+        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Print build dependencies
        run: |
-          uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
@ -79,7 +77,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -92,23 +90,23 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
-          yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
-          yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
+          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml

      - name: Inspect the container image entrypoint
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
@ -117,32 +115,32 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

-      - name: Pin distribution to UBI9 base
+      - name: Pin template to UBI9 base
        run: |
          yq -i '
            .image_type    = "container" |
            .image_name    = "ubi9-test" |
            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/distributions/ci-tests/build.yaml
+          ' llama_stack/templates/starter/build.yaml

      - name: Build dev container (UBI9)
        env:
          USE_COPY_NOT_MOUNT: "true"
          LLAMA_STACK_DIR: "."
        run: |
-          uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+          uv run llama stack build --config llama_stack/templates/starter/build.yaml

      - name: Inspect UBI9 image
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -1,7 +1,5 @@
 name: Python Package Build Test

-run-name: Test building the llama-stack PyPI project
-
 on:
  push:
    branches:
@ -9,8 +7,6 @@ on:
  pull_request:
    branches:
      - main
-    paths-ignore:
-        - 'llama_stack/ui/**'

 jobs:
  build:
@ -21,10 +17,10 @@ jobs:

    steps:
    - name: Checkout repository
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

    - name: Install uv
-      uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0
+      uses: astral-sh/setup-uv@445689ea25e0de0a23313031f5fe577c74ae45a1 # v6.3.0
      with:
        python-version: ${{ matrix.python-version }}
        activate-environment: true
--- a/.github/workflows/record-integration-tests.yml
+++ b/.github/workflows/record-integration-tests.yml
@ -1,70 +0,0 @@
-# This workflow should be run manually when needing to re-record tests. This happens when you have
-#  - added a new test
-#  - or changed an existing test such that a new inference call is made
-# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
-# tests and commit the recordings to the PR branch.
-name: Integration Tests (Record)
-
-run-name: Run the integration test suite from tests/integration
-
-on:
-  workflow_dispatch:
-    inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
-
-jobs:
-  record-tests:
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: write
-
-    steps:
-      - name: Echo workflow inputs
-        run: |
-          echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
-          echo "branch: ${{ github.ref_name }}"
-          echo "::endgroup::"
-
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          fetch-depth: 0
-
-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
-        with:
-          python-version: "3.12"  # Use single Python version for recording
-          client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
-          inference-mode: 'record'
-
-      - name: Run and record tests
-        uses: ./.github/actions/run-and-record-tests
-        with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
-          stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
--- a/.github/workflows/semantic-pr.yml
+++ b/.github/workflows/semantic-pr.yml
@ -1,7 +1,5 @@
 name: Check semantic PR titles

-run-name: Ensure that PR titles follow the conventional commit spec
-
 on:
  pull_request_target:
    types:
@ -11,7 +9,7 @@ on:
      - synchronize

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  # pull_request_target sets github.ref to the base branch, so key on the PR number
  cancel-in-progress: true

 permissions:
@ -22,6 +20,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0
+        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@ -1,7 +1,5 @@
 name: Close stale issues and PRs

-run-name: Run the Stale Bot action
-
 on:
  schedule:
    - cron: '0 0 * * *' # every day at midnight
--- a/.github/workflows/test-external-provider-module.yml
+++ b/.github/workflows/test-external-provider-module.yml
@ -1,86 +0,0 @@
-name: Test External Providers Installed via Module
-
-run-name: Test External Provider installation via Python module
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'tests/external/*'
-      - '.github/workflows/test-external-provider-module.yml' # This workflow
-
-jobs:
-  test-external-providers-from-module:
-    # This workflow is disabled. See https://github.com/meta-llama/llama-stack/pull/2975#issuecomment-3138702984 for details
-    if: false
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        image-type: [venv]
-        # We don't do container yet, it's tricky to install a package from the host into the
-        # container and point 'uv pip install' to the correct path...
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Install Ramalama
-        shell: bash
-        run: |
-          uv pip install ramalama
-
-      - name: Run Ramalama
-        shell: bash
-        run: |
-          nohup ramalama serve llama3.2:3b-instruct-fp16  > ramalama_server.log 2>&1 &
-      - name: Apply image type to config file
-        run: |
-          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
-          cat tests/external/ramalama-stack/run.yaml
-
-      - name: Build distro from config file
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
-
-      - name: Start Llama Stack server in background
-        if: ${{ matrix.image-type }} == 'venv'
-        env:
-          INFERENCE_MODEL: "llama3.2:3b-instruct-fp16"
-          LLAMA_STACK_LOG_FILE: "server.log"
-        run: |
-          # Use the virtual environment created by the build step (name comes from build config)
-          source ramalama-stack-test/bin/activate
-          uv pip list
-          nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
-
-      - name: Wait for Llama Stack server to be ready
-        run: |
-          for i in {1..30}; do
-            if ! grep -q "successfully connected to Ramalama" server.log; then
-              echo "Waiting for Llama Stack server to load the provider..."
-              sleep 1
-            else
-              echo "Provider loaded"
-              exit 0
-            fi
-          done
-          echo "Provider failed to load"
-          cat server.log
-          exit 1
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-provider-module-test
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/test-external-providers.yml
+++ b/.github/workflows/test-external-providers.yml
@ -0,0 +1,73 @@
+name: Test External Providers
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/integration/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/test-external-providers.yml' # This workflow
+
+jobs:
+  test-external-providers:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        image-type: [venv]
+        # We don't do container yet, it's tricky to install a package from the host into the
+        # container and point 'uv pip install' to the correct path...
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Apply image type to config file
+        run: |
+          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+          cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+
+      - name: Setup directory for Ollama custom provider
+        run: |
+          mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
+          cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
+
+      - name: Create provider configuration
+        run: |
+          mkdir -p /home/runner/.llama/providers.d/remote/inference
+          cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
+
+      - name: Build distro from config file
+        run: |
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+
+      - name: Start Llama Stack server in background
+        if: ${{ matrix.image-type == 'venv' }}  # the comparison must be inside the expression; outside it the result is a non-empty string and always truthy
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        run: |
+          # Use the virtual environment created by the build step (name comes from build config)
+          source ci-test/bin/activate
+          uv pip list
+          nohup llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
+
+      - name: Wait for Llama Stack server to be ready
+        run: |
+          for i in {1..30}; do
+            if ! grep -q "Successfully loaded external provider remote::custom_ollama" server.log; then
+              echo "Waiting for Llama Stack server to load the provider..."
+              sleep 1
+            else
+              echo "Provider loaded"
+              exit 0
+            fi
+          done
+          echo "Provider failed to load"
+          cat server.log
+          exit 1
--- a/.github/workflows/test-external.yml
+++ b/.github/workflows/test-external.yml
@ -1,89 +0,0 @@
-name: Test External API and Providers
-
-run-name: Test the External API and Provider mechanisms
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'requirements.txt'
-      - 'tests/external/*'
-      - '.github/workflows/test-external.yml' # This workflow
-
-jobs:
-  test-external:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        image-type: [venv]
-        # We don't do container yet, it's tricky to install a package from the host into the
-        # container and point 'uv pip install' to the correct path...
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
-
-      - name: Create API configuration
-        run: |
-          mkdir -p /home/runner/.llama/apis.d
-          cp tests/external/weather.yaml /home/runner/.llama/apis.d/weather.yaml
-
-      - name: Create provider configuration
-        run: |
-          mkdir -p /home/runner/.llama/providers.d/remote/weather
-          cp tests/external/kaze.yaml /home/runner/.llama/providers.d/remote/weather/kaze.yaml
-
-      - name: Print distro dependencies
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
-
-      - name: Build distro from config file
-        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
-
-      - name: Start Llama Stack server in background
-        if: ${{ matrix.image-type }} == 'venv'
-        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-          LLAMA_STACK_LOG_FILE: "server.log"
-        run: |
-          # Use the virtual environment created by the build step (name comes from build config)
-          source ci-test/bin/activate
-          uv pip list
-          nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
-
-      - name: Wait for Llama Stack server to be ready
-        run: |
-          echo "Waiting for Llama Stack server..."
-          for i in {1..30}; do
-            if curl -sSf http://localhost:8321/v1/health | grep -q "OK"; then
-              echo "Llama Stack server is up!"
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Llama Stack server failed to start"
-          cat server.log
-          exit 1
-
-      - name: Test external API
-        run: |
-          curl -sSf http://localhost:8321/v1/weather/locations
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-test
-          path: |
-            *.log
-          retention-days: 1
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -0,0 +1,69 @@
+name: auto-tests
+
+on:
+  # pull_request:
+  workflow_dispatch:
+    inputs:
+      commit_sha:
+        description: 'Specific Commit SHA to trigger on (leave empty to use the latest commit of the selected branch)'
+        required: false
+        default: ''  # input defaults are literal strings ("$GITHUB_SHA" is never expanded); an empty ref makes actions/checkout use the triggering ref
+
+jobs:
+  test-llama-stack-as-library:
+    runs-on: ubuntu-latest
+    env:
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
+    strategy:
+      matrix:
+        provider: [fireworks, together]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          ref: ${{ github.event.inputs.commit_sha }}
+
+      - name: Echo commit SHA
+        run: |
+          echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
+          git rev-parse HEAD
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt pytest
+          pip install -e .
+
+      - name: Build providers
+        run: |
+          llama stack build --template ${{ matrix.provider }} --image-type venv
+
+      - name: Install the latest llama-stack-client & llama-models packages
+        run: |
+          pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
+          pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
+
+      - name: Run client-sdk test
+        working-directory: "${{ github.workspace }}"
+        env:
+          REPORT_OUTPUT: md_report.md
+        shell: bash
+        run: |
+          pip install --upgrade pytest-md-report
+          echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
+
+          export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+          LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
+
+      - name: Output reports to the job summary
+        if: always()
+        shell: bash
+        run: |
+          if [ -f "$REPORT_FILE" ]; then
+            echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
+            echo "" >> $GITHUB_STEP_SUMMARY
+            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
+            echo "" >> $GITHUB_STEP_SUMMARY
+            echo "</details>" >> $GITHUB_STEP_SUMMARY
+          fi
--- a/.github/workflows/ui-unit-tests.yml
+++ b/.github/workflows/ui-unit-tests.yml
@ -1,55 +0,0 @@
-name: UI Tests
-
-run-name: Run the UI test suite
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/ui/**'
-      - '.github/workflows/ui-unit-tests.yml' # This workflow
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  ui-tests:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        node-version: [22]
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
-        with:
-          node-version: ${{ matrix.node-version }}
-          cache: 'npm'
-          cache-dependency-path: 'llama_stack/ui/package-lock.json'
-
-      - name: Install dependencies
-        working-directory: llama_stack/ui
-        run: npm ci
-
-      - name: Run linting
-        working-directory: llama_stack/ui
-        run: npm run lint
-
-      - name: Run format check
-        working-directory: llama_stack/ui
-        run: npm run format:check
-
-      - name: Run unit tests
-        working-directory: llama_stack/ui
-        env:
-          CI: true
-
-        run: npm test -- --coverage --watchAll=false --passWithNoTests
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -1,7 +1,5 @@
 name: Unit Tests

-run-name: Run the unit test suite
-
 on:
  push:
    branches: [ main ]
@ -9,7 +7,6 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/unit/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -32,16 +29,14 @@ jobs:
          - "3.13"
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
-        with:
-          python-version: ${{ matrix.python }}

      - name: Run unit tests
        run: |
-          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml
+          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}

      - name: Upload test results
        if: always()
--- a/.github/workflows/update-readthedocs.yml
+++ b/.github/workflows/update-readthedocs.yml
@ -1,7 +1,5 @@
 name: Update ReadTheDocs

-run-name: Update the Llama Stack ReadTheDocs site
-
 on:
  workflow_dispatch:
    inputs:
@ -37,7 +35,7 @@ jobs:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -2,7 +2,6 @@ exclude: 'build/'

 default_language_version:
    python: python3.12
-    node: "22"

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
@ -15,11 +14,12 @@ repos:
    -   id: check-added-large-files
        args: ['--maxkb=1000']
    -   id: end-of-file-fixer
-        exclude: '^(.*\.svg|.*\.md)$'
+        exclude: '^(.*\.svg)$'
    -   id: no-commit-to-branch
    -   id: check-yaml
        args: ["--unsafe"]
    -   id: detect-private-key
+    -   id: requirements-txt-fixer
    -   id: mixed-line-ending
        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
    -   id: check-executables-have-shebangs
@ -29,7 +29,7 @@ repos:
    -   id: check-toml

 -   repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.5
+    rev: v1.5.4
    hooks:
    -   id: insert-license
        files: \.py$|\.sh$
@ -38,7 +38,7 @@ repos:
          - docs/license_header.txt

 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+    rev: v0.9.4
    hooks:
    -   id: ruff
        args: [ --fix ]
@ -46,19 +46,27 @@ repos:
    -   id: ruff-format

 -   repo: https://github.com/adamchainz/blacken-docs
-    rev: 1.19.1
+    rev: 1.19.0
    hooks:
    -   id: blacken-docs
        additional_dependencies:
        - black==24.3.0

 -   repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.7.20
+    rev: 0.7.8
    hooks:
    -   id: uv-lock
+    -   id: uv-export
+        args: [
+            "--frozen",
+            "--no-hashes",
+            "--no-emit-project",
+            "--no-default-groups",
+            "--output-file=requirements.txt"
+        ]

 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.1
+    rev: v1.15.0
    hooks:
    -   id: mypy
        additional_dependencies:
@ -87,15 +95,6 @@ repos:
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
-      - id: provider-codegen
-        name: Provider Codegen
-        additional_dependencies:
-          - uv==0.7.8
-        entry: uv run --group codegen ./scripts/provider_codegen.py
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^llama_stack/providers/.*$
      - id: openapi-codegen
        name: API Spec Codegen
        additional_dependencies:
@ -121,81 +120,7 @@ repos:
        require_serial: true
        always_run: true
        files: ^llama_stack/.*$
-      - id: forbid-pytest-asyncio
-        name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            grep -EnH '^[^#]*@pytest\.mark\.asyncio|@pytest_asyncio\.fixture' "$@" && {
-              echo;
-              echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
-              echo "   pytest is already configured with async-mode=auto."
-              echo;
-              exit 1;
-            } || true
-      - id: generate-ci-docs
-        name: Generate CI documentation
-        additional_dependencies:
-          - uv==0.7.8
-        entry: uv run ./scripts/gen-ci-docs.py
-        language: python
-        pass_filenames: false
-        require_serial: true
-        files: ^.github/workflows/.*$
-      # ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-      # and until we have infra for installing prettier and next via npm -
-      #   Lint UI code with ESLint.....................................................Failed
-      #   - hook id: ui-eslint
-      #   - exit code: 127
-      #   > ui@0.1.0 lint
-      #   > next lint --fix --quiet
-      #   sh: line 1: next: command not found
-      #
-      # - id: ui-prettier
-      #   name: Format UI code with Prettier
-      #   entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-      # - id: ui-eslint
-      #   name: Lint UI code with ESLint
-      #   entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-
-      - id: check-log-usage
-        name: Ensure 'llama_stack.log' usage for logging
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
-            if [ -n "$matches" ]; then
-              # GitHub Actions annotation format
-              while IFS=: read -r file line_num rest; do
-                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
-              done <<< "$matches"
-              exit 1
-            fi
-            exit 0

 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
    autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
-    autofix_prs: true
-    autoupdate_branch: ''
-    autoupdate_schedule: weekly
-    skip: []
-    submodules: false
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,34 +1,5 @@
 # Changelog

-# v0.2.15
-Published on: 2025-07-16T03:30:01Z
-
-
-
---
-
-# v0.2.14
-Published on: 2025-07-04T16:06:48Z
-
-## Highlights
-
-* Support for Llama Guard 4
-* Added Milvus  support to vector-stores API
-* Documentation and zero-to-hero updates for latest APIs
-
-
---
-
-# v0.2.13
-Published on: 2025-06-28T04:28:11Z
-
-## Highlights
-* search_mode support in OpenAI vector store API
-* Security fixes
-
-
---
-
 # v0.2.12
 Published on: 2025-06-20T22:52:12Z

@ -451,7 +422,7 @@ GenAI application developers need more than just an LLM - they need to integrate

 Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.

-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
+With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.

 ## Release
 After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
@ -514,3 +485,23 @@ A small but important bug-fix release to update the URL datatype for the client-

 ---

+# v0.0.62
+Published on: 2024-12-18T02:39:43Z
+
+
+
+---
+
+# v0.0.61
+Published on: 2024-12-10T20:50:33Z
+
+
+
+---
+
+# v0.0.55
+Published on: 2024-11-23T17:14:07Z
+
+
+
+---
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,91 +1,17 @@
-# Contributing to Llama Stack
+# Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.

-## Set up your development environment
-
-We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
-You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
-
-You can install the dependencies by running:
-
-```bash
-cd llama-stack
-uv sync --group dev
-uv pip install -e .
-source .venv/bin/activate
-```
-
-```{note}
-You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
-Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
-For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
-```
-
-Note that you can create a dotenv file `.env` that includes necessary environment variables:
-```
-LLAMA_STACK_BASE_URL=http://localhost:8321
-LLAMA_STACK_CLIENT_LOG=debug
-LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=<provider-name>
-TAVILY_SEARCH_API_KEY=
-BRAVE_SEARCH_API_KEY=
-```
-
-And then use this dotenv file when running client SDK tests via the following:
-```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
-```
-
-### Pre-commit Hooks
-
-We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
-
-```bash
-uv run pre-commit install
-```
-
-After that, pre-commit hooks will run automatically before each commit.
-
-Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
-
-```bash
-uv run pre-commit run --all-files
-```
-
-```{caution}
-Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
-```
-
 ## Discussions -> Issues -> Pull Requests

 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).

 If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.

-### Issues
-We use GitHub issues to track public bugs. Please ensure your description is
-clear and has sufficient instructions to be able to reproduce the issue.
-
-Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
-disclosure of security bugs. In those cases, please go through the process
-outlined on that page and do not file a public issue.
-
-### Contributor License Agreement ("CLA")
-In order to accept your pull request, we need you to submit a CLA. You only need
-to do this once to work on any of Meta's open source projects.
-
-Complete your CLA here: <https://code.facebook.com/cla>
-
 **I'd like to contribute!**

-If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
-leave a comment on the issue and a triager will assign it to you.
-
-Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1–2 issues at a time, especially if you’re still getting familiar with the codebase.
- Before taking an issue, check if it’s already assigned or being actively discussed.
- If you’re blocked or can’t continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
+All issues are actionable (please report if they are not). Pick one and start working on it. Thank you.
+If you need help or guidance, comment on the issue. Issues that are extra friendly to new contributors are tagged with "contributor friendly".

 **I have a bug!**

@ -115,20 +41,89 @@ Please avoid picking up too many issues at once. This helps you stay focused and
 4. Make sure your code lints using `pre-commit`.
 5. If you haven't already, complete the Contributor License Agreement ("CLA").
 6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
-7. Ensure your pull request follows the [coding style](#coding-style).
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.


-Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
+## Set up your development environment

-```{tip}
-As a general guideline:
- Experienced contributors should try to keep no more than 5 open PRs at a time.
- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
+We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
+You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+
+You can install the dependencies by running:
+
+```bash
+cd llama-stack
+uv sync --extra dev
+uv pip install -e .
+source .venv/bin/activate
 ```

-## Repository guidelines
+> [!NOTE]
+> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
+> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).

-### Coding Style
+Note that you can create a dotenv file `.env` that includes necessary environment variables:
+```
+LLAMA_STACK_BASE_URL=http://localhost:8321
+LLAMA_STACK_CLIENT_LOG=debug
+LLAMA_STACK_PORT=8321
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
+```
+
+And then use this dotenv file when running client SDK tests via the following:
+```bash
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+## Pre-commit Hooks
+
+We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
+
+```bash
+uv run pre-commit install
+```
+
+After that, pre-commit hooks will run automatically before each commit.
+
+Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
+
+```bash
+uv run pre-commit run --all-files
+```
+
+> [!CAUTION]
+> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
+
+## Running tests
+
+You can find the Llama Stack testing documentation [here](tests/README.md).
+
+## Adding a new dependency to the project
+
+To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
+
+```bash
+uv add foo
+uv sync
+```
+
+## Coding Style

 * Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter, same goes for docstrings.
@ -144,15 +139,6 @@ As a general guideline:
  justification for bypassing the check.
 * Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
  readability reasons.
-* Providers configuration class should be Pydantic Field class. It should have a `description` field
-  that describes the configuration. These descriptions will be used to generate the provider
-  documentation.
-* When possible, use keyword arguments only when calling functions.
-* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
-
-### License
-By contributing to Llama, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.

 ## Common Tasks

@ -160,7 +146,7 @@ Some tips about common tasks you work on while contributing to Llama Stack:

 ### Using `llama stack build`

-Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.

 Example:
 ```bash
@ -168,22 +154,13 @@ cd work/
 git clone https://github.com/meta-llama/llama-stack.git
 git clone https://github.com/meta-llama/llama-stack-client-python.git
 cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
+LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
 ```

-### Updating distribution configurations

-If you have made changes to a provider's configuration in any form (introducing a new config key, or
-changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML
-files as well as the documentation. You should not change `docs/source/.../distributions/` files
-manually as they are auto-generated.
+### Updating Provider Configurations

-### Updating the provider documentation
-
-If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
-to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
-as they are auto-generated.
-Note that the provider "description" field will be used to generate the provider documentation.
+If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.

 ### Building the Documentation

@ -206,3 +183,7 @@ uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```

 The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+
+## License
+By contributing to Llama, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +1,9 @@
 include pyproject.toml
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/models/llama/llama4/tokenizer.model
-include llama_stack/core/*.sh
+include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
-include llama_stack/distributions/*/*.yaml
+include llama_stack/templates/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
--- a/README.md
+++ b/README.md
@ -9,7 +9,6 @@

 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

-
 ### ✨🎉 Llama 4 Support  🎉✨
 We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

@ -36,8 +35,6 @@ pip install llama-stack-client
 ### CLI
 ```bash
 # Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-
 llama-stack-client --endpoint http://localhost:8321 \
 inference chat-completion \
 --model-id meta-llama/$MODEL \
@ -78,7 +75,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
 To try Llama Stack locally, run:

 ```bash
-curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
+curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | bash
 ```

 ### Overview
@ -109,49 +106,47 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on

 ### API Providers
 Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
-Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)

-| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
-|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
-|    Meta Reference    | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-|      SambaNova       | Hosted | | ✅ | | ✅ | | | | |
-|       Cerebras       | Hosted | | ✅ | | | | | | |
-|      Fireworks       | Hosted | ✅ | ✅ | ✅ | | | | | |
-|     AWS Bedrock      | Hosted | | ✅ | | ✅ | | | | |
-|       Together       | Hosted | ✅ | ✅ | | ✅ | | | | |
-|         Groq         | Hosted | | ✅ | | | | | | |
-|        Ollama        | Single Node | | ✅ | | | | | | |
-|         TGI          | Hosted/Single Node | | ✅ | | | | | | |
-|      NVIDIA NIM      | Hosted/Single Node | | ✅ | | ✅ | | | | |
-|       ChromaDB       | Hosted/Single Node | | | ✅ | | | | | |
-|        Milvus        | Hosted/Single Node | | | ✅ | | | | | |
-|        Qdrant        | Hosted/Single Node | | | ✅ | | | | | |
-|       Weaviate       | Hosted/Single Node | | | ✅ | | | | | |
-|      SQLite-vec      | Single Node | | | ✅ | | | | | |
-|      PG Vector       | Single Node | | | ✅ | | | | | |
-|  PyTorch ExecuTorch  | On-device iOS | ✅ | ✅ | | | | | | |
-|         vLLM         | Single Node | | ✅ | | | | | | |
-|        OpenAI        | Hosted | | ✅ | | | | | | |
-|      Anthropic       | Hosted | | ✅ | | | | | | |
-|        Gemini        | Hosted | | ✅ | | | | | | |
-|       WatsonX        | Hosted | | ✅ | | | | | | |
-|     HuggingFace      | Single Node | | | | | | ✅ | | ✅ |
-|      TorchTune       | Single Node | | | | | | ✅ | | |
-|     NVIDIA NEMO      | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
-|        NVIDIA        | Hosted | | | | | | ✅ | ✅ | ✅ |
+| **API Provider Builder** |    **Environments**    | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
+|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
+|      Meta Reference      |      Single Node       |     ✅      |       ✅       |     ✅      |     ✅      |       ✅       |               |
+|        SambaNova         |         Hosted         |            |       ✅       |            |     ✅      |               |                  |
+|         Cerebras         |         Hosted         |            |       ✅       |            |            |               |                  |
+|        Fireworks         |         Hosted         |     ✅      |       ✅       |     ✅      |            |               |                |
+|       AWS Bedrock        |         Hosted         |            |       ✅       |            |     ✅      |               |                |
+|         Together         |         Hosted         |     ✅      |       ✅       |            |     ✅      |               |                |
+|           Groq           |         Hosted         |            |       ✅       |            |            |               |                 |
+|          Ollama          |      Single Node       |            |       ✅       |            |            |               |                 |
+|           TGI            | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|        NVIDIA NIM        | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|          Chroma          |      Single Node       |            |               |     ✅      |            |               |                 |
+|        PG Vector         |      Single Node       |            |               |     ✅      |            |               |                 |
+|    PyTorch ExecuTorch    |     On-device iOS      |     ✅      |       ✅       |            |            |               |                |
+|           vLLM           | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|          OpenAI          |         Hosted         |            |       ✅       |            |            |               |                 |
+|        Anthropic         |         Hosted         |            |       ✅       |            |            |               |                 |
+|          Gemini          |         Hosted         |            |       ✅       |            |            |               |                 |
+|          watsonx         |         Hosted         |            |       ✅       |            |            |               |                 |
+|        HuggingFace       |       Single Node      |            |                |            |            |               |       ✅        |
+|         TorchTune        |       Single Node      |            |                |            |            |               |       ✅        |
+|       NVIDIA NEMO        |         Hosted         |            |                |            |            |               |       ✅        |

-> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.

 ### Distributions

-A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (eg. ollama) and seamlessly transition to production (eg. Fireworks) without changing your application code.
-Here are some of the distributions we support:
+A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (e.g. ollama) and seamlessly transition to production (e.g. Fireworks) without changing your application code. Here are some of the distributions we support:

 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
-|                Starter Distribution                 |           [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html)      |
 |                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
-|                   PostgreSQL                  |                [llamastack/distribution-postgres-demo](https://hub.docker.com/repository/docker/llamastack/distribution-postgres-demo/general)                |                  |
+|                   SambaNova                   |                     [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html)   |
+|                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html)   |
+|                    Ollama                     |                       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)                       |            [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html)            |
+|                      TGI                      |                          [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general)                          |             [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html)              |
+|                   Together                    |                     [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general)                     |           [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html)           |
+|                   Fireworks                   |                    [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)                    |          [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html)           |
+| vLLM |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |
+

 ### Documentation

@ -180,17 +175,3 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
 Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
-
-
-## 🌟 GitHub Star History
-## Star History
-
-[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date)
-
-## ✨ Contributors
-
-Thanks to all of our amazing contributors!
-
-<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
-</a>
--- a/coverage.svg
+++ b/coverage.svg
@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
-    <linearGradient id="b" x2="0" y2="100%">
-        <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
-        <stop offset="1" stop-opacity=".1"/>
-    </linearGradient>
-    <mask id="a">
-        <rect width="99" height="20" rx="3" fill="#fff"/>
-    </mask>
-    <g mask="url(#a)">
-        <path fill="#555" d="M0 0h63v20H0z"/>
-        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
-        <path fill="url(#b)" d="M0 0h99v20H0z"/>
-    </g>
-    <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
-        <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
-        <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
-        <text x="80" y="14">44%</text>
-    </g>
-</svg>
--- a/docs/_static/js/keyboard_shortcuts.js
+++ b/docs/_static/js/keyboard_shortcuts.js
@ -1,14 +0,0 @@
-document.addEventListener('keydown', function(event) {
-  // command+K or ctrl+K
-  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-
-  // forward slash
-  if (event.key === '/' &&
-      !event.target.matches('input, textarea, select')) {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-});
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@ -17,9 +17,7 @@
        "\n",
        "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
        "\n",
-        "In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n",
-        "\n",
-        "**💡 Quick Start Option:** If you want a simpler and faster way to test out Llama Stack, check out the [quick_start.ipynb](quick_start.ipynb) notebook instead. It provides a streamlined experience for getting up and running in just a few steps.\n"
+        "In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n"
      ]
    },
    {
@ -123,7 +121,7 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
-        "!uv run --with llama-stack llama stack build --distro together --image-type venv \n",
+        "!uv run --with llama-stack llama stack build --template together --image-type venv \n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -165,7 +163,7 @@
        "# use this helper if needed to kill the server \n",
        "def kill_llama_stack_server():\n",
        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
      ]
    },
    {
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@ -17,9 +17,7 @@
        "\n",
        "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
        "\n",
-        "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n",
-        "\n",
-        "**💡 Quick Start Option:** If you want a simpler and faster way to test out Llama Stack, check out the [quick_start.ipynb](quick_start.ipynb) notebook instead. It provides a streamlined experience for getting up and running in just a few steps.\n"
+        "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n"
      ]
    },
    {
@ -233,7 +231,7 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server \n",
-        "!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv \n",
+        "!uv run --with llama-stack llama stack build --template meta-reference-gpu --image-type venv \n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -275,7 +273,7 @@
        "# use this helper if needed to kill the server \n",
        "def kill_llama_stack_server():\n",
        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
      ]
    },
    {
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
@ -17,9 +17,7 @@
          "\n",
          "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
          "\n",
-          "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n",
-          "\n",
-          "**💡 Quick Start Option:** If you want a simpler and faster way to test out Llama Stack, check out the [quick_start.ipynb](quick_start.ipynb) notebook instead. It provides a streamlined experience for getting up and running in just a few steps.\n"
+          "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n"
        ]
      },
      {
@ -223,7 +221,7 @@
          "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
          "\n",
          "# this command installs all the dependencies needed for the llama stack server \n",
-          "!uv run --with llama-stack llama stack build --distro llama_api --image-type venv \n",
+          "!uv run --with llama-stack llama stack build --template llama_api --image-type venv \n",
          "\n",
          "def run_llama_stack_server_background():\n",
          "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -265,7 +263,7 @@
          "# use this helper if needed to kill the server \n",
          "def kill_llama_stack_server():\n",
          "    # Kill any existing llama stack server processes\n",
-          "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+          "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
        ]
      },
      {
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
@ -37,7 +37,7 @@
        "\n",
        "To learn more about torchtune: https://github.com/pytorch/torchtune\n",
        "\n",
-        "We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/experimental-post-training) as the distribution template\n",
+        "We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/templates/experimental-post-training) as the distribution template\n",
        "\n",
        "####  0.0. Prerequisite: Have an OpenAI API key\n",
        "In this showcase, we will use [braintrust](https://www.braintrust.dev/) as scoring provider for eval and it uses OpenAI model as judge model for scoring. So, you need to get an API key from [OpenAI developer platform](https://platform.openai.com/docs/overview).\n",
@ -2864,7 +2864,7 @@
        }
      ],
      "source": [
-        "!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
+        "!llama stack build --template experimental-post-training --image-type venv --image-name __system__"
      ]
    },
    {
@ -3216,19 +3216,19 @@
            "INFO:datasets:Duckdb version 1.1.3 available.\n",
            "INFO:datasets:TensorFlow version 2.18.0 available.\n",
            "INFO:datasets:JAX version 0.4.33 available.\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: basic::equality served by basic\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: basic::subset_of served by basic\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
-            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
-            "INFO:llama_stack.core.stack:\n"
+            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::equality served by basic\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::subset_of served by basic\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
+            "INFO:llama_stack.distribution.stack:\n"
          ]
        },
        {
@ -3448,7 +3448,7 @@
        "\n",
        "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
        "\n",
-        "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+        "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
        "client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
        "_ = client.initialize()"
      ]
--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
@ -38,7 +38,7 @@
   "source": [
    "# NBVAL_SKIP\n",
    "!pip install -U llama-stack\n",
-    "!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
+    "!UV_SYSTEM_PYTHON=1 llama stack build --template fireworks --image-type venv"
   ]
  },
  {
@ -48,7 +48,7 @@
   "outputs": [],
   "source": [
    "from llama_stack_client import LlamaStackClient, Agent\n",
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -57,7 +57,7 @@
      "outputs": [],
      "source": [
        "# NBVAL_SKIP\n",
-        "!UV_SYSTEM_PYTHON=1 llama stack build --distro together --image-type venv"
+        "!UV_SYSTEM_PYTHON=1 llama stack build --template together --image-type venv"
      ]
    },
    {
@ -661,7 +661,7 @@
        "except ImportError:\n",
        "    print(\"Not in Google Colab environment\")\n",
        "\n",
-        "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+        "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
        "\n",
        "client = LlamaStackAsLibraryClient(\"together\")\n",
        "_ = client.initialize()"
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@ -35,7 +35,7 @@
   ],
   "source": [
    "from llama_stack_client import LlamaStackClient, Agent\n",
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@ -92,7 +92,7 @@
   "metadata": {},
   "source": [
    "```bash\n",
-    "LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+    "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
    "```"
   ]
  },
@ -194,7 +194,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client =  LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/1_data_preparation.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/1_data_preparation.ipynb
@ -81,7 +81,7 @@
   "metadata": {},
   "source": [
    "```bash\n",
-    "LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+    "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
    "```"
   ]
  },
--- a/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/3_model_evaluation.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/3_model_evaluation.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/openapi_generator/README.md
+++ b/docs/openapi_generator/README.md
@ -1 +1 @@
-The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack.core/server/endpoints.py` using the `generate.py` utility.
+The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -17,7 +17,7 @@ import fire
 import ruamel.yaml as yaml

 from llama_stack.apis.version import LLAMA_STACK_API_VERSION  # noqa: E402
-from llama_stack.core.stack import LlamaStack  # noqa: E402
+from llama_stack.distribution.stack import LlamaStack  # noqa: E402

 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -12,7 +12,7 @@ from typing import TextIO
 from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
-from llama_stack.core.resolver import api_protocol_map
+from llama_stack.distribution.resolver import api_protocol_map

 from .generator import Generator
 from .options import Options
@ -156,7 +156,7 @@ def _validate_api_delete_method_returns_none(method) -> str | None:
    
    # Allow OpenAI endpoints to return response objects since they follow OpenAI specification
    method_name = getattr(method, '__name__', '')
-    if method_name.__contains__('openai_'):
+    if method_name.startswith('openai_'):
        return None
    
    if return_type is not None and return_type is not type(None):
--- a/docs/quick_start.ipynb
+++ b/docs/quick_start.ipynb
@ -1,366 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "id": "c1e7571c",
-      "metadata": {
-        "id": "c1e7571c"
-      },
-      "source": [
-        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
-        "\n",
-        "# Llama Stack - Building AI Applications\n",
-        "\n",
-        "<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
-        "\n",
-        "Get started with Llama Stack in minutes!\n",
-        "\n",
-        "[Llama Stack](https://github.com/meta-llama/llama-stack) is a stateful service with REST APIs to support the seamless transition of AI applications across different environments. You can build and test using a local server first and deploy to a hosted endpoint for production.\n",
-        "\n",
-        "In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)\n",
-        "as the inference [provider](docs/source/providers/index.md#inference) for a Llama Model.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "4CV1Q19BDMVw",
-      "metadata": {
-        "id": "4CV1Q19BDMVw"
-      },
-      "source": [
-        "## Step 1: Install and setup"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "K4AvfUAJZOeS",
-      "metadata": {
-        "id": "K4AvfUAJZOeS"
-      },
-      "source": [
-        "### 1.1. Install uv and test inference with Ollama\n",
-        "\n",
-        "We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "7a2d7b85",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "%pip install uv llama_stack llama-stack-client\n",
-        "\n",
-        "## If running on Colab:\n",
-        "# !pip install colab-xterm\n",
-        "# %load_ext colabxterm\n",
-        "\n",
-        "!curl https://ollama.ai/install.sh | sh"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "39fa584b",
-      "metadata": {},
-      "source": [
-        "### 1.2. Test inference with Ollama"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "3bf81522",
-      "metadata": {},
-      "source": [
-        "We’ll now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "a7e8e0f1",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "## If running on Colab:\n",
-        "# %xterm\n",
-        "\n",
-        "## To be run in the terminal:\n",
-        "# ollama serve &\n",
-        "# ollama run llama3.2:3b --keepalive 60m"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "f3c5f243",
-      "metadata": {},
-      "source": [
-        "If successful, you should see the model respond to a prompt.\n",
-        "\n",
-        "...\n",
-        "```\n",
-        ">>> hi\n",
-        "Hello! How can I assist you today?\n",
-        "```"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "oDUB7M_qe-Gs",
-      "metadata": {
-        "id": "oDUB7M_qe-Gs"
-      },
-      "source": [
-        "## Step 2: Run the Llama Stack server\n",
-        "\n",
-        "In this showcase, we will start a Llama Stack server that is running locally."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "732eadc6",
-      "metadata": {},
-      "source": [
-        "### 2.1. Setup the Llama Stack Server"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "J2kGed0R5PSf",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "collapsed": true,
-        "id": "J2kGed0R5PSf",
-        "outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
-      },
-      "outputs": [],
-      "source": [
-        "import os \n",
-        "import subprocess\n",
-        "\n",
-        "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
-        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
-        "\n",
-        "# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
-        "!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
-        "\n",
-        "def run_llama_stack_server_background():\n",
-        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
-        "    process = subprocess.Popen(\n",
-        "        f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\",\n",
-        "        shell=True,\n",
-        "        stdout=log_file,\n",
-        "        stderr=log_file,\n",
-        "        text=True\n",
-        "    )\n",
-        "    \n",
-        "    print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
-        "    return process\n",
-        "\n",
-        "def wait_for_server_to_start():\n",
-        "    import requests\n",
-        "    from requests.exceptions import ConnectionError\n",
-        "    import time\n",
-        "    \n",
-        "    url = \"http://0.0.0.0:8321/v1/health\"\n",
-        "    max_retries = 30\n",
-        "    retry_interval = 1\n",
-        "    \n",
-        "    print(\"Waiting for server to start\", end=\"\")\n",
-        "    for _ in range(max_retries):\n",
-        "        try:\n",
-        "            response = requests.get(url)\n",
-        "            if response.status_code == 200:\n",
-        "                print(\"\\nServer is ready!\")\n",
-        "                return True\n",
-        "        except ConnectionError:\n",
-        "            print(\".\", end=\"\", flush=True)\n",
-        "            time.sleep(retry_interval)\n",
-        "            \n",
-        "    print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
-        "    return False\n",
-        "\n",
-        "\n",
-        "# use this helper if needed to kill the server \n",
-        "def kill_llama_stack_server():\n",
-        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "c40e9efd",
-      "metadata": {},
-      "source": [
-        "### 2.2. Start the Llama Stack Server"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 7,
-      "id": "f779283d",
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Starting Llama Stack server with PID: 787100\n",
-            "Waiting for server to start\n",
-            "Server is ready!\n"
-          ]
-        }
-      ],
-      "source": [
-        "server_process = run_llama_stack_server_background()\n",
-        "assert wait_for_server_to_start()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "28477c03",
-      "metadata": {},
-      "source": [
-        "## Step 3: Run the demo"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 8,
-      "id": "7da71011",
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html\n",
-            "prompt> How do you do great work?\n",
-            "\u001b[33minference> \u001b[0m\u001b[33m[k\u001b[0m\u001b[33mnowledge\u001b[0m\u001b[33m_search\u001b[0m\u001b[33m(query\u001b[0m\u001b[33m=\"\u001b[0m\u001b[33mWhat\u001b[0m\u001b[33m is\u001b[0m\u001b[33m the\u001b[0m\u001b[33m key\u001b[0m\u001b[33m to\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m\")]\u001b[0m\u001b[97m\u001b[0m\n",
-            "\u001b[32mtool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}\u001b[0m\n",
-            "\u001b[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n', type='text'), TextContentItem(text=\"Result 1:\\nDocument_id:docum\\nContent:  work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 2:\\nDocument_id:docum\\nContent:  work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 3:\\nDocument_id:docum\\nContent:  work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 4:\\nDocument_id:docum\\nContent:  work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 5:\\nDocument_id:docum\\nContent:  work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text='END of knowledge_search tool results.\\n', type='text'), TextContentItem(text='The above results were retrieved to help answer the user\\'s query: \"What is the key to doing great work\". 
Use them as supporting information only in answering this query.\\n', type='text')]\u001b[0m\n",
-            "\u001b[33minference> \u001b[0m\u001b[33mDoing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m means\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m something\u001b[0m\u001b[33m important\u001b[0m\u001b[33m so\u001b[0m\u001b[33m well\u001b[0m\u001b[33m that\u001b[0m\u001b[33m you\u001b[0m\u001b[33m expand\u001b[0m\u001b[33m people\u001b[0m\u001b[33m's\u001b[0m\u001b[33m ideas\u001b[0m\u001b[33m of\u001b[0m\u001b[33m what\u001b[0m\u001b[33m's\u001b[0m\u001b[33m possible\u001b[0m\u001b[33m.\u001b[0m\u001b[33m However\u001b[0m\u001b[33m,\u001b[0m\u001b[33m there\u001b[0m\u001b[33m's\u001b[0m\u001b[33m no\u001b[0m\u001b[33m threshold\u001b[0m\u001b[33m for\u001b[0m\u001b[33m importance\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m's\u001b[0m\u001b[33m often\u001b[0m\u001b[33m hard\u001b[0m\u001b[33m to\u001b[0m\u001b[33m judge\u001b[0m\u001b[33m at\u001b[0m\u001b[33m the\u001b[0m\u001b[33m time\u001b[0m\u001b[33m anyway\u001b[0m\u001b[33m.\u001b[0m\u001b[33m Great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m is\u001b[0m\u001b[33m a\u001b[0m\u001b[33m matter\u001b[0m\u001b[33m of\u001b[0m\u001b[33m degree\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m can\u001b[0m\u001b[33m be\u001b[0m\u001b[33m difficult\u001b[0m\u001b[33m to\u001b[0m\u001b[33m determine\u001b[0m\u001b[33m whether\u001b[0m\u001b[33m someone\u001b[0m\u001b[33m has\u001b[0m\u001b[33m done\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m until\u001b[0m\u001b[33m after\u001b[0m\u001b[33m the\u001b[0m\u001b[33m fact\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
-            "\u001b[30m\u001b[0m"
-          ]
-        }
-      ],
-      "source": [
-        "from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
-        "\n",
-        "vector_db_id = \"my_demo_vector_db\"\n",
-        "client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
-        "\n",
-        "models = client.models.list()\n",
-        "\n",
-        "# Select the first Ollama LLM and the first Ollama embedding model\n",
-        "model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
-        "embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
-        "embedding_model_id = embedding_model.identifier\n",
-        "embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
-        "\n",
-        "_ = client.vector_dbs.register(\n",
-        "    vector_db_id=vector_db_id,\n",
-        "    embedding_model=embedding_model_id,\n",
-        "    embedding_dimension=embedding_dimension,\n",
-        "    provider_id=\"faiss\",\n",
-        ")\n",
-        "source = \"https://www.paulgraham.com/greatwork.html\"\n",
-        "print(\"rag_tool> Ingesting document:\", source)\n",
-        "document = RAGDocument(\n",
-        "    document_id=\"document_1\",\n",
-        "    content=source,\n",
-        "    mime_type=\"text/html\",\n",
-        "    metadata={},\n",
-        ")\n",
-        "client.tool_runtime.rag_tool.insert(\n",
-        "    documents=[document],\n",
-        "    vector_db_id=vector_db_id,\n",
-        "    chunk_size_in_tokens=50,\n",
-        ")\n",
-        "agent = Agent(\n",
-        "    client,\n",
-        "    model=model_id,\n",
-        "    instructions=\"You are a helpful assistant\",\n",
-        "    tools=[\n",
-        "        {\n",
-        "            \"name\": \"builtin::rag/knowledge_search\",\n",
-        "            \"args\": {\"vector_db_ids\": [vector_db_id]},\n",
-        "        }\n",
-        "    ],\n",
-        ")\n",
-        "\n",
-        "prompt = \"How do you do great work?\"\n",
-        "print(\"prompt>\", prompt)\n",
-        "\n",
-        "response = agent.create_turn(\n",
-        "    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
-        "    session_id=agent.create_session(\"rag_session\"),\n",
-        "    stream=True,\n",
-        ")\n",
-        "\n",
-        "for log in AgentEventLogger().log(response):\n",
-        "    log.print()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "341aaadf",
-      "metadata": {},
-      "source": [
-        "Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "e88e1185",
-      "metadata": {},
-      "source": [
-        "## Next Steps"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "bcb73600",
-      "metadata": {},
-      "source": [
-        "Now you're ready to dive deeper into Llama Stack!\n",
-        "- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
-        "- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
-        "- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
-        "- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
-        "- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
-        "- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
-        "- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "gpuType": "T4",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.10.6"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
-}
--- a/docs/readme.md
+++ b/docs/readme.md
--- a/docs/source/advanced_apis/eval/index.md
+++ b/docs/source/advanced_apis/eval/index.md
@ -1,6 +0,0 @@
-# Eval Providers
-
-This section contains documentation for all available providers for the **eval** API.
-
- [inline::meta-reference](inline_meta-reference.md)
- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/eval/inline_meta-reference.md
+++ b/docs/source/advanced_apis/eval/inline_meta-reference.md
@ -1,25 +0,0 @@
---
-orphan: true
---
-
-# inline::meta-reference
-
-## Description
-
-Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |
-
-## Sample Configuration
-
-```yaml
-kvstore:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
-
-```
-
--- a/docs/source/advanced_apis/eval/remote_nvidia.md
+++ b/docs/source/advanced_apis/eval/remote_nvidia.md
@ -1,23 +0,0 @@
---
-orphan: true
---
-
-# remote::nvidia
-
-## Description
-
-NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
-
-## Sample Configuration
-
-```yaml
-evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-
-```
-
--- a/docs/source/advanced_apis/index.md
+++ b/docs/source/advanced_apis/index.md
@ -1,33 +0,0 @@
-# Advanced APIs
-
-## Post-training
-Fine-tunes a model.
-
-```{toctree}
-:maxdepth: 1
-
-post_training/index
-```
-
-## Eval
-Generates outputs (via Inference or Agents) and performs scoring.
-
-```{toctree}
-:maxdepth: 1
-
-eval/index
-```
-
-```{include} evaluation_concepts.md
-:start-after: ## Evaluation Concepts
-```
-
-## Scoring
-Evaluates the outputs of the system.
-
-```{toctree}
-:maxdepth: 1
-
-scoring/index
-```
-
--- a/docs/source/advanced_apis/post_training/index.md
+++ b/docs/source/advanced_apis/post_training/index.md
@ -1,7 +0,0 @@
-# Post_Training Providers
-
-This section contains documentation for all available providers for the **post_training** API.
-
- [inline::huggingface](inline_huggingface.md)
- [inline::torchtune](inline_torchtune.md)
- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/post_training/inline_huggingface.md
+++ b/docs/source/advanced_apis/post_training/inline_huggingface.md
@ -1,37 +0,0 @@
---
-orphan: true
---
-
-# inline::huggingface
-
-## Description
-
-HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `device` | `<class 'str'>` | No | cuda |  |
-| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No |  |  |
-| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface |  |
-| `chat_template` | `<class 'str'>` | No |  |  |
-| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} |  |
-| `max_seq_length` | `<class 'int'>` | No | 2048 |  |
-| `gradient_checkpointing` | `<class 'bool'>` | No | False |  |
-| `save_total_limit` | `<class 'int'>` | No | 3 |  |
-| `logging_steps` | `<class 'int'>` | No | 10 |  |
-| `warmup_ratio` | `<class 'float'>` | No | 0.1 |  |
-| `weight_decay` | `<class 'float'>` | No | 0.01 |  |
-| `dataloader_num_workers` | `<class 'int'>` | No | 4 |  |
-| `dataloader_pin_memory` | `<class 'bool'>` | No | True |  |
-
-## Sample Configuration
-
-```yaml
-checkpoint_format: huggingface
-distributed_backend: null
-device: cpu
-
-```
-
--- a/docs/source/advanced_apis/post_training/inline_torchtune.md
+++ b/docs/source/advanced_apis/post_training/inline_torchtune.md
@ -1,24 +0,0 @@
---
-orphan: true
---
-
-# inline::torchtune
-
-## Description
-
-TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `torch_seed` | `int \| None` | No |  |  |
-| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta |  |
-
-## Sample Configuration
-
-```yaml
-checkpoint_format: meta
-
-```
-
--- a/docs/source/advanced_apis/post_training/remote_nvidia.md
+++ b/docs/source/advanced_apis/post_training/remote_nvidia.md
@ -1,32 +0,0 @@
---
-orphan: true
---
-
-# remote::nvidia
-
-## Description
-
-NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `api_key` | `str \| None` | No |  | The NVIDIA API key. |
-| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
-| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
-| `customizer_url` | `str \| None` | No |  | Base URL for the NeMo Customizer API |
-| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
-| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
-| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
-
-## Sample Configuration
-
-```yaml
-api_key: ${env.NVIDIA_API_KEY:=}
-dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
-
-```
-
--- a/docs/source/advanced_apis/scoring/index.md
+++ b/docs/source/advanced_apis/scoring/index.md
@ -1,7 +0,0 @@
-# Scoring Providers
-
-This section contains documentation for all available providers for the **scoring** API.
-
- [inline::basic](inline_basic.md)
- [inline::braintrust](inline_braintrust.md)
- [inline::llm-as-judge](inline_llm-as-judge.md)
--- a/docs/source/advanced_apis/scoring/inline_basic.md
+++ b/docs/source/advanced_apis/scoring/inline_basic.md
@ -1,17 +0,0 @@
---
-orphan: true
---
-
-# inline::basic
-
-## Description
-
-Basic scoring provider for simple evaluation metrics and scoring functions.
-
-## Sample Configuration
-
-```yaml
-{}
-
-```
-
--- a/docs/source/advanced_apis/scoring/inline_braintrust.md
+++ b/docs/source/advanced_apis/scoring/inline_braintrust.md
@ -1,23 +0,0 @@
---
-orphan: true
---
-
-# inline::braintrust
-
-## Description
-
-Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `openai_api_key` | `str \| None` | No |  | The OpenAI API Key |
-
-## Sample Configuration
-
-```yaml
-openai_api_key: ${env.OPENAI_API_KEY:=}
-
-```
-
--- a/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
+++ b/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
@ -1,17 +0,0 @@
---
-orphan: true
---
-
-# inline::llm-as-judge
-
-## Description
-
-LLM-as-judge scoring provider that uses language models to evaluate and score responses.
-
-## Sample Configuration
-
-```yaml
-{}
-
-```
-
--- a/docs/source/apis/external.md
+++ b/docs/source/apis/external.md
@ -1,392 +0,0 @@
-# External APIs
-
-Llama Stack supports external APIs that live outside of the main codebase. This allows you to:
- Create and maintain your own APIs independently
- Share APIs with others without contributing to the main codebase
- Keep API-specific code separate from the core Llama Stack code
-
-## Configuration
-
-To enable external APIs, you need to configure the `external_apis_dir` in your Llama Stack configuration. This directory should contain your external API specifications:
-
-```yaml
-external_apis_dir: ~/.llama/apis.d/
-```
-
-## Directory Structure
-
-The external APIs directory should follow this structure:
-
-```
-apis.d/
-  custom_api1.yaml
-  custom_api2.yaml
-```
-
-Each YAML file in these directories defines an API specification.
-
-## API Specification
-
-Here's an example of an external API specification for a weather API:
-
-```yaml
-module: weather
-api_dependencies:
-  - inference
-protocol: WeatherAPI
-name: weather
-pip_packages:
-  - llama-stack-api-weather
-```
-
-### API Specification Fields
-
- `module`: Python module containing the API implementation
- `protocol`: Name of the protocol class for the API
- `name`: Name of the API
- `pip_packages`: List of pip packages to install the API, typically a single package
-
-## Required Implementation
-
-External APIs must expose an `available_providers()` function in their module that returns a list of provider specifications (`ProviderSpec` objects):
-
-```python
-# llama_stack_api_weather/api.py
-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
-
-
-def available_providers() -> list[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.weather,
-            provider_type="inline::darksky",
-            pip_packages=[],
-            module="llama_stack_provider_darksky",
-            config_class="llama_stack_provider_darksky.DarkSkyWeatherImplConfig",
-        ),
-    ]
-```
-
-A Protocol class like so:
-
-```python
-# llama_stack_api_weather/api.py
-from typing import Protocol
-
-from llama_stack.schema_utils import webmethod
-
-
-class WeatherAPI(Protocol):
-    """
-    A protocol for the Weather API.
-    """
-
-    @webmethod(route="/locations", method="GET")
-    async def get_available_locations() -> dict[str, list[str]]:
-        """
-        Get the available locations.
-        """
-        ...
-```
-
-## Example: Custom API
-
-Here's a complete example of creating and using a custom API:
-
-1. First, create the API package:
-
-```bash
-mkdir -p llama-stack-api-weather
-cd llama-stack-api-weather
-mkdir -p src/llama_stack_api_weather
-git init
-uv init
-```
-
-2. Edit `pyproject.toml`:
-
-```toml
-[project]
-name = "llama-stack-api-weather"
-version = "0.1.0"
-description = "Weather API for Llama Stack"
-readme = "README.md"
-requires-python = ">=3.12"
-dependencies = ["llama-stack", "pydantic"]
-
-[build-system]
-requires = ["setuptools"]
-build-backend = "setuptools.build_meta"
-
-[tool.setuptools.packages.find]
-where = ["src"]
-include = ["llama_stack_api_weather", "llama_stack_api_weather.*"]
-```
-
-3. Create the initial files:
-
-```bash
-touch src/llama_stack_api_weather/__init__.py
-touch src/llama_stack_api_weather/api.py
-```
-
-```python
-# llama-stack-api-weather/src/llama_stack_api_weather/__init__.py
-"""Weather API for Llama Stack."""
-
-from .api import WeatherProvider, available_providers
-
-__all__ = ["WeatherProvider", "available_providers"]
-```
-
-4. Create the API implementation:
-
-```python
-# llama-stack-api-weather/src/llama_stack_api_weather/api.py
-from typing import Protocol
-
-from llama_stack.providers.datatypes import (
-    AdapterSpec,
-    Api,
-    ProviderSpec,
-    RemoteProviderSpec,
-)
-from llama_stack.schema_utils import webmethod
-
-
-def available_providers() -> list[ProviderSpec]:
-    return [
-        RemoteProviderSpec(
-            api=Api.weather,
-            provider_type="remote::kaze",
-            config_class="llama_stack_provider_kaze.KazeProviderConfig",
-            adapter=AdapterSpec(
-                adapter_type="kaze",
-                module="llama_stack_provider_kaze",
-                pip_packages=["llama_stack_provider_kaze"],
-                config_class="llama_stack_provider_kaze.KazeProviderConfig",
-            ),
-        ),
-    ]
-
-
-class WeatherProvider(Protocol):
-    """
-    A protocol for the Weather API.
-    """
-
-    @webmethod(route="/weather/locations", method="GET")
-    async def get_available_locations() -> dict[str, list[str]]:
-        """
-        Get the available locations.
-        """
-        ...
-```
-
-5. Create the API specification:
-
-```yaml
-# ~/.llama/apis.d/weather.yaml
-module: llama_stack_api_weather
-name: weather
-pip_packages: ["llama-stack-api-weather"]
-protocol: WeatherProvider
-
-```
-
-6. Install the API package:
-
-```bash
-uv pip install -e .
-```
-
-7. Configure Llama Stack to use external APIs:
-
-```yaml
-version: "2"
-image_name: "llama-stack-api-weather"
-apis:
-  - weather
-providers: {}
-external_apis_dir: ~/.llama/apis.d
-```
-
-The API will now be available at `/v1/weather/locations`.
-
-## Example: custom provider for the weather API
-
-1. Create the provider package:
-
-```bash
-mkdir -p llama-stack-provider-kaze
-cd llama-stack-provider-kaze
-uv init
-```
-
-2. Edit `pyproject.toml`:
-
-```toml
-[project]
-name = "llama-stack-provider-kaze"
-version = "0.1.0"
-description = "Kaze weather provider for Llama Stack"
-readme = "README.md"
-requires-python = ">=3.12"
-dependencies = ["llama-stack", "pydantic", "aiohttp"]
-
-[build-system]
-requires = ["setuptools"]
-build-backend = "setuptools.build_meta"
-
-[tool.setuptools.packages.find]
-where = ["src"]
-include = ["llama_stack_provider_kaze", "llama_stack_provider_kaze.*"]
-```
-
-3. Create the initial files:
-
-```bash
-touch src/llama_stack_provider_kaze/__init__.py
-touch src/llama_stack_provider_kaze/kaze.py
-```
-
-4. Create the provider implementation:
-
-
-Initialization function:
-
-```python
-# llama-stack-provider-kaze/src/llama_stack_provider_kaze/__init__.py
-"""Kaze weather provider for Llama Stack."""
-
-from .config import KazeProviderConfig
-from .kaze import WeatherKazeAdapter
-
-__all__ = ["KazeProviderConfig", "WeatherKazeAdapter"]
-
-
-async def get_adapter_impl(config: KazeProviderConfig, _deps):
-    from .kaze import WeatherKazeAdapter
-
-    impl = WeatherKazeAdapter(config)
-    await impl.initialize()
-    return impl
-```
-
-Configuration:
-
-```python
-# llama-stack-provider-kaze/src/llama_stack_provider_kaze/config.py
-from pydantic import BaseModel, Field
-
-
-class KazeProviderConfig(BaseModel):
-    """Configuration for the Kaze weather provider."""
-
-    base_url: str = Field(
-        "https://api.kaze.io/v1",
-        description="Base URL for the Kaze weather API",
-    )
-```
-
-Main implementation:
-
-```python
-# llama-stack-provider-kaze/src/llama_stack_provider_kaze/kaze.py
-from llama_stack_api_weather.api import WeatherProvider
-
-from .config import KazeProviderConfig
-
-
-class WeatherKazeAdapter(WeatherProvider):
-    """Kaze weather provider implementation."""
-
-    def __init__(
-        self,
-        config: KazeProviderConfig,
-    ) -> None:
-        self.config = config
-
-    async def initialize(self) -> None:
-        pass
-
-    async def get_available_locations(self) -> dict[str, list[str]]:
-        """Get available weather locations."""
-        return {"locations": ["Paris", "Tokyo"]}
-```
-
-5. Create the provider specification:
-
-```yaml
-# ~/.llama/providers.d/remote/weather/kaze.yaml
-adapter:
-  adapter_type: kaze
-  pip_packages: ["llama_stack_provider_kaze"]
-  config_class: llama_stack_provider_kaze.config.KazeProviderConfig
-  module: llama_stack_provider_kaze
-optional_api_dependencies: []
-```
-
-6. Install the provider package:
-
-```bash
-uv pip install -e .
-```
-
-7. Configure Llama Stack to use the provider:
-
-```yaml
-# ~/.llama/run-byoa.yaml
-version: "2"
-image_name: "llama-stack-api-weather"
-apis:
-  - weather
-providers:
-  weather:
-  - provider_id: kaze
-    provider_type: remote::kaze
-    config: {}
-external_apis_dir: ~/.llama/apis.d
-external_providers_dir: ~/.llama/providers.d
-server:
-  port: 8321
-```
-
-8. Run the server:
-
-```bash
-python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
-```
-
-9. Test the API:
-
-```bash
-curl -sSf http://127.0.0.1:8321/v1/weather/locations
-{"locations":["Paris","Tokyo"]}
-```
-
-## Best Practices
-
-1. **Package Naming**: Use a clear and descriptive name for your API package.
-
-2. **Version Management**: Keep your API package versioned and compatible with the Llama Stack version you're using.
-
-3. **Dependencies**: Only include the minimum required dependencies in your API package.
-
-4. **Documentation**: Include clear documentation in your API package about:
-   - Installation requirements
-   - Configuration options
-   - API endpoints and usage
-   - Any limitations or known issues
-
-5. **Testing**: Include tests in your API package to ensure it works correctly with Llama Stack.
-
-## Troubleshooting
-
-If your external API isn't being loaded:
-
-1. Check that the `external_apis_dir` path is correct and accessible.
-2. Verify that the YAML files are properly formatted.
-3. Ensure all required Python packages are installed.
-4. Check the Llama Stack server logs for any error messages - turn on debug logging to get more information using `LLAMA_STACK_LOGGING=all=debug`.
-5. Verify that the API package is installed in your Python environment.
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# AI Application Examples
+# Building AI Applications (Examples)

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@ -11,7 +11,6 @@ Here are some key topics that will help you build effective agents:
 - **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
 - **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
 - **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
- **[Agents vs Responses API](responses_vs_agents)**: Learn the differences between the Agents API and Responses API, and when to use each one.
 - **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
 - **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
 - **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
@ -24,10 +23,8 @@ Here are some key topics that will help you build effective agents:
 rag
 agent
 agent_execution_loop
-responses_vs_agents
 tools
 evals
 telemetry
 safety
-playground/index
 ```
--- a/docs/source/building_applications/responses_vs_agents.md
+++ b/docs/source/building_applications/responses_vs_agents.md
@ -1,179 +0,0 @@
-# Agents vs OpenAI Responses API
-
-Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.
-
-```{note}
-For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
-```
-
-## Overview
-
-### LLS Agents API
-The Agents API is a full-featured, stateful system designed for complex, multi-turn conversations. It maintains conversation state through persistent sessions identified by a unique session ID. The API supports comprehensive agent lifecycle management, detailed execution tracking, and rich metadata about each interaction through a structured session/turn/step hierarchy. The API can orchestrate multiple tool calls within a single turn.
-
-### OpenAI Responses API
-The OpenAI Responses API is a full-featured, stateful system designed for complex, multi-turn conversations, with direct compatibility with OpenAI's conversational patterns enhanced by Llama Stack's tool calling capabilities. It maintains conversation state by chaining responses through a `previous_response_id`, allowing interactions to branch or continue from any prior point. Each response can perform multiple tool calls within a single turn.
-
-### Key Differences
-The LLS Agents API uses the Chat Completions API on the backend for inference as it's the industry standard for building AI applications and most LLM providers are compatible with this API. For a detailed comparison between Responses and Chat Completions, see [OpenAI's documentation](https://platform.openai.com/docs/guides/responses-vs-chat-completions).
-
-Additionally, Agents let you specify input/output shields whereas Responses do not (though support is planned). Agents use a linear conversation model referenced by a single session ID. Responses, on the other hand, support branching, where each response can serve as a fork point, and conversations are tracked by the latest response ID. Responses also let you dynamically choose the model, vector store, files, MCP servers, and more on each inference call, enabling more complex workflows. Agents require a static configuration for these components at the start of the session.
-
-Today the Agents and Responses APIs can be used independently depending on the use case. But, it is also productive to treat the APIs as complementary. It is not currently supported, but it is planned for the LLS Agents API to alternatively use the Responses API as its backend instead of the default Chat Completions API, i.e., enabling a combination of the safety features of Agents with the dynamic configuration and branching capabilities of Responses.
-
-| Feature | LLS Agents API | OpenAI Responses API |
-|---------|------------|---------------------|
-| **Conversation Management** | Linear persistent sessions | Can branch from any previous response ID |
-| **Input/Output Safety Shields** | Supported | Not yet supported |
-| **Per-call Flexibility** | Static per-session configuration | Dynamic per-call configuration |
-
-## Use Case Example: Research with Multiple Search Methods
-
-Let's compare how both APIs handle a research task where we need to:
-1. Search for current information and examples
-2. Access different information sources dynamically
-3. Continue the conversation based on search results
-
-### Agents API: Session-based configuration with safety shields
-
-```python
-# Create agent with static session configuration
-agent = Agent(
-    client,
-    model="Llama3.2-3B-Instruct",
-    instructions="You are a helpful coding assistant",
-    tools=[
-        {
-            "name": "builtin::rag/knowledge_search",
-            "args": {"vector_db_ids": ["code_docs"]},
-        },
-        "builtin::code_interpreter",
-    ],
-    input_shields=["llama_guard"],
-    output_shields=["llama_guard"],
-)
-
-session_id = agent.create_session("code_session")
-
-# First turn: Search and execute
-response1 = agent.create_turn(
-    messages=[
-        {
-            "role": "user",
-            "content": "Find examples of sorting algorithms and run a bubble sort on [3,1,4,1,5]",
-        },
-    ],
-    session_id=session_id,
-)
-
-# Continue conversation in same session
-response2 = agent.create_turn(
-    messages=[
-        {
-            "role": "user",
-            "content": "Now optimize that code and test it with a larger dataset",
-        },
-    ],
-    session_id=session_id,  # Same session, maintains full context
-)
-
-# Agents API benefits:
-# ✅ Safety shields protect against malicious code execution
-# ✅ Session maintains context between code executions
-# ✅ Consistent tool configuration throughout conversation
-print(f"First result: {response1.output_message.content}")
-print(f"Optimization: {response2.output_message.content}")
-```
-
-### Responses API: Dynamic per-call configuration with branching
-
-```python
-# First response: Use web search for latest algorithms
-response1 = client.responses.create(
-    model="Llama3.2-3B-Instruct",
-    input="Search for the latest efficient sorting algorithms and their performance comparisons",
-    tools=[
-        {
-            "type": "web_search",
-        },
-    ],  # Web search for current information
-)
-
-# Continue conversation: Switch to file search for local docs
-response2 = client.responses.create(
-    model="Llama3.2-1B-Instruct",  # Switch to faster model
-    input="Now search my uploaded files for existing sorting implementations",
-    tools=[
-        {  # Using Responses API built-in tools
-            "type": "file_search",
-            "vector_store_ids": ["vs_abc123"],  # Vector store containing uploaded files
-        },
-    ],
-    previous_response_id=response1.id,
-)
-
-# Branch from first response: Try different search approach
-response3 = client.responses.create(
-    model="Llama3.2-3B-Instruct",
-    input="Instead, search the web for Python-specific sorting best practices",
-    tools=[{"type": "web_search"}],  # Different web search query
-    previous_response_id=response1.id,  # Branch from response1
-)
-
-# Responses API benefits:
-# ✅ Dynamic tool switching (web search ↔ file search per call)
-# ✅ OpenAI-compatible tool patterns (web_search, file_search)
-# ✅ Branch conversations to explore different information sources
-# ✅ Model flexibility per search type
-print(f"Web search results: {response1.output_message.content}")
-print(f"File search results: {response2.output_message.content}")
-print(f"Alternative web search: {response3.output_message.content}")
-```
-
-Both APIs demonstrate distinct strengths that make them valuable on their own for different scenarios. The Agents API excels in providing structured, safety-conscious workflows with persistent session management, while the Responses API offers flexibility through dynamic configuration and OpenAI compatible tool patterns.
-
-## Use Case Examples
-
-### 1. **Research and Analysis with Safety Controls**
-**Best Choice: Agents API**
-
-**Scenario:** You're building a research assistant for a financial institution that needs to analyze market data, execute code to process financial models, and search through internal compliance documents. The system must ensure all interactions are logged for regulatory compliance and protected by safety shields to prevent malicious code execution or data leaks.
-
-**Why Agents API?** The Agents API provides persistent session management for iterative research workflows, built-in safety shields to protect against malicious code in financial models, and structured execution logs (session/turn/step) required for regulatory compliance. The static tool configuration ensures consistent access to your knowledge base and code interpreter throughout the entire research session.
-
-### 2. **Dynamic Information Gathering with Branching Exploration**
-**Best Choice: Responses API**
-
-**Scenario:** You're building a competitive intelligence tool that helps businesses research market trends. Users need to dynamically switch between web search for current market data and file search through uploaded industry reports. They also want to branch conversations to explore different market segments simultaneously and experiment with different models for various analysis types.
-
-**Why Responses API?** The Responses API's branching capability lets users explore multiple market segments from any research point. Dynamic per-call configuration allows switching between web search and file search as needed, while experimenting with different models (faster models for quick searches, more powerful models for deep analysis). The OpenAI-compatible tool patterns make integration straightforward.
-
-### 3. **OpenAI Migration with Advanced Tool Capabilities**
-**Best Choice: Responses API**
-
-**Scenario:** You have an existing application built with OpenAI's Assistants API that uses file search and web search capabilities. You want to migrate to Llama Stack for better performance and cost control while maintaining the same tool calling patterns and adding new capabilities like dynamic vector store selection.
-
-**Why Responses API?** The Responses API provides full OpenAI tool compatibility (`web_search`, `file_search`) with identical syntax, making migration seamless. The dynamic per-call configuration enables advanced features like switching vector stores per query or changing models based on query complexity - capabilities that extend beyond basic OpenAI functionality while maintaining compatibility.
-
-### 4. **Educational Programming Tutor**
-**Best Choice: Agents API**
-
-**Scenario:** You're building a programming tutor that maintains student context across multiple sessions, safely executes code exercises, and tracks learning progress with audit trails for educators.
-
-**Why Agents API?** Persistent sessions remember student progress across multiple interactions, safety shields prevent malicious code execution while allowing legitimate programming exercises, and structured execution logs help educators track learning patterns.
-
-### 5. **Advanced Software Debugging Assistant**
-**Best Choice: Agents API with Responses Backend**
-
-**Scenario:** You're building a debugging assistant that helps developers troubleshoot complex issues. It needs to maintain context throughout a debugging session, safely execute diagnostic code, switch between different analysis tools dynamically, and branch conversations to explore multiple potential causes simultaneously.
-
-**Why Agents + Responses?** The Agent provides safety shields for code execution and session management for the overall debugging workflow. The underlying Responses API enables dynamic model selection and flexible tool configuration per query, while branching lets you explore different theories (memory leak vs. concurrency issue) from the same debugging point and compare results.
-
-> **Note:** The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default.
-
-## For More Information
-
- **LLS Agents API**: For detailed information on creating and managing agents, see the [Agents documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html)
- **OpenAI Responses API**: For information on using the OpenAI-compatible responses API, see the [OpenAI API documentation](https://platform.openai.com/docs/api-reference/responses)
- **Chat Completions API**: For the default backend API used by Agents, see the [Chat Completions providers documentation](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions)
- **Agent Execution Loop**: For understanding how agents process turns and steps in their execution, see the [Agent Execution Loop documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent_execution_loop.html)
--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@ -24,106 +24,37 @@ structured_log_event = SpanStartPayload(name="my_span", parent_span_id="parent_s
 - **Spans**: Represent operations with timing and hierarchical relationships
 - **Traces**: Collection of related spans forming a complete request flow

-### Metrics
-
-Llama Stack automatically generates metrics during inference operations. These metrics are aggregated at the **inference request level** and provide insights into token usage and model performance.
-
-#### Available Metrics
-
-The following metrics are automatically generated for each inference request:
-
-| Metric Name | Type | Unit | Description | Labels |
-|-------------|------|------|-------------|--------|
-| `llama_stack_prompt_tokens_total` | Counter | `tokens` | Number of tokens in the input prompt | `model_id`, `provider_id` |
-| `llama_stack_completion_tokens_total` | Counter | `tokens` | Number of tokens in the generated response | `model_id`, `provider_id` |
-| `llama_stack_tokens_total` | Counter | `tokens` | Total tokens used (prompt + completion) | `model_id`, `provider_id` |
-
-#### Metric Generation Flow
-
-1. **Token Counting**: During inference operations (chat completion, completion, etc.), the system counts tokens in both input prompts and generated responses
-2. **Metric Construction**: For each request, `MetricEvent` objects are created with the token counts
-3. **Telemetry Logging**: Metrics are sent to the configured telemetry sinks
-4. **OpenTelemetry Export**: When OpenTelemetry is enabled, metrics are exposed as standard OpenTelemetry counters
-
-#### Metric Aggregation Level
-
-All metrics are generated and aggregated at the **inference request level**. This means:
-
- Each individual inference request generates its own set of metrics
- Metrics are not pre-aggregated across multiple requests
- Aggregation (sums, averages, etc.) can be performed by your observability tools (Prometheus, Grafana, etc.)
- Each metric includes labels for `model_id` and `provider_id` to enable filtering and grouping
-
-#### Example Metric Event
-
-```python
-MetricEvent(
-    trace_id="1234567890abcdef",
-    span_id="abcdef1234567890",
-    metric="total_tokens",
-    value=150,
-    timestamp=1703123456.789,
-    unit="tokens",
-    attributes={"model_id": "meta-llama/Llama-3.2-3B-Instruct", "provider_id": "tgi"},
-)
-```
-
-#### Querying Metrics
-
-When using the OpenTelemetry sink, metrics are exposed in standard OpenTelemetry format and can be queried through:
-
- **Prometheus**: Scrape metrics from the OpenTelemetry Collector's metrics endpoint
- **Grafana**: Create dashboards using Prometheus as a data source
- **OpenTelemetry Collector**: Forward metrics to other observability systems
-
-Example Prometheus queries:
-```promql
-# Total tokens used across all models
-sum(llama_stack_tokens_total)
-
-# Tokens per model
-sum by (model_id) (llama_stack_tokens_total)
-
-# Average tokens per request
-rate(llama_stack_tokens_total[5m])
-```
-
 ### Sinks
- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a tool like Jaeger and collecting metrics for Prometheus.
+- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a tool like Jaeger.
 - **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API.
 - **Console**: Print events to the console.

 ### Providers

 #### Meta-Reference Provider
-Currently, only the meta-reference provider is implemented. It can be configured to send events to multiple sink types:
-1) OpenTelemetry Collector (traces and metrics)
-2) SQLite (traces only)
-3) Console (all events)
+Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types:
+1) OpenTelemetry Collector
+2) SQLite
+3) Console

 #### Configuration

-Here's an example that sends telemetry signals to all sink types. Your configuration might use only one or a subset.
-
+Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one.
 ```yaml
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
-      service_name: "llama-stack-service"
      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
-      otel_exporter_otlp_endpoint: "http://localhost:4318"
+      otel_trace_endpoint: "http://localhost:4318/v1/traces"
+      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
      sqlite_db_path: "/path/to/telemetry.db"
 ```

-**Environment Variables:**
- `OTEL_EXPORTER_OTLP_ENDPOINT`: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
- `OTEL_SERVICE_NAME`: Service name for telemetry (default: empty string)
- `TELEMETRY_SINKS`: Comma-separated list of sinks (default: `console,sqlite`)
-
 ### Jaeger to visualize traces

-The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
+The `otel` sink works with any service compatible with the OpenTelemetry collector; traces and metrics have two separate endpoints.
+Let's use Jaeger to visualize this data.

 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:

@ -137,7 +68,4 @@ Once the Jaeger instance is running, you can visualize traces by navigating to h

 ### Querying Traces Stored in SQLite

-The `sqlite` sink allows you to query traces without an external system. Here are some example
-queries. Refer to the notebook at [Llama Stack Building AI
-Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for
-more examples on how to query traces and spans.
+The `sqlite` sink allows you to query traces without an external system. Here are some example queries. Refer to the notebook at [Llama Stack Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on how to query traces and spans.
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -76,9 +76,7 @@ Features:
 - Context retrieval with token limits


-```{note}
-By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
-```
+> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.

 ## Model Context Protocol (MCP)

--- a/docs/source/concepts/apis.md
+++ b/docs/source/concepts/apis.md
@ -10,12 +10,9 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring
 - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
 - **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API.

 We are working on adding a few more APIs to complete the application lifecycle. These will include:
 - **Batch Inference**: run inference on a dataset of inputs
 - **Batch Agents**: run agents on a dataset of inputs
+- **Post Training**: fine-tune a Llama model
 - **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference
--- a/docs/source/advanced_apis/evaluation_concepts.md
+++ b/docs/source/advanced_apis/evaluation_concepts.md
@ -43,7 +43,7 @@ We have built-in functionality to run the supported open-benckmarks using llama-

 Spin up llama stack server with 'open-benchmark' template
 ```
-llama stack run llama_stack/distributions/open-benchmark/run.yaml
+llama stack run llama_stack/templates/open-benchmark/run.yaml

 ```

--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -2,10 +2,6 @@

 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.

-```{include} architecture.md
-:start-after: ## Llama Stack architecture
-```
-
 ```{include} apis.md
 :start-after: ## APIs
 ```
@ -14,10 +10,14 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
 :start-after: ## API Providers
 ```

+```{include} resources.md
+:start-after: ## Resources
+```
+
 ```{include} distributions.md
 :start-after: ## Distributions
 ```

-```{include} resources.md
-:start-after: ## Resources
+```{include} evaluation_concepts.md
+:start-after: ## Evaluation Concepts
 ```
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -52,18 +52,7 @@ extensions = [
    "sphinxcontrib.redoc",
    "sphinxcontrib.mermaid",
    "sphinxcontrib.video",
-    "sphinx_reredirects"
 ]
-
-redirects = {
-    "providers/post_training/index": "../../advanced_apis/post_training/index.html",
-    "providers/eval/index": "../../advanced_apis/eval/index.html",
-    "providers/scoring/index": "../../advanced_apis/scoring/index.html",
-    "playground/index": "../../building_applications/playground/index.html",
-    "openai/index": "../../providers/index.html#openai-api-compatibility",
-    "introduction/index": "../concepts/index.html#llama-stack-architecture"
-}
-
 myst_enable_extensions = ["colon_fence"]

 html_theme = "sphinx_rtd_theme"
@ -131,7 +120,6 @@ html_static_path = ["../_static"]
 def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
-    app.add_js_file("js/keyboard_shortcuts.js")

    def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
        url = f"https://hub.docker.com/r/llamastack/{text}"
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -2,38 +2,13 @@
 ```{include} ../../../CONTRIBUTING.md
 ```

-## Adding a New Provider
+See the [Adding a New API Provider](new_api_provider.md) guide, which describes how to add new API providers to the Stack.
+

-See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector database to Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

 ```{toctree}
 :maxdepth: 1
 :hidden:

 new_api_provider
-new_vector_database
-```
-
-## Testing
-
-
-```{include} ../../../tests/README.md
-```
-
-## Advanced Topics
-
-For developers who need deeper understanding of the testing system internals:
-
-```{toctree}
-:maxdepth: 1
-
-testing/record-replay
-```
-
-### Benchmarking
-
-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
 ```
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.


 Here are some example PRs to help you get started:
@ -14,45 +14,10 @@ Here are some example PRs to help you get started:
   - [Nvidia Inference Implementation](https://github.com/meta-llama/llama-stack/pull/355)
   - [Model context protocol Tool Runtime](https://github.com/meta-llama/llama-stack/pull/665)

-## Inference Provider Patterns
-
-When implementing Inference providers for OpenAI-compatible APIs, Llama Stack provides several mixin classes to simplify development and ensure consistent behavior across providers.
-
-### OpenAIMixin
-
-The `OpenAIMixin` class provides direct OpenAI API functionality for providers that work with OpenAI-compatible endpoints. It includes:
-
-#### Direct API Methods
- **`openai_completion()`**: Legacy text completion API with full parameter support
- **`openai_chat_completion()`**: Chat completion API supporting streaming, tools, and function calling
- **`openai_embeddings()`**: Text embeddings generation with customizable encoding and dimensions
-
-#### Model Management
- **`check_model_availability()`**: Queries the API endpoint to verify if a model exists and is accessible
-
-#### Client Management
- **`client` property**: Automatically creates and configures AsyncOpenAI client instances using your provider's credentials
-
-#### Required Implementation
-
-To use `OpenAIMixin`, your provider must implement these abstract methods:
-
-```python
-@abstractmethod
-def get_api_key(self) -> str:
-    """Return the API key for authentication"""
-    pass
-
-
-@abstractmethod
-def get_base_url(self) -> str:
-    """Return the OpenAI-compatible API base URL"""
-    pass
-```

 ## Testing the Provider

-Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
+Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.

 ### 1. Integration Testing

--- a/docs/source/contributing/new_vector_database.md
+++ b/docs/source/contributing/new_vector_database.md
@ -1,75 +0,0 @@
-# Adding a New Vector Database
-
-This guide will walk you through the process of adding a new vector database to Llama Stack.
-
-> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
-
-Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
-search but can support keyword and hybrid search. Additionally, vector databases can also support operations like
-filtering, sorting, and aggregating vectors.
-
-## Steps to Add a New Vector Database Provider
-1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
-   - Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
-2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
-   - Implement methods for vector storage, retrieval, search, and any additional features your database supports.
-     - You will need to implement the following methods for `YourVectorIndex`:
-        - `YourVectorIndex.create()`
-        - `YourVectorIndex.initialize()`
-        - `YourVectorIndex.add_chunks()`
-        - `YourVectorIndex.delete_chunk()`
-        - `YourVectorIndex.query_vector()`
-        - `YourVectorIndex.query_keyword()`
-        - `YourVectorIndex.query_hybrid()`
-     - You will need to implement the following methods for `YourVectorIOAdapter`:
-        - `YourVectorIOAdapter.initialize()`
-        - `YourVectorIOAdapter.shutdown()`
-        - `YourVectorIOAdapter.list_vector_dbs()`
-        - `YourVectorIOAdapter.register_vector_db()`
-        - `YourVectorIOAdapter.unregister_vector_db()`
-        - `YourVectorIOAdapter.insert_chunks()`
-        - `YourVectorIOAdapter.query_chunks()`
-        - `YourVectorIOAdapter.delete_chunks()`
-3. **Add to Registry**: Register your provider in the appropriate registry file.
-   - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
-```python
-from llama_stack.providers.registry.specs import InlineProviderSpec
-from llama_stack.providers.registry.api import Api
-
-InlineProviderSpec(
-    api=Api.vector_io,
-    provider_type="inline::milvus",
-    pip_packages=["pymilvus>=2.4.10"],
-    module="llama_stack.providers.inline.vector_io.milvus",
-    config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
-    api_dependencies=[Api.inference],
-    optional_api_dependencies=[Api.files],
-    description="",
-),
-```
-4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
-   - Unit Tests
-     - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
-       1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`.
-       2. Update the `vector_provider` fixture to include your provider if it is an inline provider.
-       3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
-       4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
-       5. Add your provider to the `vector_io_providers` fixture dictionary.
-         - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
-   - Integration Tests
-     - Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
-     - The two sets of integration tests are:
-       - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
-       - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
-        - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
-     - Running the tests in the GitHub CI
-       - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
-        - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
-   - Updating the pyproject.toml
-     - If you are adding tests for the `inline` provider you will have to update the `unit` group.
-       - `uv add new_pip_package --group unit`
-     - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
-       - `uv add new_pip_package --group test`
-5. **Update Documentation**: Please update the documentation for end users
-   - Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
-   - Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
--- a/docs/source/contributing/testing.md
+++ b/docs/source/contributing/testing.md
@ -0,0 +1,6 @@
+# Testing Llama Stack
+
+Tests are of three different kinds:
+- Unit tests
+- Provider focused integration tests
+- Client SDK tests
--- a/docs/source/contributing/testing/record-replay.md
+++ b/docs/source/contributing/testing/record-replay.md
@ -1,234 +0,0 @@
-# Record-Replay System
-
-Understanding how Llama Stack captures and replays API interactions for testing.
-
-## Overview
-
-The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
-
-The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
-
-## How It Works
-
-### Request Hashing
-
-Every API request gets converted to a deterministic hash for lookup:
-
-```python
-def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
-    normalized = {
-        "method": method.upper(),
-        "endpoint": urlparse(url).path,  # Just the path, not full URL
-        "body": body,  # Request parameters
-    }
-    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
-```
-
-**Key insight:** The hashing is intentionally precise. Different whitespace or float precision produces different hashes; only dictionary key order is normalized (via `sort_keys=True`). This prevents subtle bugs from false cache hits.
-
-```python
-# These produce DIFFERENT hashes:
-{"content": "Hello world"}
-{"content": "Hello   world\n"}
-{"temperature": 0.7}
-{"temperature": 0.7000001}
-```
-
-### Client Interception
-
-The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
-
-### Storage Architecture
-
-Recordings use a two-tier storage system optimized for both speed and debuggability:
-
-```
-recordings/
-├── index.sqlite          # Fast lookup by request hash
-└── responses/
-    ├── abc123def456.json  # Individual response files
-    └── def789ghi012.json
-```
-
-**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
-
-**JSON files** store complete request/response pairs in human-readable format for debugging.
-
-## Recording Modes
-
-### LIVE Mode
-
-Direct API calls with no recording or replay:
-
-```python
-with inference_recording(mode=InferenceMode.LIVE):
-    response = await client.chat.completions.create(...)
-```
-
-Use for initial development and debugging against real APIs.
-
-### RECORD Mode
-
-Captures API interactions while passing through real responses:
-
-```python
-with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
-    response = await client.chat.completions.create(...)
-    # Real API call made, response captured AND returned
-```
-
-The recording process:
-1. Request intercepted and hashed
-2. Real API call executed
-3. Response captured and serialized
-4. Recording stored to disk
-5. Original response returned to caller
-
-### REPLAY Mode
-
-Returns stored responses instead of making API calls:
-
-```python
-with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
-    response = await client.chat.completions.create(...)
-    # No API call made, cached response returned instantly
-```
-
-The replay process:
-1. Request intercepted and hashed
-2. Hash looked up in SQLite index
-3. Response loaded from JSON file
-4. Response deserialized and returned
-5. Error if no recording found
-
-## Streaming Support
-
-Streaming APIs present a unique challenge: how do you capture an async generator?
-
-### The Problem
-
-```python
-# How do you record this?
-async for chunk in client.chat.completions.create(stream=True):
-    process(chunk)
-```
-
-### The Solution
-
-The system captures all chunks immediately before yielding any:
-
-```python
-async def handle_streaming_record(response):
-    # Capture complete stream first
-    chunks = []
-    async for chunk in response:
-        chunks.append(chunk)
-
-    # Store complete recording
-    storage.store_recording(
-        request_hash, request_data, {"body": chunks, "is_streaming": True}
-    )
-
-    # Return generator that replays captured chunks
-    async def replay_stream():
-        for chunk in chunks:
-            yield chunk
-
-    return replay_stream()
-```
-
-This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time
-
-## Serialization
-
-API responses contain complex Pydantic objects that need careful serialization:
-
-```python
-def _serialize_response(response):
-    if hasattr(response, "model_dump"):
-        # Preserve type information for proper deserialization
-        return {
-            "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
-            "__data__": response.model_dump(mode="json"),
-        }
-    return response
-```
-
-This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
-
-## Environment Integration
-
-### Environment Variables
-
-Control recording behavior globally:
-
-```bash
-export LLAMA_STACK_TEST_INFERENCE_MODE=replay
-export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
-pytest tests/integration/
-```
-
-### Pytest Integration
-
-The system integrates automatically based on environment variables, requiring no changes to test code.
-
-## Debugging Recordings
-
-### Inspecting Storage
-
-```bash
-# See what's recorded
-sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
-
-# View specific response
-cat recordings/responses/abc123def456.json | jq '.response.body'
-
-# Find recordings by endpoint
-sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
-```
-
-### Common Issues
-
-**Hash mismatches:** Request parameters changed slightly between record and replay
-```bash
-# Compare request details
-cat recordings/responses/abc123.json | jq '.request'
-```
-
-**Serialization errors:** Response types changed between versions
-```bash
-# Re-record with updated types
-rm recordings/responses/failing_hash.json
-LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
-```
-
-**Missing recordings:** New test or changed parameters
-```bash
-# Record the missing interaction
-LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
-```
-
-## Design Decisions
-
-### Why Not Mocks?
-
-Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens
-
-### Why Precise Hashing?
-
-Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
-
-### Why JSON + SQLite?
-
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases
-
-This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
--- a/docs/source/deploying/index.md
+++ b/docs/source/deploying/index.md
@ -1,4 +0,0 @@
-# Deployment Examples
-
-```{include} kubernetes_deployment.md
-```
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -47,37 +47,31 @@ pip install -e .
 ```
 Use the CLI to build your distribution.
 The main points to consider are:
-1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
+1. **Image Type** - Do you want a Conda / venv environment or a Container (eg. Docker)
 2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
 3. **Config** - Do you want to use a pre-existing config file to build your distribution?

 ```
 llama stack build -h
-usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
-                         [--run] [--providers PROVIDERS]
+usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]

 Build a Llama stack container

 options:
  -h, --help            show this help message and exit
-  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
-                        enter information interactively (default: None)
-  --template TEMPLATE   (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
-                        None)
-  --distro DISTRIBUTION, --distribution DISTRIBUTION
-                        Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
-  --list-distros, --list-distributions
-                        Show the available distributions for building a Llama Stack distribution (default: False)
-  --image-type {container,venv}
-                        Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
+  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
+                        be prompted to enter information interactively (default: None)
+  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
+  --list-templates      Show the available templates for building a Llama Stack distribution (default: False)
+  --image-type {conda,container,venv}
+                        Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
+                        conda)
  --image-name IMAGE_NAME
-                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
-                        None)
+                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
+                        found. (default: None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
-  --providers PROVIDERS
-                        Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
-                        API. (default: None)
+
 ```

 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
@ -95,52 +89,31 @@ llama stack build --list-templates
 ------------------------------+-----------------------------------------------------------------------------+
 | Template Name                | Description                                                                 |
 +------------------------------+-----------------------------------------------------------------------------+
-| watsonx                      | Use watsonx for running LLM inference                                       |
-+------------------------------+-----------------------------------------------------------------------------+
-| vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
+| hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
 +------------------------------+-----------------------------------------------------------------------------+
 | together                     | Use Together.AI for running LLM inference                                   |
 +------------------------------+-----------------------------------------------------------------------------+
-| tgi                          | Use (an external) TGI server for running LLM inference                      |
-+------------------------------+-----------------------------------------------------------------------------+
-| starter                      | Quick start template for running Llama Stack with several popular providers |
-+------------------------------+-----------------------------------------------------------------------------+
-| sambanova                    | Use SambaNova for running LLM inference and safety                          |
-+------------------------------+-----------------------------------------------------------------------------+
-| remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
-+------------------------------+-----------------------------------------------------------------------------+
-| postgres-demo                | Quick start template for running Llama Stack with several popular providers |
-+------------------------------+-----------------------------------------------------------------------------+
-| passthrough                  | Use Passthrough hosted llama-stack endpoint for LLM inference               |
-+------------------------------+-----------------------------------------------------------------------------+
-| open-benchmark               | Distribution for running open benchmarks                                    |
-+------------------------------+-----------------------------------------------------------------------------+
-| ollama                       | Use (an external) Ollama server for running LLM inference                   |
-+------------------------------+-----------------------------------------------------------------------------+
-| nvidia                       | Use NVIDIA NIM for running LLM inference, evaluation and safety             |
-+------------------------------+-----------------------------------------------------------------------------+
-| meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
-+------------------------------+-----------------------------------------------------------------------------+
-| llama_api                    | Distribution for running e2e tests in CI                                    |
-+------------------------------+-----------------------------------------------------------------------------+
-| hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-+------------------------------+-----------------------------------------------------------------------------+
-| hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-+------------------------------+-----------------------------------------------------------------------------+
-| groq                         | Use Groq for running LLM inference                                          |
-+------------------------------+-----------------------------------------------------------------------------+
-| fireworks                    | Use Fireworks.AI for running LLM inference                                  |
+| vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
 +------------------------------+-----------------------------------------------------------------------------+
 | experimental-post-training   | Experimental template for post training                                     |
 +------------------------------+-----------------------------------------------------------------------------+
-| dell                         | Dell's distribution of Llama Stack. TGI inference via Dell's custom         |
-|                              | container                                                                   |
+| remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
 +------------------------------+-----------------------------------------------------------------------------+
-| ci-tests                     | Distribution for running e2e tests in CI                                    |
+| fireworks                    | Use Fireworks.AI for running LLM inference                                  |
+------------------------------+-----------------------------------------------------------------------------+
+| tgi                          | Use (an external) TGI server for running LLM inference                      |
+------------------------------+-----------------------------------------------------------------------------+
+| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
+------------------------------+-----------------------------------------------------------------------------+
+| meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
+------------------------------+-----------------------------------------------------------------------------+
+| nvidia                       | Use NVIDIA NIM for running LLM inference                                    |
 +------------------------------+-----------------------------------------------------------------------------+
 | cerebras                     | Use Cerebras for running LLM inference                                      |
 +------------------------------+-----------------------------------------------------------------------------+
-| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
+| ollama                       | Use (an external) Ollama server for running LLM inference                   |
+------------------------------+-----------------------------------------------------------------------------+
+| hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
 +------------------------------+-----------------------------------------------------------------------------+
 ```

@ -148,13 +121,9 @@ You may then pick a template to build your distribution with providers fitted to

 For example, to build a distribution with TGI as the inference provider, you can run:
 ```
-$ llama stack build --distro starter
+$ llama stack build --template tgi
 ...
-You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
-```
-
-```{tip}
-The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
+You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
 ```
 :::
 :::{tab-item} Building from Scratch
@ -166,7 +135,7 @@ It would be best to start with a template and understand the structure of the co
 llama stack build

 > Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
-> Enter the image type you want your Llama Stack to be built as (container or venv): venv
+> Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda

 Llama Stack is composed of several APIs working together. Let's select
 the provider types (implementations) you want to use for these APIs.
@ -191,10 +160,29 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
 :::{tab-item} Building from a pre-existing build config file
 - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.

- The contents of the config file will be similar to those in `llama_stack/distributions/*build.yaml`.
+- The contents of the config file will be similar to those in `llama_stack/templates/*build.yaml`.

 ```
-llama stack build --config llama_stack/distributions/starter/build.yaml
+$ cat llama_stack/templates/ollama/build.yaml
+
+name: ollama
+distribution_spec:
+  description: Like local, but use ollama for running LLM inference
+  providers:
+    inference: remote::ollama
+    memory: inline::faiss
+    safety: inline::llama-guard
+    agents: inline::meta-reference
+    telemetry: inline::meta-reference
+image_name: ollama
+image_type: conda
+
+# If some providers are external, you can specify the path to the implementation
+external_providers_dir: ~/.llama/providers.d
+```
+
+```
+llama stack build --config llama_stack/templates/ollama/build.yaml
 ```
 :::

@ -260,15 +248,14 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm
 To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.

 ```
-llama stack build --distro starter --image-type container
+llama stack build --template ollama --image-type container
 ```

 ```
-$ llama stack build --distro starter --image-type container
+$ llama stack build --template ollama --image-type container
 ...
 Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
 ...
-```

 You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
 ```
@ -318,27 +305,29 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con

 ```
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
-                       [--image-type {venv}] [--enable-ui]
-                       [config | template]
+usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
+                       [--image-type {conda,container,venv}]
+                       config

 Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.

 positional arguments:
-  config | template     Path to config file to use for the run or name of known template (`llama stack list` for a list). (default: None)
+  config                Path to config file to use for the run

 options:
  -h, --help            show this help message and exit
  --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
  --image-name IMAGE_NAME
                        Name of the image to run. Defaults to the current environment (default: None)
-  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
-  --image-type {venv}
-                        Image Type used during the build. This should be venv. (default: None)
-  --enable-ui           Start the UI server (default: False)
-```
+  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
+  --tls-keyfile TLS_KEYFILE
+                        Path to TLS key file for HTTPS (default: None)
+  --tls-certfile TLS_CERTFILE
+                        Path to TLS certificate file for HTTPS (default: None)
+  --image-type {conda,container,venv}
+                        Image Type used during the build. This can be either conda or container or venv. (default: conda)

-**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.
+```

 ```
 # Start using template name
@ -349,6 +338,9 @@ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-

 # Start using a venv
 llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
+
+# Start using a conda environment
+llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 ```

 ```
@ -380,7 +372,6 @@ INFO:     Application startup complete.
 INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO:     2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
-
 ### Listing Distributions
 Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.

@ -400,20 +391,6 @@ Example Usage
 llama stack list
 ```

-```
------------------------------+-----------------------------------------------------------------+--------------+------------+
-| Stack Name                  | Path                                                            | Build Config | Run Config |
-+------------------------------+-----------------------------------------------------------------------------+--------------+
-| together                    | ~/.llama/distributions/together                                 | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+
-| bedrock                     | ~/.llama/distributions/bedrock                                  | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+
-| starter                     | ~/.llama/distributions/starter                                  | Yes          | Yes        |
-+------------------------------+-----------------------------------------------------------------------------+--------------+
-| remote-vllm                 | ~/.llama/distributions/remote-vllm                              | Yes          | Yes        |
-+------------------------------+-----------------------------------------------------------------------------+--------------+
-```
-
 ### Removing a Distribution
 Use the remove command to delete a distribution you've previously built.

@ -436,7 +413,7 @@ Example
 llama stack rm llamastack-test
 ```

-To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
+To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they’re no longer needed.

 ### Troubleshooting

--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -2,14 +2,11 @@

 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:

-```{note}
-The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
-```
-
 ```{dropdown} 👋 Click here for a Sample Configuration File

 ```yaml
 version: 2
+conda_env: ollama
 apis:
 - agents
 - inference
@ -59,8 +56,8 @@ shields: []
 server:
  port: 8321
  auth:
-    provider_config:
-      type: "oauth2_token"
+    provider_type: "oauth2_token"
+    config:
      jwks:
        uri: "https://my-token-issuing-svc.com/jwks"
 ```
@ -70,7 +67,7 @@ Let's break this down into the different sections. The first section specifies t
 apis:
 - agents
 - inference
- vector_io
+- memory
 - safety
 - telemetry
 ```
@ -83,7 +80,7 @@ providers:
  # provider_id is a string you can choose freely
  - provider_id: ollama
    # provider_type is a string that specifies the type of provider.
-    # in this case, the provider for inference is ollama and it runs remotely (outside of the distribution)
+    # in this case, the provider for inference is ollama and it is run remotely (outside of the distribution)
    provider_type: remote::ollama
    # config is a dictionary that contains the configuration for the provider.
    # in this case, the configuration is the url of the ollama server
@ -91,7 +88,7 @@ providers:
      url: ${env.OLLAMA_URL:=http://localhost:11434}
 ```
 A few things to note:
- A _provider instance_ is identified with an (id, type, config) triplet.
+- A _provider instance_ is identified with an (id, type, configuration) triplet.
 - The id is a string you can choose freely.
 - You can instantiate any number of provider instances of the same type.
 - The configuration dictionary is provider-specific.
@ -128,7 +125,7 @@ config:
 ```

 If the environment variable is not set, the default value `http://localhost:11434` will be used.
-Empty defaults are allowed so `url: ${env.OLLAMA_URL:=}` will be set to `None` if the environment variable is not set.
+Empty defaults are not allowed, so `url: ${env.OLLAMA_URL:=}` will raise an error if the environment variable is not set.

 #### Conditional Values

@ -142,10 +139,8 @@ config:

 If the environment variable is set, the value after `:+` will be used. If it's not set, the field
 will be omitted with a `None` value.
-
-Do not use conditional values (`${env.OLLAMA_URL:+}`) for empty defaults (`${env.OLLAMA_URL:=}`).
-This will be set to `None` if the environment variable is not set.
-Conditional must only be used when the environment variable is set.
+So `${env.ENVIRONMENT:+}` is supported: it means that the field will be omitted if the environment
+variable is not set. This can be used to make a field optional and then enable it at runtime when desired.

 #### Examples

@ -190,7 +185,7 @@ The environment variable substitution system is type-safe:

 ## Resources

-Let's look at the `models` section:
+Finally, let's look at the `models` section:

 ```yaml
 models:
@ -198,9 +193,8 @@ models:
  model_id: ${env.INFERENCE_MODEL}
  provider_id: ollama
  provider_model_id: null
-  model_type: llm
 ```
-A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage the clients to register models before using them, some Stack servers may come up a list of "already known and available" models.
+A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage the clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.

 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.

@ -229,8 +223,6 @@ server:

 ### Authentication Configuration

-> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.
-
 The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:

 ```
@ -245,8 +237,8 @@ The server can be configured to use service account tokens for authorization, va
 ```yaml
 server:
  auth:
-    provider_config:
-      type: "oauth2_token"
+    provider_type: "oauth2_token"
+    config:
      jwks:
        uri: "https://kubernetes.default.svc:8443/openid/v1/jwks"
        token: "${env.TOKEN:+}"
@ -330,25 +322,13 @@ You can easily validate a request by running:
 curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
 ```

-#### GitHub Token Provider
-Validates GitHub personal access tokens or OAuth tokens directly:
-```yaml
-server:
-  auth:
-    provider_config:
-      type: "github_token"
-      github_api_base_url: "https://api.github.com"  # Or GitHub Enterprise URL
-```
-
-The provider fetches user information from GitHub and maps it to access attributes based on the `claims_mapping` configuration.
-
 #### Custom Provider
 Validates tokens against a custom authentication endpoint:
 ```yaml
 server:
  auth:
-    provider_config:
-      type: "custom"
+    provider_type: "custom"
+    config:
      endpoint: "https://auth.example.com/validate"  # URL of the auth endpoint
 ```

@ -384,166 +364,6 @@ And must respond with:

 If no access attributes are returned, the token is used as a namespace.

-### Access control
-
-When authentication is enabled, access to resources is controlled
-through the `access_policy` attribute of the auth config section under
-server. The value for this is a list of access rules.
-
-Each access rule defines a list of actions either to permit or to
-forbid. It may specify a principal or a resource that must match for
-the rule to take effect.
-
-Valid actions are create, read, update, and delete. The resource to
-match should be specified in the form of a type qualified identifier,
-e.g.  model::my-model or vector_db::some-db, or a wildcard for all
-resources of a type, e.g. model::*. If the principal or resource are
-not specified, they will match all requests.
-
-The valid resource types are model, shield, vector_db, dataset,
-scoring_function, benchmark, tool, tool_group and session.
-
-A rule may also specify a condition, either a 'when' or an 'unless',
-with additional constraints as to where the rule applies. The
-constraints supported at present are:
-
- - 'user with <attr-value> in <attr-name>'
- - 'user with <attr-value> not in <attr-name>'
- - 'user is owner'
- - 'user is not owner'
- - 'user in owners <attr-name>'
- - 'user not in owners <attr-name>'
-
-The attributes defined for a user will depend on how the auth
-configuration is defined.
-
-When checking whether a particular action is allowed by the current
-user for a resource, all the defined rules are tested in order to find
-a match. If a match is found, the request is permitted or forbidden
-depending on the type of rule. If no match is found, the request is
-denied.
-
-If no explicit rules are specified, a default policy is defined with
-which all users can access all resources defined in config but
-resources created dynamically can only be accessed by the user that
-created them.
-
-Examples:
-
-The following restricts access to particular github users:
-
-```yaml
-server:
-  auth:
-    provider_config:
-      type: "github_token"
-      github_api_base_url: "https://api.github.com"
-  access_policy:
-  - permit:
-      principal: user-1
-      actions: [create, read, delete]
-    description: user-1 has full access to all resources
-  - permit:
-      principal: user-2
-      actions: [read]
-      resource: model::model-1
-    description: user-2 has read access to model-1 only
-```
-
-Similarly, the following restricts access to particular kubernetes
-service accounts:
-
-```yaml
-server:
-  auth:
-    provider_config:
-      type: "oauth2_token"
-      audience: https://kubernetes.default.svc.cluster.local
-      issuer: https://kubernetes.default.svc.cluster.local
-      tls_cafile: /home/gsim/.minikube/ca.crt
-      jwks:
-        uri: https://kubernetes.default.svc.cluster.local:8443/openid/v1/jwks
-        token: ${env.TOKEN}
-    access_policy:
-    - permit:
-        principal: system:serviceaccount:my-namespace:my-serviceaccount
-        actions: [create, read, delete]
-      description: specific serviceaccount has full access to all resources
-    - permit:
-        principal: system:serviceaccount:default:default
-        actions: [read]
-        resource: model::model-1
-      description: default account has read access to model-1 only
-```
-
-The following policy, which assumes that users are defined with roles
-and teams by whichever authentication system is in use, allows any
-user with a valid token to use models, create resources other than
-models, read and delete resources they created and read resources
-created by users sharing a team with them:
-
-```
-    access_policy:
-    - permit:
-        actions: [read]
-        resource: model::*
-      description: all users have read access to models
-    - forbid:
-        actions: [create, delete]
-        resource: model::*
-      unless: user with admin in roles
-      description: only user with admin role can create or delete models
-    - permit:
-        actions: [create, read, delete]
-      when: user is owner
-      description: users can create resources other than models and read and delete those they own
-    - permit:
-        actions: [read]
-      when: user in owner teams
-      description: any user has read access to any resource created by a user with the same team
-```
-
-#### API Endpoint Authorization with Scopes
-
-In addition to resource-based access control, Llama Stack supports endpoint-level authorization using OAuth 2.0 style scopes. When authentication is enabled, specific API endpoints require users to have particular scopes in their authentication token.
-
-**Scope-Gated APIs:**
-The following APIs are currently gated by scopes:
-
- **Telemetry API** (scope: `telemetry.read`):
-  - `POST /telemetry/traces` - Query traces
-  - `GET /telemetry/traces/{trace_id}` - Get trace by ID
-  - `GET /telemetry/traces/{trace_id}/spans/{span_id}` - Get span by ID
-  - `POST /telemetry/spans/{span_id}/tree` - Get span tree
-  - `POST /telemetry/spans` - Query spans
-  - `POST /telemetry/metrics/{metric_name}` - Query metrics
-
-**Authentication Configuration:**
-
-For **JWT/OAuth2 providers**, scopes should be included in the JWT's claims:
-```json
-{
-  "sub": "user123",
-  "scope": "telemetry.read",
-  "aud": "llama-stack"
-}
-```
-
-For **custom authentication providers**, the endpoint must return user attributes including the `scopes` array:
-```json
-{
-  "principal": "user123",
-  "attributes": {
-    "scopes": ["telemetry.read"]
-  }
-}
-```
-
-**Behavior:**
- Users without the required scope receive a 403 Forbidden response
- When authentication is disabled, scope checks are bypassed
- Endpoints without `required_scope` work normally for all authenticated users
-
 ### Quota Configuration

 The `quota` section allows you to enable server-side request throttling for both
@ -593,8 +413,8 @@ clients.
 server:
  port: 8321
  auth:
-    provider_config:
-      type: custom
+    provider_type: custom
+    config:
      endpoint: https://auth.example.com/validate
  quota:
    kvstore:
--- a/docs/source/distributions/customizing_run_yaml.md
+++ b/docs/source/distributions/customizing_run_yaml.md
@ -1,40 +0,0 @@
-# Customizing run.yaml Files
-
-The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
-
-## Key Points
-
- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
- **Customization expected**: Update URLs, credentials, models, and settings for your environment
- **Version control separately**: Keep customized configs in your own repository
- **Environment-specific**: Create different configurations for dev, staging, production
-
-## What You Can Customize
-
-You can customize:
- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
- **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
- **Storage paths**: Move from `/tmp/` to production directories
- **Authentication**: Add API keys, SSL, timeouts
- **Models**: Different model sizes for dev vs prod
- **Database settings**: Switch from SQLite to PostgreSQL
- **Tool configurations**: Add custom tools and integrations
-
-## Best Practices
-
- Use environment variables for secrets and environment-specific values
- Create separate `run.yaml` files for different environments (dev, staging, prod)
- Document your changes with comments
- Test configurations before deployment
- Keep your customized configs in version control
-
-Example structure:
-```
-your-project/
-├── configs/
-│   ├── dev-run.yaml
-│   ├── prod-run.yaml
-└── README.md
-```
-
-The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
--- a/docs/source/distributions/eks/apply.sh
+++ b/docs/source/distributions/eks/apply.sh
@ -1,19 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-set -euo pipefail
-
-SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-K8S_DIR="${SCRIPT_DIR}/../k8s"
-
-echo "Setting up AWS EKS-specific storage class..."
-kubectl apply -f gp3-topology-aware.yaml
-
-echo "Running main Kubernetes deployment..."
-cd "${K8S_DIR}"
-./apply.sh "$@"
--- a/docs/source/distributions/eks/gp3-topology-aware.yaml
+++ b/docs/source/distributions/eks/gp3-topology-aware.yaml
@ -1,15 +0,0 @@
-# Set up default storage class on AWS EKS
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: gp3-topology-aware
-  annotations:
-    storageclass.kubernetes.io/is-default-class: "true"
-parameters:
-  type: gp3
-  iops: "3000"
-  throughput: "125"
-provisioner: ebs.csi.aws.com
-reclaimPolicy: Delete
-volumeBindingMode: WaitForFirstConsumer
-allowVolumeExpansion: true
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -6,14 +6,14 @@ This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --distro starter --image-type venv
+llama stack build --template ollama --image-type venv
 ```

 ```python
-from llama_stack.core.library_client import LlamaStackAsLibraryClient
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

 client = LlamaStackAsLibraryClient(
-    "starter",
+    "ollama",
    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
 )
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -6,10 +6,13 @@ This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
 :maxdepth: 3
-list_of_distributions
-building_distro
-customizing_run_yaml
-starting_llama_stack_server
+
 importing_as_library
 configuration
+list_of_distributions
+kubernetes_deployment
+building_distro
+on_device_distro
+remote_hosted_distro
+self_hosted_distro
 ```
--- a/docs/source/distributions/k8s-benchmark/README.md
+++ b/docs/source/distributions/k8s-benchmark/README.md
@ -1,156 +0,0 @@
-# Llama Stack Benchmark Suite on Kubernetes
-
-## Motivation
-
-Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
-
-### Why This Benchmark Suite Exists
-
-**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions
-
-**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
-
-**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
-
-**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies
-
-### Key Metrics Captured
-
-The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis
-
-This data enables data-driven architectural decisions and performance optimization efforts.
-
-## Setup
-
-**1. Deploy base k8s infrastructure:**
-```bash
-cd ../k8s
-./apply.sh
-```
-
-**2. Deploy benchmark components:**
-```bash
-cd ../k8s-benchmark
-./apply.sh
-```
-
-**3. Verify deployment:**
-```bash
-kubectl get pods
-# Should see: llama-stack-benchmark-server, vllm-server, etc.
-```
-
-## Quick Start
-
-### Basic Benchmarks
-
-**Benchmark Llama Stack (default):**
-```bash
-cd docs/source/distributions/k8s-benchmark/
-./run-benchmark.sh
-```
-
-**Benchmark vLLM direct:**
-```bash
-./run-benchmark.sh --target vllm
-```
-
-### Custom Configuration
-
-**Extended benchmark with high concurrency:**
-```bash
-./run-benchmark.sh --target vllm --duration 120 --concurrent 20
-```
-
-**Short test run:**
-```bash
-./run-benchmark.sh --target stack --duration 30 --concurrent 5
-```
-
-## Command Reference
-
-### run-benchmark.sh Options
-
-```bash
-./run-benchmark.sh [options]
-
-Options:
-  -t, --target <stack|vllm>     Target to benchmark (default: stack)
-  -d, --duration <seconds>      Duration in seconds (default: 60)
-  -c, --concurrent <users>      Number of concurrent users (default: 10)
-  -h, --help                    Show help message
-
-Examples:
-  ./run-benchmark.sh --target vllm              # Benchmark vLLM direct
-  ./run-benchmark.sh --target stack             # Benchmark Llama Stack
-  ./run-benchmark.sh -t vllm -d 120 -c 20       # vLLM with 120s, 20 users
-```
-
-## Local Testing
-
-### Running Benchmark Locally
-
-For local development without Kubernetes:
-
-**1. Start OpenAI mock server:**
-```bash
-uv run python openai-mock-server.py --port 8080
-```
-
-**2. Run benchmark against mock server:**
-```bash
-uv run python benchmark.py \
-  --base-url http://localhost:8080/v1 \
-  --model mock-inference \
-  --duration 30 \
-  --concurrent 5
-```
-
-**3. Test against local vLLM server:**
-```bash
-# If you have vLLM running locally on port 8000
-uv run python benchmark.py \
-  --base-url http://localhost:8000/v1 \
-  --model meta-llama/Llama-3.2-3B-Instruct \
-  --duration 30 \
-  --concurrent 5
-```
-
-**4. Profile the running server:**
-```bash
-./profile_running_server.sh
-```
-
-
-
-### OpenAI Mock Server
-
-The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
-
-**Mock server usage:**
-```bash
-uv run python openai-mock-server.py --port 8080
-```
-
-The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
-
-## Files in this Directory
-
- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file
--- a/docs/source/distributions/k8s-benchmark/apply.sh
+++ b/docs/source/distributions/k8s-benchmark/apply.sh
@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
-
-export STREAM_DELAY_SECONDS=0.005
-
-export POSTGRES_USER=llamastack
-export POSTGRES_DB=llamastack
-export POSTGRES_PASSWORD=llamastack
-
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-export MOCK_INFERENCE_MODEL=mock-inference
-
-export MOCK_INFERENCE_URL=openai-mock-service:8080
-
-export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
-
-set -euo pipefail
-set -x
-
-# Deploy benchmark-specific components
-kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
-  --dry-run=client -o yaml > stack-configmap.yaml
-
-kubectl apply --validate=false -f stack-configmap.yaml
-
-# Deploy our custom llama stack server (overriding the base one)
-envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
--- a/docs/source/distributions/k8s-benchmark/benchmark.py
+++ b/docs/source/distributions/k8s-benchmark/benchmark.py
@ -1,267 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Simple benchmark script for Llama Stack with OpenAI API compatibility.
-"""
-
-import argparse
-import asyncio
-import os
-import random
-import statistics
-import time
-from typing import Tuple
-import aiohttp
-
-
-class BenchmarkStats:
-    def __init__(self):
-        self.response_times = []
-        self.ttft_times = []
-        self.chunks_received = []
-        self.errors = []
-        self.success_count = 0
-        self.total_requests = 0
-        self.concurrent_users = 0
-        self.start_time = None
-        self.end_time = None
-        self._lock = asyncio.Lock()
-
-    async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
-        async with self._lock:
-            self.total_requests += 1
-            if error:
-                self.errors.append(error)
-            else:
-                self.success_count += 1
-                self.response_times.append(response_time)
-                self.chunks_received.append(chunks)
-                if ttft is not None:
-                    self.ttft_times.append(ttft)
-
-    def print_summary(self):
-        if not self.response_times:
-            print("No successful requests to report")
-            if self.errors:
-                print(f"Total errors: {len(self.errors)}")
-                print("First 5 errors:")
-                for error in self.errors[:5]:
-                    print(f"  {error}")
-            return
-
-        total_time = self.end_time - self.start_time
-        success_rate = (self.success_count / self.total_requests) * 100
-        
-        print(f"\n{'='*60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
-        
-        print(f"\nResponse Time Statistics:")
-        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
-        print(f"  Median: {statistics.median(self.response_times):.3f}s")
-        print(f"  Min: {min(self.response_times):.3f}s")
-        print(f"  Max: {max(self.response_times):.3f}s")
-        
-        if len(self.response_times) > 1:
-            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")
-            
-        percentiles = [50, 90, 95, 99]
-        sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
-        for p in percentiles:
-            idx = int(len(sorted_times) * p / 100) - 1
-            idx = max(0, min(idx, len(sorted_times) - 1))
-            print(f"  P{p}: {sorted_times[idx]:.3f}s")
-            
-        if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
-            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
-            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
-            print(f"  Min: {min(self.ttft_times):.3f}s")
-            print(f"  Max: {max(self.ttft_times):.3f}s")
-            
-            if len(self.ttft_times) > 1:
-                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
-                
-            sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
-            for p in percentiles:
-                idx = int(len(sorted_ttft) * p / 100) - 1
-                idx = max(0, min(idx, len(sorted_ttft) - 1))
-                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")
-            
-        if self.chunks_received:
-            print(f"\nStreaming Statistics:")
-            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
-            print(f"  Total chunks received: {sum(self.chunks_received)}")
-        
-        if self.errors:
-            print(f"\nErrors (showing first 5):")
-            for error in self.errors[:5]:
-                print(f"  {error}")
-
-
-class LlamaStackBenchmark:
-    def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
-        self.model_id = model_id
-        self.headers = {"Content-Type": "application/json"}
-        self.test_messages = [
-            [{"role": "user", "content": "Hi"}],
-            [{"role": "user", "content": "What is the capital of France?"}],
-            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
-            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
-            [
-                {"role": "user", "content": "What is machine learning?"},
-                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
-        ]
-
-
-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
-        """Make a single async streaming chat completion request."""
-        messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
-        
-        start_time = time.time()
-        chunks_received = 0
-        ttft = None
-        error = None
-        
-        session = aiohttp.ClientSession()
-        
-        try:
-            async with session.post(
-                f"{self.base_url}/chat/completions",
-                headers=self.headers,
-                json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
-            ) as response:
-                if response.status == 200:
-                    async for line in response.content:
-                        if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
-                                chunks_received += 1
-                                if ttft is None:
-                                    ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
-                                    break
-                    
-                    if chunks_received == 0:
-                        error = "No streaming chunks received"
-                else:
-                    text = await response.text()
-                    error = f"HTTP {response.status}: {text[:100]}"
-                    
-        except Exception as e:
-            error = f"Request error: {str(e)}"
-        finally:
-            await session.close()
-            
-        response_time = time.time() - start_time
-        return response_time, chunks_received, ttft, error
-
-
-    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
-        """Run benchmark using async requests for specified duration."""
-        stats = BenchmarkStats()
-        stats.concurrent_users = concurrent_users
-        stats.start_time = time.time()
-        
-        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
-        print(f"Target URL: {self.base_url}/chat/completions")
-        print(f"Model: {self.model_id}")
-        
-        connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
-            
-            async def worker(worker_id: int):
-                """Worker that sends requests sequentially until canceled."""
-                request_count = 0
-                while True:
-                    try:
-                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
-                        await stats.add_result(response_time, chunks, ttft, error)
-                        request_count += 1
-                        
-                    except asyncio.CancelledError:
-                        break
-                    except Exception as e:
-                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
-            
-            # Progress reporting task
-            async def progress_reporter():
-                last_report_time = time.time()
-                while True:
-                    try:
-                        await asyncio.sleep(1)  # Report every second
-                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
-                            elapsed = time.time() - stats.start_time
-                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
-                            last_report_time = time.time()
-                    except asyncio.CancelledError:
-                        break
-            
-            # Spawn concurrent workers
-            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
-            progress_task = asyncio.create_task(progress_reporter())
-            tasks.append(progress_task)
-            
-            # Wait for duration then cancel all tasks
-            await asyncio.sleep(duration)
-            
-            for task in tasks:
-                task.cancel()
-            
-            # Wait for all tasks to complete
-            await asyncio.gather(*tasks, return_exceptions=True)
-        
-        stats.end_time = time.time()
-        return stats
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                       help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                       help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                       help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                       help="Number of concurrent users (default: 10)")
-    
-    args = parser.parse_args()
-    
-    benchmark = LlamaStackBenchmark(args.base_url, args.model)
-    
-    try:
-        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
-        stats.print_summary()
-        
-    except KeyboardInterrupt:
-        print("\nBenchmark interrupted by user")
-    except Exception as e:
-        print(f"Benchmark failed: {e}")
-
-
-if __name__ == "__main__":
-    main()
--- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py
+++ b/docs/source/distributions/k8s-benchmark/openai-mock-server.py
@@ -1,190 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-OpenAI-compatible mock server that returns:
-- Hardcoded /models response for consistent validation
-- Valid OpenAI-formatted chat completion responses with dynamic content
-"""
-
-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
-import argparse
-import os
-
-app = Flask(__name__)
-
-# Models from environment variables
-def get_models():
-    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
-    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
-    
-    return {
-        "object": "list",
-        "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
-    }
-
-def generate_random_text(length=50):
-    """Generate random but coherent text for responses."""
-    words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
-    ]
-    return " ".join(random.choices(words, k=length))
-
-@app.route('/v1/models', methods=['GET'])
-def list_models():
-    models = get_models()
-    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
-    return jsonify(models)
-
-@app.route('/v1/chat/completions', methods=['POST'])
-def chat_completions():
-    """Return OpenAI-formatted chat completion responses."""
-    data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
-     
-    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
-    
-    if stream:
-        return handle_streaming_completion(model, messages)
-    else:
-        return handle_non_streaming_completion(model, messages)
-
-def handle_non_streaming_completion(model, messages):
-    response_text = generate_random_text(random.randint(20, 80))
-    
-    # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
-    completion_tokens = len(response_text.split())
-    
-    response = {
-        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        "object": "chat.completion",
-        "created": int(time.time()),
-        "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
-        "usage": {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
-    }
-    
-    return jsonify(response)
-
-def handle_streaming_completion(model, messages):
-    def generate_stream():
-        # Generate response text
-        full_response = generate_random_text(random.randint(30, 100))
-        words = full_response.split()
-        
-        # Send initial chunk
-        initial_chunk = {
-            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            "object": "chat.completion.chunk",
-            "created": int(time.time()),
-            "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
-        }
-        yield f"data: {json.dumps(initial_chunk)}\n\n"
-        
-        # Send word by word
-        for i, word in enumerate(words):
-            chunk = {
-                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-                "object": "chat.completion.chunk", 
-                "created": int(time.time()),
-                "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
-            }
-            yield f"data: {json.dumps(chunk)}\n\n"
-            # Configurable delay to simulate realistic streaming
-            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
-            time.sleep(stream_delay)
-        
-        # Send final chunk
-        final_chunk = {
-            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            "object": "chat.completion.chunk",
-            "created": int(time.time()),
-            "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
-        }
-        yield f"data: {json.dumps(final_chunk)}\n\n"
-        yield "data: [DONE]\n\n"
-    
-    return Response(
-        generate_stream(),
-        mimetype='text/event-stream',
-        headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
-    )
-
-@app.route('/health', methods=['GET'])
-def health():
-    return jsonify({"status": "healthy", "type": "openai-mock"})
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081, 
-                       help='Port to run the server on (default: 8081)')
-    args = parser.parse_args()
-    
-    port = args.port
-    
-    models = get_models()
-    print("Starting OpenAI-compatible mock server...")
-    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
-    print("- OpenAI-formatted chat/completion responses with dynamic content")
-    print("- Streaming support with valid SSE format")
-    print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
--- a/docs/source/distributions/k8s-benchmark/profile_running_server.sh
+++ b/docs/source/distributions/k8s-benchmark/profile_running_server.sh
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Script to profile an already running Llama Stack server
-# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
-
-DURATION=${1:-60}  # Default 60 seconds
-OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file
-
-echo "Looking for running Llama Stack server..."
-
-# Find the server PID
-SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
-
-
-if [ -z "$SERVER_PID" ]; then
-    echo "Error: No running Llama Stack server found"
-    echo "Please start your server first with:"
-    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
-    exit 1
-fi
-
-echo "Found Llama Stack server with PID: $SERVER_PID"
-
-# Start py-spy profiling
-echo "Starting py-spy profiling for ${DURATION} seconds..."
-echo "Output will be saved to: ${OUTPUT_FILE}.svg"
-echo ""
-echo "You can now run your load test..."
-echo ""
-
-# Get the full path to py-spy
-PYSPY_PATH=$(which py-spy)
-
-# Check if running as root, if not, use sudo
-if [ "$EUID" -ne 0 ]; then
-    echo "py-spy requires root permissions on macOS. Running with sudo..."
-    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
-else
-    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
-fi
-
-echo ""
-echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
-echo ""
-echo "To view the flame graph:"
-echo "open ${OUTPUT_FILE}.svg"
--- a/docs/source/distributions/k8s-benchmark/run-benchmark.sh
+++ b/docs/source/distributions/k8s-benchmark/run-benchmark.sh
@@ -1,148 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-set -euo pipefail
-
-# Default values
-TARGET="stack"
-DURATION=60
-CONCURRENT=10
-
-# Parse command line arguments
-usage() {
-    echo "Usage: $0 [options]"
-    echo "Options:"
-    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
-    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
-    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
-    echo "  -h, --help                    Show this help message"
-    echo ""
-    echo "Examples:"
-    echo "  $0 --target vllm              # Benchmark vLLM direct"
-    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
-    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
-}
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -t|--target)
-            TARGET="$2"
-            shift 2
-            ;;
-        -d|--duration)
-            DURATION="$2"
-            shift 2
-            ;;
-        -c|--concurrent)
-            CONCURRENT="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: $1"
-            usage
-            exit 1
-            ;;
-    esac
-done
-
-# Validate target
-if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
-    echo "Error: Target must be 'stack' or 'vllm'"
-    usage
-    exit 1
-fi
-
-# Set configuration based on target
-if [[ "$TARGET" == "vllm" ]]; then
-    BASE_URL="http://vllm-server:8000/v1"
-    JOB_NAME="vllm-benchmark-job"
-    echo "Benchmarking vLLM direct..."
-else
-    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
-    JOB_NAME="stack-benchmark-job"
-    echo "Benchmarking Llama Stack..."
-fi
-
-echo "Configuration:"
-echo "  Target: $TARGET"
-echo "  Base URL: $BASE_URL"
-echo "  Duration: ${DURATION}s"
-echo "  Concurrent users: $CONCURRENT"
-echo ""
-
-# Create temporary job yaml
-TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
-cat > "$TEMP_YAML" << EOF
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: $JOB_NAME
-  namespace: default
-spec:
-  template:
-    spec:
-      containers:
-      - name: benchmark
-        image: python:3.11-slim
-        command: ["/bin/bash"]
-        args:
-        - "-c"
-        - |
-          pip install aiohttp &&
-          python3 /benchmark/benchmark.py \\
-            --base-url $BASE_URL \\
-            --model \${INFERENCE_MODEL} \\
-            --duration $DURATION \\
-            --concurrent $CONCURRENT
-        env:
-        - name: INFERENCE_MODEL
-          value: "meta-llama/Llama-3.2-3B-Instruct"
-        volumeMounts:
-        - name: benchmark-script
-          mountPath: /benchmark
-        resources:
-          requests:
-            memory: "256Mi"
-            cpu: "250m"
-          limits:
-            memory: "512Mi"
-            cpu: "500m"
-      volumes:
-      - name: benchmark-script
-        configMap:
-          name: benchmark-script
-      restartPolicy: Never
-  backoffLimit: 3
-EOF
-
-echo "Creating benchmark ConfigMap..."
-kubectl create configmap benchmark-script \
-  --from-file=benchmark.py=benchmark.py \
-  --dry-run=client -o yaml | kubectl apply -f -
-
-echo "Cleaning up any existing benchmark job..."
-kubectl delete job $JOB_NAME 2>/dev/null || true
-
-echo "Deploying benchmark Job..."
-kubectl apply -f "$TEMP_YAML"
-
-echo "Waiting for job to start..."
-kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
-
-echo "Following benchmark logs..."
-kubectl logs -f job/$JOB_NAME
-
-echo "Job completed. Checking final status..."
-kubectl get job $JOB_NAME
-
-# Clean up temporary file
-rm -f "$TEMP_YAML"
--- a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
@@ -1,133 +0,0 @@
-apiVersion: v1
-data:
-  stack_run_config.yaml: |
-    version: '2'
-    image_name: kubernetes-benchmark-demo
-    apis:
-    - agents
-    - inference
-    - safety
-    - telemetry
-    - tool_runtime
-    - vector_io
-    providers:
-      inference:
-      - provider_id: vllm-inference
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: sentence-transformers
-        provider_type: inline::sentence-transformers
-        config: {}
-      vector_io:
-      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-        provider_type: remote::chromadb
-        config:
-          url: ${env.CHROMADB_URL:=}
-          kvstore:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-      safety:
-      - provider_id: llama-guard
-        provider_type: inline::llama-guard
-        config:
-          excluded_categories: []
-      agents:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          persistence_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-          responses_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-      telemetry:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-          sinks: ${env.TELEMETRY_SINKS:=console}
-      tool_runtime:
-      - provider_id: brave-search
-        provider_type: remote::brave-search
-        config:
-          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: tavily-search
-        provider_type: remote::tavily-search
-        config:
-          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: rag-runtime
-        provider_type: inline::rag-runtime
-        config: {}
-      - provider_id: model-context-protocol
-        provider_type: remote::model-context-protocol
-        config: {}
-    metadata_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-      table_name: llamastack_kvstore
-    inference_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-    models:
-    - metadata:
-        embedding_dimension: 384
-      model_id: all-MiniLM-L6-v2
-      provider_id: sentence-transformers
-      model_type: embedding
-    - model_id: ${env.INFERENCE_MODEL}
-      provider_id: vllm-inference
-      model_type: llm
-    - model_id: ${env.SAFETY_MODEL}
-      provider_id: vllm-safety
-      model_type: llm
-    shields:
-    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-    vector_dbs: []
-    datasets: []
-    scoring_fns: []
-    benchmarks: []
-    tool_groups:
-    - toolgroup_id: builtin::websearch
-      provider_id: tavily-search
-    - toolgroup_id: builtin::rag
-      provider_id: rag-runtime
-    server:
-      port: 8323
-kind: ConfigMap
-metadata:
-  creationTimestamp: null
-  name: llama-stack-config
--- a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
@@ -1,83 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: llama-benchmark-pvc
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 1Gi
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llama-stack-benchmark-server
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: llama-stack-benchmark
-      app.kubernetes.io/component: server
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: llama-stack-benchmark
-        app.kubernetes.io/component: server
-    spec:
-      containers:
-      - name: llama-stack-benchmark
-        image: llamastack/distribution-starter:latest
-        imagePullPolicy: Always # since we have specified latest instead of a version
-        env:
-        - name: ENABLE_CHROMADB
-          value: "true"
-        - name: CHROMADB_URL
-          value: http://chromadb.default.svc.cluster.local:6000
-        - name: POSTGRES_HOST
-          value: postgres-server.default.svc.cluster.local
-        - name: POSTGRES_PORT
-          value: "5432"
-        - name: INFERENCE_MODEL
-          value: "${INFERENCE_MODEL}"
-        - name: SAFETY_MODEL
-          value: "${SAFETY_MODEL}"
-        - name: TAVILY_SEARCH_API_KEY
-          value: "${TAVILY_SEARCH_API_KEY}"
-        - name: VLLM_URL
-          value: http://vllm-server.default.svc.cluster.local:8000/v1
-        - name: VLLM_MAX_TOKENS
-          value: "3072"
-        - name: VLLM_SAFETY_URL
-          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
-        - name: VLLM_TLS_VERIFY
-          value: "false"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
-        ports:
-          - containerPort: 8323
-        volumeMounts:
-          - name: llama-storage
-            mountPath: /root/.llama
-          - name: llama-config
-            mountPath: /etc/config
-      volumes:
-      - name: llama-storage
-        persistentVolumeClaim:
-          claimName: llama-benchmark-pvc
-      - name: llama-config
-        configMap:
-          name: llama-stack-config
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama-stack-benchmark-service
-spec:
-  selector:
-    app.kubernetes.io/name: llama-stack-benchmark
-    app.kubernetes.io/component: server
-  ports:
-  - name: http
-    port: 8323
-    targetPort: 8323
-  type: ClusterIP
--- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
@@ -1,108 +0,0 @@
-version: '2'
-image_name: kubernetes-benchmark-demo
-apis:
-- agents
-- inference
-- telemetry
-- tool_runtime
-- vector_io
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-    provider_type: remote::chromadb
-    config:
-      url: ${env.CHROMADB_URL:=}
-      kvstore:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-      responses_store:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: postgres
-  host: ${env.POSTGRES_HOST:=localhost}
-  port: ${env.POSTGRES_PORT:=5432}
-  db: ${env.POSTGRES_DB:=llamastack}
-  user: ${env.POSTGRES_USER:=llamastack}
-  password: ${env.POSTGRES_PASSWORD:=llamastack}
-  table_name: llamastack_kvstore
-inference_store:
-  type: postgres
-  host: ${env.POSTGRES_HOST:=localhost}
-  port: ${env.POSTGRES_PORT:=5432}
-  db: ${env.POSTGRES_DB:=llamastack}
-  user: ${env.POSTGRES_USER:=llamastack}
-  password: ${env.POSTGRES_PASSWORD:=llamastack}
-models:
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-- model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-  model_type: llm
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
-- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
-- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8323
--- a/Show more
+++ b/Show more
Author	SHA1	Message	Date
github-actions[bot]	73a9bddcf0	build: Bump version to 0.2.13	2025-06-27 23:55:31 +00:00
github-actions[bot]	16c04eeae7	Release candidate 0.2.13rc2	2025-06-27 23:12:35 +00:00