Merge branch 'main' into opengauss-add

2025-12-22 18:56:24 +00:00 · 2025-08-08 20:58:48 +08:00 · 2025-08-08 20:58:48 +08:00 · 39e49ab97a
commit 39e49ab97a
parent 5e9c394500 9e78f2da96
807 changed files with 79555 additions and 26772 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -4,3 +4,9 @@ omit =
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*
    */llama_stack/cli/scripts/*
    */llama_stack/ui/*
    */llama_stack/distribution/ui/*
    */llama_stack/strong_typing/*
    */llama_stack/env.py
    */__init__.py
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
--- a/.github/ISSUE_TEMPLATE/tech-debt.yml
+++ b/.github/ISSUE_TEMPLATE/tech-debt.yml
@ -0,0 +1,30 @@
 name: 🔧 Tech Debt
 description: Something that is functional but should be improved or optimized
 labels: ["tech-debt"]
 body:
 - type: textarea
  id: tech-debt-explanation
  attributes:
    label: 🤔 What is the technical debt you think should be addressed?
    description: >
      A clear and concise description of _what_ needs to be addressed - ensure what you are describing
      constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is not a bug
      or feature request.
  validations:
    required: true
 - type: textarea
  id: tech-debt-motivation
  attributes:
    label: 💡 What is the benefit of addressing this technical debt?
    description: >
      A clear and concise description of _why_ this work is needed.
  validations:
    required: true
 - type: textarea
  id: other-thoughts
  attributes:
    label: Other thoughts
    description: >
      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@ -0,0 +1,82 @@
 name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 inputs:
  test-types:
    description: 'JSON array of test types to run'
    required: true
  stack-config:
    description: 'Stack configuration to use'
    required: true
  provider:
    description: 'Provider to use for tests'
    required: true
  inference-mode:
    description: 'Inference mode (record or replay)'
    required: true
  run-vision-tests:
    description: 'Whether to run vision tests'
    required: false
    default: 'false'
 runs:
  using: 'composite'
  steps:
    - name: Check Storage and Memory Available Before Tests
      if: ${{ always() }}
      shell: bash
      run: |
        free -h
        df -h
    # Invokes scripts/integration-tests.sh with the action's inputs.
    # Inputs are handed to the script through environment variables instead of
    # being spliced into the script text by `${{ }}`, so a value containing
    # quotes or shell metacharacters cannot alter the command being run.
    - name: Run Integration Tests
      shell: bash
      env:
        INPUT_STACK_CONFIG: ${{ inputs.stack-config }}
        INPUT_PROVIDER: ${{ inputs.provider }}
        INPUT_TEST_TYPES: ${{ inputs.test-types }}
        INPUT_INFERENCE_MODE: ${{ inputs.inference-mode }}
        INPUT_RUN_VISION_TESTS: ${{ inputs.run-vision-tests }}
      run: |
        # The vision flag is only passed when the input is exactly 'true'.
        extra_args=()
        if [ "$INPUT_RUN_VISION_TESTS" == "true" ]; then
          extra_args+=(--run-vision-tests)
        fi
        ./scripts/integration-tests.sh \
          --stack-config "$INPUT_STACK_CONFIG" \
          --provider "$INPUT_PROVIDER" \
          --test-types "$INPUT_TEST_TYPES" \
          --inference-mode "$INPUT_INFERENCE_MODE" \
          "${extra_args[@]}"
    # In record mode, commits any changed files under
    # tests/integration/recordings/ and pushes them to the pull request's head
    # branch after rebasing onto it.
    # The branch name comes from the pull request and is untrusted, so it is
    # passed through the environment and always quoted rather than being
    # interpolated into the script text.
    - name: Commit and push recordings
      if: ${{ inputs.inference-mode == 'record' }}
      shell: bash
      env:
        PR_HEAD_REF: ${{ github.event.pull_request.head.ref }}
        INPUT_RUN_VISION_TESTS: ${{ inputs.run-vision-tests }}
      run: |
        echo "Checking for recording changes"
        git status --porcelain tests/integration/recordings/
        if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
          echo "New recordings detected, committing and pushing"
          # Without a head ref there is no branch to push to; fail with a clear message.
          if [[ -z "$PR_HEAD_REF" ]]; then
            echo "No pull request head ref available, cannot push recordings"
            exit 1
          fi
          git add tests/integration/recordings/
          if [ "$INPUT_RUN_VISION_TESTS" == "true" ]; then
            git commit -m "Recordings update from CI (vision)"
          else
            git commit -m "Recordings update from CI"
          fi
          # Rebase onto the latest branch tip so the push is a fast-forward.
          git fetch origin "$PR_HEAD_REF"
          git rebase "origin/$PR_HEAD_REF"
          echo "Rebased successfully"
          git push origin "HEAD:$PR_HEAD_REF"
          echo "Pushed successfully"
        else
          echo "No recording changes"
        fi
    # Always runs, even after a failed step. Dumps the `ollama` container's logs
    # to ollama-<inference-mode>.log; `|| true` keeps this step from failing
    # when `docker logs` itself fails (presumably when no such container was
    # started - confirm against the calling workflow).
    - name: Write inference logs to file
      if: ${{ always() }}
      shell: bash
      run: |
        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
    # Always runs. Uploads every *.log file in the working directory as an
    # artifact named after the run id, run attempt and matrix job index,
    # retained for one day.
    - name: Upload logs
      if: ${{ always() }}
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      with:
        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
        path: |
          *.log
        retention-days: 1
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -1,13 +1,23 @@
 name: Setup Ollama
 description: Start Ollama
 inputs:
  run-vision-tests:
    description: 'Run vision tests: "true" or "false"'
    required: false
    default: 'false'
 runs:
  using: "composite"
  steps:
    - name: Start Ollama
      shell: bash
      run: |
-        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
+        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-        # TODO: rebuild an ollama image with llama-guard3:1b
+          image="ollama-with-vision-model"
        else
          image="ollama-with-models"
        fi
        echo "Starting Ollama with image: $image"
        docker run -d --name ollama -p 11434:11434 docker.io/llamastack/$image
        echo "Verifying Ollama status..."
        timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
        docker exec ollama ollama pull llama-guard3:1b
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -5,6 +5,10 @@ inputs:
    description: The Python version to use
    required: false
    default: "3.12"
  client-version:
    description: The llama-stack-client-python version to test against (latest or published)
    required: false
    default: "latest"
 runs:
  using: "composite"
  steps:
@ -20,8 +24,17 @@ runs:
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
-        # always test against the latest version of the client
+
-        # TODO: this is not necessarily a good idea. we need to test against both published and latest
+        # Install llama-stack-client-python based on the client-version input
-        # to find out backwards compatibility issues.
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
-        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+          echo "Installing latest llama-stack-client-python from main branch"
          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "Installing published llama-stack-client-python from PyPI"
          uv pip install llama-stack-client
        else
          echo "Invalid client-version: ${{ inputs.client-version }}"
          exit 1
        fi
        uv pip install -e .
--- a/.github/actions/setup-test-environment/action.yml
+++ b/.github/actions/setup-test-environment/action.yml
@ -0,0 +1,51 @@
 name: 'Setup Test Environment'
 description: 'Common setup steps for integration tests including dependencies, providers, and build'
 inputs:
  python-version:
    description: 'Python version to use'
    required: true
  client-version:
    description: 'Client version (latest or published)'
    required: true
  provider:
    description: 'Provider to setup (ollama or vllm)'
    required: true
    default: 'ollama'
  run-vision-tests:
    description: 'Whether to setup provider for vision tests'
    required: false
    default: 'false'
  inference-mode:
    description: 'Inference mode (record or replay)'
    required: true
 runs:
  using: 'composite'
  steps:
    # Delegates to the local setup-runner action, forwarding the requested
    # Python and client versions.
    - name: Install dependencies
      uses: ./.github/actions/setup-runner
      with:
        python-version: ${{ inputs.python-version }}
        client-version: ${{ inputs.client-version }}
    # Provider setup only happens in 'record' mode; for any other
    # inference-mode value neither provider step runs.
    - name: Setup ollama
      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
      uses: ./.github/actions/setup-ollama
      with:
        run-vision-tests: ${{ inputs.run-vision-tests }}
    - name: Setup vllm
      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
      uses: ./.github/actions/setup-vllm
    # Builds the `ci-tests` template as a venv image.
    - name: Build Llama Stack
      shell: bash
      run: |
        uv run llama stack build --template ci-tests --image-type venv
    # Sets the repository-local git identity to the github-actions bot
    # (presumably so later steps can create commits - confirm against callers).
    - name: Configure git for commits
      shell: bash
      run: |
        git config --local user.email "github-actions[bot]@users.noreply.github.com"
        git config --local user.name "github-actions[bot]"
--- a/.github/actions/setup-vllm/action.yml
+++ b/.github/actions/setup-vllm/action.yml
@ -0,0 +1,27 @@
 name: Setup VLLM
 description: Start VLLM
 runs:
  using: "composite"
  steps:
    # Starts the quay.io/higginsd/vllm-cpu container in the background, then
    # blocks until its /health endpoint responds. If the server is not healthy
    # within 900 seconds the container logs are printed and the step fails.
    - name: Start VLLM
      shell: bash
      run: |
        # Start vllm container
        docker run -d \
          --name vllm \
          -p 8000:8000 \
          --privileged=true \
          quay.io/higginsd/vllm-cpu:65393ee064 \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-auto-tool-choice \
          --tool-call-parser llama3_json \
          --model /root/.cache/Llama-3.2-1B-Instruct \
          --served-model-name meta-llama/Llama-3.2-1B-Instruct

        # Wait for vllm to be ready (poll every 5 seconds, give up after 900 seconds)
        echo "Waiting for vllm to be ready..."
        if ! timeout 900 bash -c 'until curl -f http://localhost:8000/health; do
          echo "Waiting for vllm..."
          sleep 5
        done'; then
          # Surface the container output so a startup failure can be diagnosed.
          echo "vllm failed to become ready"
          docker logs vllm || true
          exit 1
        fi
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -14,8 +14,6 @@ updates:
    schedule:
      interval: "weekly"
      day: "saturday"
    # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
    open-pull-requests-limit: 0
    labels:
      - type/dependencies
      - python
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -0,0 +1,22 @@
 # Llama Stack CI
 Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table detailing what CI the project includes and the purpose.
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
 | Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
 | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@ -1,5 +1,7 @@
 name: Update Changelog
 run-name: Creates PR for updating the CHANGELOG.md
 on:
  release:
    types: [published, unpublished, created, edited, deleted, released]
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@ -1,355 +0,0 @@
 name: "Run Llama-stack Tests"
 on:
  #### Temporarily disable PR runs until tests run as intended within mainline.
  #TODO Add this back.
  #pull_request_target:
  #  types: ["opened"]
  #  branches:
  #    - 'main'
  #  paths:
  #    - 'llama_stack/**/*.py'
  #    - 'tests/**/*.py'
  workflow_dispatch:
    inputs:
      runner:
        description: 'GHA Runner Scale Set label to run workflow on.'
        required: true
        default: "llama-stack-gha-runner-gpu"
      checkout_reference:
        description: "The branch, tag, or SHA to checkout"
        required: true
        default: "main"
      debug:
        description: 'Run debugging steps?'
        required: false
        default: "true"
      sleep_time:
        description: '[DEBUG] sleep time for debugging'
        required: true
        default: "0"
      provider_id:
        description: 'ID of your provider'
        required: true
        default: "meta_reference"
      model_id:
        description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
        required: true
        default: "llama_3b"
      model_override_3b:
        description: 'Specify shorthand model for <llama_3b> '
        required: false
        default: "Llama3.2-3B-Instruct"
      model_override_8b:
        description: 'Specify shorthand model for <llama_8b> '
        required: false
        default: "Llama3.1-8B-Instruct"
 env:
  # ID used for each test's provider config
  PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
  # Path to model checkpoints within EFS volume
  MODEL_CHECKPOINT_DIR: "/data/llama"
  # Path to directory to run tests from
  TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
  # Keep track of a list of model IDs that are valid to use within pytest fixture marks
  AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
  # Shorthand name for model ID, used in pytest fixture marks
  MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
  # Override the `llama_3b` / `llama_8b' models, else use the default.
  LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
  LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
  # Defines which directories in TESTS_PATH to exclude from the test loop
  EXCLUDED_DIRS: "__pycache__"
  # Defines the output xml reports generated after a test is run
  REPORTS_GEN: ""
 jobs:
  execute_workflow:
    name: Execute workload on Self-Hosted GPU k8s runner
    permissions:
      pull-requests: write
    defaults:
      run:
        shell: bash
    runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
    if: always()
    steps:
      ##############################
      #### INITIAL DEBUG CHECKS ####
      ##############################
      - name: "[DEBUG] Check content of the EFS mount"
        id: debug_efs_volume
        continue-on-error: true
        if: inputs.debug == 'true'
        run: |
            echo "========= Content of the EFS mount ============="
            ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
      - name: "[DEBUG] Get runner container OS information"
        id: debug_os_info
        if: ${{ inputs.debug == 'true' }}
        run: |
            cat /etc/os-release
      - name: "[DEBUG] Print environment variables"
        id: debug_env_vars
        if: ${{ inputs.debug == 'true' }}
        run: |
            echo "PROVIDER_ID = ${PROVIDER_ID}"
            echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
            echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
            echo "MODEL_ID = ${MODEL_ID}"
            echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
            echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
            echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
            echo "REPORTS_GEN = ${REPORTS_GEN}"
      ############################
      #### MODEL INPUT CHECKS ####
      ############################
      - name: "Check if env.model_id is valid"
        id: check_model_id
        run: |
          if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
            echo "Model ID '${MODEL_ID}' is valid."
          else
            echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
            exit 1
          fi
      #######################
      #### CODE CHECKOUT ####
      #######################
      - name: "Checkout 'meta-llama/llama-stack' repository"
        id: checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.branch }}
      - name: "[DEBUG] Content of the repository after checkout"
        id: debug_content_after_checkout
        if: ${{ inputs.debug == 'true' }}
        run: |
            ls -la ${GITHUB_WORKSPACE}
      ##########################################################
      ####              OPTIONAL SLEEP DEBUG                ####
      #                                                        #
      # Use to "exec" into the test k8s POD and run tests      #
      # manually to identify what dependencies are being used. #
      #                                                        #
      ##########################################################
      - name: "[DEBUG] sleep"
        id: debug_sleep
        if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
        run: |
            sleep ${{ inputs.sleep_time }}
      ############################
      #### UPDATE SYSTEM PATH ####
      ############################
      - name: "Update path: execute"
        id: path_update_exec
        run: |
          # .local/bin is needed for certain libraries installed below to be recognized
          # when calling their executable to install sub-dependencies
          mkdir -p ${HOME}/.local/bin
          echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
      #####################################
      #### UPDATE CHECKPOINT DIRECTORY ####
      #####################################
      - name: "Update checkpoint directory"
        id: checkpoint_update
        run: |
          echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
          if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
          elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
          else
            echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
            exit 1
          fi
      - name: "[DEBUG] Checkpoint update check"
        id: debug_checkpoint_update
        if: ${{ inputs.debug == 'true' }}
        run: |
          echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
      ##################################
      #### DEPENDENCY INSTALLATIONS ####
      ##################################
      - name: "Installing 'apt' required packages"
        id: install_apt
        run: |
          echo "[STEP] Installing 'apt' required packages"
          sudo apt update -y
          sudo apt install -y python3 python3-pip npm wget
      - name: "Installing packages with 'curl'"
        id: install_curl
        run: |
          curl -fsSL https://ollama.com/install.sh | sh
      - name: "Installing packages with 'wget'"
        id: install_wget
        run: |
          wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
          chmod +x Miniconda3-latest-Linux-x86_64.sh
          ./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0
          # Add miniconda3 bin to system path
          echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
      - name: "Installing packages with 'npm'"
        id: install_npm_generic
        run: |
          sudo npm install -g junit-merge
      - name: "Installing pip dependencies"
        id: install_pip_generic
        run: |
          echo "[STEP] Installing 'llama-stack' models"
          pip install -U pip setuptools
          pip install -r requirements.txt
          pip install -e .
          pip install -U \
            torch torchvision \
            pytest pytest_asyncio \
            fairscale lm-format-enforcer \
            zmq chardet pypdf \
            pandas sentence_transformers together \
            aiosqlite
      - name: "Installing packages with conda"
        id: install_conda_generic
        run: |
          conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
      #############################################################
      #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
      #############################################################
      - name: "Run Tests: Loop"
        id: run_tests_loop
        working-directory: "${{ github.workspace }}"
        run: |
          pattern=""
          for dir in llama_stack/providers/tests/*; do
            if [ -d "$dir" ]; then
              dir_name=$(basename "$dir")
              if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
                for file in "$dir"/test_*.py; do
                  test_name=$(basename "$file")
                  new_file="result-${dir_name}-${test_name}.xml"
                  if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
                     --junitxml="${{ github.workspace }}/${new_file}"; then
                    echo "Ran test: ${test_name}"
                  else
                    echo "Did NOT run test: ${test_name}"
                  fi
                  pattern+="${new_file} "
                done
              fi
            fi
          done
          echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
      - name: "Test Summary: Merge"
        id: test_summary_merge
        working-directory: "${{ github.workspace }}"
        run: |
          echo "Merging the following test result files: ${REPORTS_GEN}"
          # Defaults to merging them into 'merged-test-results.xml'
          junit-merge ${{ env.REPORTS_GEN }}
      ############################################
      #### AUTOMATIC TESTING ON PULL REQUESTS ####
      ############################################
      #### Run tests ####
      - name: "PR - Run Tests"
        id: pr_run_tests
        working-directory: "${{ github.workspace }}"
        if: github.event_name == 'pull_request_target'
        run: |
          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
          # (Optional) Add more tests here.
          # Merge test results with 'merged-test-results.xml' from above.
          # junit-merge <new-test-results> merged-test-results.xml
      #### Create test summary ####
      - name: "PR - Test Summary"
        id: pr_test_summary_create
        if: github.event_name == 'pull_request_target'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
          output: test-summary.md
      - name: "PR - Upload Test Summary"
        id: pr_test_summary_upload
        if: github.event_name == 'pull_request_target'
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: test-summary
          path: test-summary.md
      #### Update PR request ####
      - name: "PR - Update comment"
        id: pr_update_comment
        if: github.event_name == 'pull_request_target'
        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
        with:
          filePath: test-summary.md
      ########################
      #### MANUAL TESTING ####
      ########################
      #### Run tests ####
      - name: "Manual - Run Tests: Prep"
        id: manual_run_tests
        working-directory: "${{ github.workspace }}"
        if: github.event_name == 'workflow_dispatch'
        run: |
          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
          #TODO Use this when collection errors are resolved
          # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
          # (Optional) Add more tests here.
          # Merge test results with 'merged-test-results.xml' from above.
          # junit-merge <new-test-results> merged-test-results.xml
      #### Create test summary ####
      - name: "Manual - Test Summary"
        id: manual_test_summary
        if: always() && github.event_name == 'workflow_dispatch'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -1,5 +1,7 @@
 name: Installer CI
 run-name: Test the installation script
 on:
  pull_request:
    paths:
@ -17,10 +19,20 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
-  smoke-test:
+  smoke-test-on-dev:
    needs: lint
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
+      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Build a single provider
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test
      - name: Run installer end-to-end
-        run: ./scripts/install.sh
+        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          ./scripts/install.sh --image $IMAGE_ID
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -1,5 +1,7 @@
 name: Integration Auth Tests
 run-name: Run the integration test suite with Kubernetes authentication
 on:
  push:
    branches: [ main ]
@ -35,7 +37,7 @@ jobs:
      - name: Install minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
-        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
+        uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20
      - name: Start minikube
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
--- a/.github/workflows/integration-sql-store-tests.yml
+++ b/.github/workflows/integration-sql-store-tests.yml
@ -1,5 +1,7 @@
 name: SqlStore Integration Tests
 run-name: Run the integration test suite with SqlStore
 on:
  push:
    branches: [ main ]
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -1,114 +1,96 @@
-name: Integration Tests
+name: Integration Tests (Replay)
 run-name: Run the integration test suite from tests/integration in replay mode
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
-      - 'tests/integration/**'
+      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow
      - '.github/actions/setup-ollama/action.yml'
      - '.github/actions/setup-test-environment/action.yml'
      - '.github/actions/run-and-record-tests/action.yml'
  schedule:
    # If changing the cron schedule, update the provider in the test-matrix job
    - cron: '0 0 * * *'  # (test latest client) Daily at 12 AM UTC
    - cron: '1 0 * * 0'  # (test vllm) Weekly on Sunday at 1 AM UTC
  workflow_dispatch:
    inputs:
      test-all-client-versions:
        description: 'Test against both the latest and published versions'
        type: boolean
        default: false
      test-provider:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  # Skip concurrency for pushes to main - each commit should be tested independently
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: true
 jobs:
  discover-tests:
    runs-on: ubuntu-latest
    outputs:
-      test-type: ${{ steps.generate-matrix.outputs.test-type }}
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Generate test matrix
+      - name: Generate test types
-        id: generate-matrix
+        id: generate-test-types
        run: |
          # Get test directories dynamically, excluding non-test directories
          # NOTE: we are excluding post_training since the tests take too long
          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases)$" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-  test-matrix:
+  run-replay-mode-tests:
    needs: discover-tests
    runs-on: ubuntu-latest
    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
    strategy:
      fail-fast: false
      matrix:
        test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
        client-type: [library, server]
-        python-version: ["3.12", "3.13"]
+        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
        run-vision-tests: [true, false]
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Install dependencies
+      - name: Setup test environment
-        uses: ./.github/actions/setup-runner
+        uses: ./.github/actions/setup-test-environment
        with:
          python-version: ${{ matrix.python-version }}
          client-version: ${{ matrix.client-version }}
          provider: ${{ matrix.provider }}
          run-vision-tests: ${{ matrix.run-vision-tests }}
          inference-mode: 'replay'
-      - name: Setup ollama
+      - name: Run tests
-        uses: ./.github/actions/setup-ollama
+        uses: ./.github/actions/run-and-record-tests
      - name: Build Llama Stack
        run: |
          uv run llama stack build --template starter --image-type venv
      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h
      - name: Run Integration Tests
        env:
          OLLAMA_INFERENCE_MODEL: "llama3.2:3b-instruct-fp16" # for server tests
          ENABLE_OLLAMA: "ollama" # for server tests
          OLLAMA_URL: "http://0.0.0.0:11434"
          SAFETY_MODEL: "llama-guard3:1b"
          LLAMA_STACK_CLIENT_TIMEOUT: "300" # Increased timeout for eval operations
        # Use 'shell' to get pipefail behavior
        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
        # TODO: write a precommit hook to detect if a test contains a pipe but does not use 'shell: bash'
        shell: bash
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="starter"
          else
            stack_config="server:starter"
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
            --text-model="ollama/llama3.2:3b-instruct-fp16" \
            --embedding-model=all-MiniLM-L6-v2 \
            --safety-shield=ollama \
            --color=yes \
            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h
      - name: Write ollama logs to file
        if: ${{ always() }}
        run: |
          sudo docker logs ollama > ollama.log
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          test-types: ${{ needs.discover-tests.outputs.test-types }}
-          path: |
+          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-            *.log
+          provider: ${{ matrix.provider }}
-          retention-days: 1
+          inference-mode: 'replay'
          run-vision-tests: ${{ matrix.run-vision-tests }}
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -1,5 +1,7 @@
 name: Vector IO Integration Tests
 run-name: Run the integration test suite with various VectorIO providers
 on:
  push:
    branches: [ main ]
@ -22,7 +24,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector"]
+        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
        python-version: ["3.12", "3.13"]
      fail-fast: false # we want to run all tests regardless of failure
@ -46,6 +48,14 @@ jobs:
            -e ANONYMIZED_TELEMETRY=FALSE \
            chromadb/chroma:latest
      - name: Setup Weaviate
        if: matrix.vector-io-provider == 'remote::weaviate'
        run: |
          docker run --rm -d --pull always \
          --name weaviate \
          -p 8080:8080 -p 50051:50051 \
          cr.weaviate.io/semitechnologies/weaviate:1.32.0
      - name: Start PGVector DB
        if: matrix.vector-io-provider == 'remote::pgvector'
        run: |
@ -76,6 +86,29 @@ jobs:
          PGPASSWORD=llamastack psql -h localhost -U llamastack -d llamastack \
            -c "CREATE EXTENSION IF NOT EXISTS vector;"
      - name: Setup Qdrant
        if: matrix.vector-io-provider == 'remote::qdrant'
        run: |
          docker run --rm -d --pull always \
            --name qdrant \
            -p 6333:6333 \
            qdrant/qdrant
      # Readiness gate for the Qdrant container; only runs for the remote::qdrant matrix entry.
      # Polls the collections endpoint up to 30 times with a 2-second pause between attempts
      # and succeeds as soon as the response contains "status":"ok". If no attempt succeeds,
      # the container logs are printed and the step fails.
      - name: Wait for Qdrant to be ready
        if: matrix.vector-io-provider == 'remote::qdrant'
        run: |
          echo "Waiting for Qdrant to be ready..."
          for i in {1..30}; do
            if curl -s http://localhost:6333/collections | grep -q '"status":"ok"'; then
              echo "Qdrant is ready!"
              exit 0
            fi
            sleep 2
          done
          echo "Qdrant failed to start"
          docker logs qdrant
          exit 1
      - name: Wait for ChromaDB to be ready
        if: matrix.vector-io-provider == 'remote::chromadb'
        run: |
@ -91,9 +124,24 @@ jobs:
          docker logs chromadb
          exit 1
      # Readiness gate for the Weaviate container; only runs for the remote::weaviate matrix entry.
      # Polls the root URL up to 30 times with a 2-second pause between attempts and succeeds
      # once the response body contains the Weaviate documentation link. If no attempt succeeds,
      # the container logs are printed and the step fails.
      # NOTE(review): matching on a documentation URL in the response body is content-dependent
      # and may break if a newer image changes that text - consider probing a dedicated
      # readiness endpoint instead; confirm against the Weaviate docs.
      - name: Wait for Weaviate to be ready
        if: matrix.vector-io-provider == 'remote::weaviate'
        run: |
          echo "Waiting for Weaviate to be ready..."
          for i in {1..30}; do
            if curl -s http://localhost:8080 | grep -q "https://weaviate.io/developers/weaviate/current/"; then
              echo "Weaviate is ready!"
              exit 0
            fi
            sleep 2
          done
          echo "Weaviate failed to start"
          docker logs weaviate
          exit 1
      - name: Build Llama Stack
        run: |
-          uv run llama stack build --template starter --image-type venv
+          uv run llama stack build --template ci-tests --image-type venv
      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
@ -111,10 +159,14 @@ jobs:
          PGVECTOR_DB: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
          PGVECTOR_USER: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
          PGVECTOR_PASSWORD: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
          ENABLE_QDRANT: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'true' || '' }}
          QDRANT_URL: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'http://localhost:6333' || '' }}
          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
        run: |
          uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
            tests/integration/vector_io \
-            --embedding-model all-MiniLM-L6-v2
+            --embedding-model sentence-transformers/all-MiniLM-L6-v2
      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
@ -132,6 +184,11 @@ jobs:
        run: |
          docker logs chromadb > chromadb.log
      - name: Write Qdrant logs to file
        if: ${{ always() && matrix.vector-io-provider == 'remote::qdrant' }}
        run: |
          docker logs qdrant > qdrant.log
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,5 +1,7 @@
 name: Pre-commit
 run-name: Run pre-commit checks
 on:
  pull_request:
  push:
@ -12,10 +14,18 @@ concurrency:
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # For dependabot PRs, we need to checkout with a token that can push changes
          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
          # Fetch full history for dependabot PRs to allow commits
          fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}
      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
@ -27,15 +37,45 @@ jobs:
            .pre-commit-config.yaml
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github
      - name: Debug
        run: |
          echo "github.ref: ${{ github.ref }}"
          echo "github.actor: ${{ github.actor }}"
      # Commits and pushes whatever pre-commit changed back to the PR branch, for dependabot PRs only.
      - name: Commit changes for dependabot PRs
        if: github.actor == 'dependabot[bot]'
        env:
          # The PR branch name is handed to the script through the environment rather than
          # being expanded into the shell source: a branch name is externally controlled text,
          # and inlining the expression would let shell metacharacters in it run as commands.
          HEAD_REF: ${{ github.head_ref }}
        run: |
          # Proceed only if pre-commit modified tracked files or created untracked ones
          if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
            git config --local user.email "github-actions[bot]@users.noreply.github.com"
            git config --local user.name "github-actions[bot]"
            # Ensure we're on the correct branch
            git checkout -B "$HEAD_REF"
            git add -A
            git commit -m "Apply pre-commit fixes"
            # Pull latest changes from the PR branch and rebase our commit on top
            git pull --rebase origin "$HEAD_REF"
            # Push to the PR branch
            git push origin "$HEAD_REF"
            echo "Pre-commit fixes committed and pushed"
          else
            echo "No changes to commit"
          fi
      - name: Verify if there are any diff files after pre-commit
        if: github.actor != 'dependabot[bot]'
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
      - name: Verify if there are any new files after pre-commit
        if: github.actor != 'dependabot[bot]'
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -1,5 +1,7 @@
 name: Test Llama Stack Build
 run-name: Test llama stack build
 on:
  push:
    branches:
@ -7,20 +9,20 @@ on:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/core/build.*'
-      - 'llama_stack/distribution/*.sh'
+      - 'llama_stack/core/*.sh'
      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/templates/**'
+      - 'llama_stack/distributions/**'
      - 'pyproject.toml'
  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/core/build.*'
-      - 'llama_stack/distribution/*.sh'
+      - 'llama_stack/core/*.sh'
      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/templates/**'
+      - 'llama_stack/distributions/**'
      - 'pyproject.toml'
 concurrency:
@ -31,23 +33,23 @@ jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
-      templates: ${{ steps.set-matrix.outputs.templates }}
+      distros: ${{ steps.set-matrix.outputs.distros }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Generate Template List
+      - name: Generate Distribution List
        id: set-matrix
        run: |
-          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+          distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
-          echo "templates=$templates" >> "$GITHUB_OUTPUT"
+          echo "distros=$distros" >> "$GITHUB_OUTPUT"
  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
+        distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail
@ -60,13 +62,13 @@ jobs:
      - name: Print build dependencies
        run: |
-          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+          uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
@ -97,16 +99,16 @@ jobs:
      - name: Build a single provider
        run: |
-          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
+          yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
-          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
+          yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
      - name: Inspect the container image entrypoint
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
@ -120,27 +122,27 @@ jobs:
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
-      - name: Pin template to UBI9 base
+      - name: Pin distribution to UBI9 base
        run: |
          yq -i '
            .image_type    = "container" |
            .image_name    = "ubi9-test" |
            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/templates/starter/build.yaml
+          ' llama_stack/distributions/ci-tests/build.yaml
      - name: Build dev container (UBI9)
        env:
          USE_COPY_NOT_MOUNT: "true"
          LLAMA_STACK_DIR: "."
        run: |
-          uv run llama stack build --config llama_stack/templates/starter/build.yaml
+          uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
      - name: Inspect UBI9 image
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -1,5 +1,7 @@
 name: Python Package Build Test
 run-name: Test building the llama-stack PyPI project
 on:
  push:
    branches:
@ -20,7 +22,7 @@ jobs:
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Install uv
-      uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1
+      uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
      with:
        python-version: ${{ matrix.python-version }}
        activate-environment: true
--- a/.github/workflows/record-integration-tests.yml
+++ b/.github/workflows/record-integration-tests.yml
@ -0,0 +1,109 @@
 name: Integration Tests (Record)
 run-name: Run the integration test suite from tests/integration
 on:
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, labeled]
    paths:
      - 'llama_stack/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - '.github/workflows/record-integration-tests.yml' # This workflow
      - '.github/actions/setup-ollama/action.yml'
      - '.github/actions/setup-test-environment/action.yml'
      - '.github/actions/run-and-record-tests/action.yml'
  workflow_dispatch:
    inputs:
      test-provider:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  discover-tests:
    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
      contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
    runs-on: ubuntu-latest
    outputs:
      test-types: ${{ steps.generate-test-types.outputs.test-types }}
      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      # Produces two step outputs:
      #   test-types   - JSON array of the sub-directory names under tests/integration,
      #                  minus the excluded non-test directories
      #   matrix-modes - JSON array containing "vision" and/or "non-vision", chosen from the
      #                  PR's re-record-vision-tests / re-record-tests labels
      - name: Generate test types
        id: generate-test-types
        run: |
          # Get test directories dynamically, excluding non-test directories
          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
            sort | jq -R -s -c 'split("\n")[:-1]')
          echo "test-types=$TEST_TYPES" >> "$GITHUB_OUTPUT"
          # Read the labels currently attached to the PR
          labels=$(gh pr view "$PR_NUMBER" --json labels --jq '.labels[].name')
          echo "labels=$labels"
          modes_array=()
          if [[ $labels == *"re-record-vision-tests"* ]]; then
            modes_array+=("vision")
          fi
          if [[ $labels == *"re-record-tests"* ]]; then
            modes_array+=("non-vision")
          fi
          # Convert to JSON array
          if [ ${#modes_array[@]} -eq 0 ]; then
            matrix_modes="[]"
          else
            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
          fi
          echo "matrix_modes=$matrix_modes"
          echo "matrix-modes=$matrix_modes" >> "$GITHUB_OUTPUT"
        env:
          # Token used by the gh CLI call in the script
          GH_TOKEN: ${{ github.token }}
          # PR number is passed via the environment so the script body contains no inline expressions
          PR_NUMBER: ${{ github.event.pull_request.number }}
  record-tests:
    needs: discover-tests
    runs-on: ubuntu-latest
    permissions:
      contents: write
    strategy:
      fail-fast: false
      matrix:
        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 0
      - name: Setup test environment
        uses: ./.github/actions/setup-test-environment
        with:
          python-version: "3.12"  # Use single Python version for recording
          client-version: "latest"
          provider: ${{ inputs.test-provider || 'ollama' }}
          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
          inference-mode: 'record'
      - name: Run and record tests
        uses: ./.github/actions/run-and-record-tests
        with:
          test-types: ${{ needs.discover-tests.outputs.test-types }}
          stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
          provider: ${{ inputs.test-provider || 'ollama' }}
          inference-mode: 'record'
          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
--- a/.github/workflows/semantic-pr.yml
+++ b/.github/workflows/semantic-pr.yml
@ -1,5 +1,7 @@
 name: Check semantic PR titles
 run-name: Ensure that PR titles follow the conventional commit spec
 on:
  pull_request_target:
    types:
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@ -1,5 +1,7 @@
 name: Close stale issues and PRs
 run-name: Run the Stale Bot action
 on:
  schedule:
    - cron: '0 0 * * *' # every day at midnight
--- a/.github/workflows/test-external-provider-module.yml
+++ b/.github/workflows/test-external-provider-module.yml
@ -1,4 +1,6 @@
-name: Test External Providers
+name: Test External Providers Installed via Module
 run-name: Test External Provider installation via Python module
 on:
  push:
@ -10,11 +12,13 @@ on:
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
-      - 'requirements.txt'
+      - 'tests/external/*'
-      - '.github/workflows/test-external-providers.yml' # This workflow
+      - '.github/workflows/test-external-provider-module.yml' # This workflow
 jobs:
-  test-external-providers:
+  test-external-providers-from-module:
    # This workflow is disabled. See https://github.com/meta-llama/llama-stack/pull/2975#issuecomment-3138702984 for details
    if: false
    runs-on: ubuntu-latest
    strategy:
      matrix:
@ -28,39 +32,39 @@ jobs:
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Install Ramalama
        shell: bash
        run: |
          uv pip install ramalama
      - name: Run Ramalama
        shell: bash
        run: |
          nohup ramalama serve llama3.2:3b-instruct-fp16  > ramalama_server.log 2>&1 &
      - name: Apply image type to config file
        run: |
-          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
-          cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+          cat tests/external/ramalama-stack/run.yaml
      - name: Setup directory for Ollama custom provider
        run: |
          mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
          cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
      - name: Create provider configuration
        run: |
          mkdir -p /home/runner/.llama/providers.d/remote/inference
          cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
      - name: Build distro from config file
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
      # Activates the venv produced by the build step and launches the server detached,
      # with stdout/stderr captured in server.log for the readiness check that follows.
      - name: Start Llama Stack server in background
        # The comparison must live inside one expression. The previous form placed
        # "== 'venv'" outside the expression braces, which renders to a non-empty string
        # and is therefore always treated as true.
        if: matrix.image-type == 'venv'
        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          INFERENCE_MODEL: "llama3.2:3b-instruct-fp16"
          LLAMA_STACK_LOG_FILE: "server.log"
        run: |
          # Use the virtual environment created by the build step (name comes from build config)
-          source ci-test/bin/activate
+          source ramalama-stack-test/bin/activate
          uv pip list
-          nohup llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
+          nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
      - name: Wait for Llama Stack server to be ready
        run: |
          for i in {1..30}; do
-            if ! grep -q "Successfully loaded external provider remote::custom_ollama" server.log; then
+            if ! grep -q "successfully connected to Ramalama" server.log; then
              echo "Waiting for Llama Stack server to load the provider..."
              sleep 1
            else
@ -71,3 +75,12 @@ jobs:
          echo "Provider failed to load"
          cat server.log
          exit 1
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-provider-module-test
          path: |
            *.log
          retention-days: 1
--- a/.github/workflows/test-external.yml
+++ b/.github/workflows/test-external.yml
@ -0,0 +1,88 @@
 name: Test External API and Providers
 run-name: Test the External API and Provider mechanisms
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - 'tests/external/*'
      - '.github/workflows/test-external.yml' # This workflow
 jobs:
  test-external:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        image-type: [venv]
        # We don't do container yet, it's tricky to install a package from the host into the
        # container and point 'uv pip install' to the correct path...
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Create API configuration
        run: |
          mkdir -p /home/runner/.llama/apis.d
          cp tests/external/weather.yaml /home/runner/.llama/apis.d/weather.yaml
      - name: Create provider configuration
        run: |
          mkdir -p /home/runner/.llama/providers.d/remote/weather
          cp tests/external/kaze.yaml /home/runner/.llama/providers.d/remote/weather/kaze.yaml
      - name: Print distro dependencies
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only
      - name: Build distro from config file
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml
      # Activates the venv produced by the build step and launches the server detached,
      # with stdout/stderr captured in server.log for the health check that follows.
      - name: Start Llama Stack server in background
        # The comparison must live inside one expression. The previous form placed
        # "== 'venv'" outside the expression braces, which renders to a non-empty string
        # and is therefore always treated as true.
        if: matrix.image-type == 'venv'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
          LLAMA_STACK_LOG_FILE: "server.log"
        run: |
          # Use the virtual environment created by the build step (name comes from build config)
          source ci-test/bin/activate
          uv pip list
          nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -sSf http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              exit 0
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1
      - name: Test external API
        run: |
          curl -sSf http://localhost:8321/v1/weather/locations
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-test
          path: |
            *.log
          retention-days: 1
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -1,69 +0,0 @@
 name: auto-tests
 on:
  # pull_request:
  workflow_dispatch:
    inputs:
      commit_sha:
        description: 'Specific Commit SHA to trigger on'
        required: false
        default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
 jobs:
  test-llama-stack-as-library:
    runs-on: ubuntu-latest
    env:
      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
    strategy:
      matrix:
        provider: [fireworks, together]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ github.event.inputs.commit_sha }}
      - name: Echo commit SHA
        run: |
          echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
          git rev-parse HEAD
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt pytest
          pip install -e .
      - name: Build providers
        run: |
          llama stack build --template ${{ matrix.provider }} --image-type venv
      - name: Install the latest llama-stack-client & llama-models packages
        run: |
          pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
          pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
      - name: Run client-sdk test
        working-directory: "${{ github.workspace }}"
        env:
          REPORT_OUTPUT: md_report.md
        shell: bash
        run: |
          pip install --upgrade pytest-md-report
          echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
          export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
          LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
      - name: Output reports to the job summary
        if: always()
        shell: bash
        run: |
          if [ -f "$REPORT_FILE" ]; then
            echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "</details>" >> $GITHUB_STEP_SUMMARY
          fi
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -1,5 +1,7 @@
 name: Unit Tests
 run-name: Run the unit test suite
 on:
  push:
    branches: [ main ]
@ -33,10 +35,12 @@ jobs:
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
        with:
          python-version: ${{ matrix.python }}
      - name: Run unit tests
        run: |
-          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
+          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml
      - name: Upload test results
        if: always()
--- a/.github/workflows/update-readthedocs.yml
+++ b/.github/workflows/update-readthedocs.yml
@ -1,5 +1,7 @@
 name: Update ReadTheDocs
 run-name: Update the Llama Stack ReadTheDocs site
 on:
  workflow_dispatch:
    inputs:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -19,7 +19,6 @@ repos:
    -   id: check-yaml
        args: ["--unsafe"]
    -   id: detect-private-key
    -   id: requirements-txt-fixer
    -   id: mixed-line-ending
        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
    -   id: check-executables-have-shebangs
@ -56,14 +55,6 @@ repos:
    rev: 0.7.20
    hooks:
    -   id: uv-lock
    -   id: uv-export
        args: [
            "--frozen",
            "--no-hashes",
            "--no-emit-project",
            "--no-default-groups",
            "--output-file=requirements.txt"
        ]
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.16.1
@ -129,6 +120,31 @@ repos:
        require_serial: true
        always_run: true
        files: ^llama_stack/.*$
      # Fails the commit when a Python file uses @pytest.mark.asyncio or
      # @pytest_asyncio.fixture outside of a comment.
      #
      # Two details matter for correctness:
      #   * The decorator alternation is grouped, so the "no '#' earlier on the line"
      #     prefix (^[^#]*) applies to both decorators rather than only the first.
      #   * bash -c assigns the first argument after the script to $0, not to "$@".
      #     The trailing "--" entry occupies $0 so that every filename appended by
      #     pre-commit is actually scanned.
      - id: forbid-pytest-asyncio
        name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
        entry: bash
        language: system
        types: [python]
        pass_filenames: true
        args:
          - -c
          - |
            grep -EnH '^[^#]*@(pytest\.mark\.asyncio|pytest_asyncio\.fixture)' "$@" && {
              echo;
              echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
              echo "   pytest is already configured with async-mode=auto."
              echo;
              exit 1;
            } || true
          - --
      - id: generate-ci-docs
        name: Generate CI documentation
        additional_dependencies:
          - uv==0.7.8
        entry: uv run ./scripts/gen-ci-docs.py
        language: python
        pass_filenames: false
        require_serial: true
        files: ^.github/workflows/.*$
 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,34 @@
 # Changelog
 # v0.2.15
 Published on: 2025-07-16T03:30:01Z
 ---
 # v0.2.14
 Published on: 2025-07-04T16:06:48Z
 ## Highlights
 * Support for Llama Guard 4
 * Added Milvus support to vector-stores API
 * Documentation and zero-to-hero updates for latest APIs
 ---
 # v0.2.13
 Published on: 2025-06-28T04:28:11Z
 ## Highlights
 * search_mode support in OpenAI vector store API
 * Security fixes
 ---
 # v0.2.12
 Published on: 2025-06-20T22:52:12Z
@ -422,7 +451,7 @@ GenAI application developers need more than just an LLM - they need to integrate
 Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
+With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
 ## Release
 After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
@ -485,23 +514,3 @@ A small but important bug-fix release to update the URL datatype for the client-
 ---
 # v0.0.62
 Published on: 2024-12-18T02:39:43Z
 ---
 # v0.0.61
 Published on: 2024-12-10T20:50:33Z
 ---
 # v0.0.55
 Published on: 2024-11-23T17:14:07Z
 ---
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -10,8 +10,13 @@ If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stac
 **I'd like to contribute!**
-All issues are actionable (please report if they are not.) Pick one and start working on it. Thank you.
+If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
-If you need help or guidance, comment on the issue. Issues that are extra friendly to new contributors are tagged with "contributor friendly".
+leave a comment on the issue and a triager will assign it to you.
 Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
 - Try to work on only 1–2 issues at a time, especially if you’re still getting familiar with the codebase.
 - Before taking an issue, check if it’s already assigned or being actively discussed.
 - If you’re blocked or can’t continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
 **I have a bug!**
@ -41,6 +46,15 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
 4. Make sure your code lints using `pre-commit`.
 5. If you haven't already, complete the Contributor License Agreement ("CLA").
 6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
 7. Ensure your pull request follows the [coding style](#coding-style).
 Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
 > [!TIP]
 > As a general guideline:
 > - Experienced contributors should try to keep no more than 5 open PRs at a time.
 > - New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need
@ -112,7 +126,7 @@ uv run pre-commit run --all-files
 ## Running tests
-You can find the Llama Stack testing documentation here [here](tests/README.md).
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
 ## Adding a new dependency to the project
@ -140,7 +154,10 @@ uv sync
 * Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
  readability reasons.
 * Providers configuration class should be Pydantic Field class. It should have a `description` field
-  that describes the configuration. These descriptions will be used to generate the provider documentation.
+  that describes the configuration. These descriptions will be used to generate the provider
  documentation.
 * When possible, use keyword arguments only when calling functions.
 * Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
 ## Common Tasks
@ -148,7 +165,7 @@ Some tips about common tasks you work on while contributing to Llama Stack:
 ### Using `llama stack build`
-Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
 Example:
 ```bash
@ -156,7 +173,7 @@ cd work/
 git clone https://github.com/meta-llama/llama-stack.git
 git clone https://github.com/meta-llama/llama-stack-client-python.git
 cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
+LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
 ```
 ### Updating distribution configurations
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +1,9 @@
 include pyproject.toml
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/models/llama/llama4/tokenizer.model
-include llama_stack/distribution/*.sh
+include llama_stack/core/*.sh
 include llama_stack/cli/scripts/*.sh
-include llama_stack/templates/*/*.yaml
+include llama_stack/distributions/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
--- a/README.md
+++ b/README.md
@ -111,29 +111,33 @@ Here is a list of the various API providers and available distributions that can
 Please check out the [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html).
 | API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
-|:-------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
+|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
-| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|    Meta Reference    | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| SambaNova | Hosted | | ✅ | | ✅ | | | | |
+|      SambaNova       | Hosted | | ✅ | | ✅ | | | | |
-| Cerebras | Hosted | | ✅ | | | | | | |
+|       Cerebras       | Hosted | | ✅ | | | | | | |
-| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | | |
+|      Fireworks       | Hosted | ✅ | ✅ | ✅ | | | | | |
-| AWS Bedrock | Hosted | | ✅ | | ✅ | | | | |
+|     AWS Bedrock      | Hosted | | ✅ | | ✅ | | | | |
-| Together | Hosted | ✅ | ✅ | | ✅ | | | | |
+|       Together       | Hosted | ✅ | ✅ | | ✅ | | | | |
-| Groq | Hosted | | ✅ | | | | | | |
+|         Groq         | Hosted | | ✅ | | | | | | |
-| Ollama | Single Node | | ✅ | | | | | | |
+|        Ollama        | Single Node | | ✅ | | | | | | |
-| TGI | Hosted/Single Node | | ✅ | | | | | | |
+|         TGI          | Hosted/Single Node | | ✅ | | | | | | |
-| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | | |
+|      NVIDIA NIM      | Hosted/Single Node | | ✅ | | ✅ | | | | |
-| ChromaDB | Hosted/Single Node | | | ✅ | | | | | |
+|       ChromaDB       | Hosted/Single Node | | | ✅ | | | | | |
-| PG Vector | Single Node | | | ✅ | | | | | |
+|        Milvus        | Hosted/Single Node | | | ✅ | | | | | |
-| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | | |
+|        Qdrant        | Hosted/Single Node | | | ✅ | | | | | |
-| vLLM | Single Node | | ✅ | | | | | | |
+|       Weaviate       | Hosted/Single Node | | | ✅ | | | | | |
-| OpenAI | Hosted | | ✅ | | | | | | |
+|      SQLite-vec      | Single Node | | | ✅ | | | | | |
-| Anthropic | Hosted | | ✅ | | | | | | |
+|      PG Vector       | Single Node | | | ✅ | | | | | |
-| Gemini | Hosted | | ✅ | | | | | | |
+|  PyTorch ExecuTorch  | On-device iOS | ✅ | ✅ | | | | | | |
-| WatsonX | Hosted | | ✅ | | | | | | |
+|         vLLM         | Single Node | | ✅ | | | | | | |
-| HuggingFace | Single Node | | | | | | ✅ | | ✅ |
+|        OpenAI        | Hosted | | ✅ | | | | | | |
-| TorchTune | Single Node | | | | | | ✅ | | |
+|      Anthropic       | Hosted | | ✅ | | | | | | |
-| NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
+|        Gemini        | Hosted | | ✅ | | | | | | |
-| NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |
+|       WatsonX        | Hosted | | ✅ | | | | | | |
 |     HuggingFace      | Single Node | | | | | | ✅ | | ✅ |
 |      TorchTune       | Single Node | | | | | | ✅ | | |
 |     NVIDIA NEMO      | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
 |        NVIDIA        | Hosted | | | | | | ✅ | ✅ | ✅ |
 > **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
--- a/coverage.svg
+++ b/coverage.svg
@ -0,0 +1,21 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
    <linearGradient id="b" x2="0" y2="100%">
        <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
        <stop offset="1" stop-opacity=".1"/>
    </linearGradient>
    <mask id="a">
        <rect width="99" height="20" rx="3" fill="#fff"/>
    </mask>
    <g mask="url(#a)">
        <path fill="#555" d="M0 0h63v20H0z"/>
        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
        <path fill="url(#b)" d="M0 0h99v20H0z"/>
    </g>
    <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
        <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
        <text x="31.5" y="14">coverage</text>
        <text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
        <text x="80" y="14">44%</text>
    </g>
 </svg>
--- a/docs/README.md
+++ b/docs/README.md
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@ -123,7 +123,7 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
-        "!uv run --with llama-stack llama stack build --template together --image-type venv \n",
+        "!uv run --with llama-stack llama stack build --distro together --image-type venv \n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -165,7 +165,7 @@
        "# use this helper if needed to kill the server \n",
        "def kill_llama_stack_server():\n",
        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
      ]
    },
    {
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@ -233,7 +233,7 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server \n",
-        "!uv run --with llama-stack llama stack build --template meta-reference-gpu --image-type venv \n",
+        "!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv \n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -275,7 +275,7 @@
        "# use this helper if needed to kill the server \n",
        "def kill_llama_stack_server():\n",
        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
      ]
    },
    {
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
@ -223,7 +223,7 @@
          "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
          "\n",
          "# this command installs all the dependencies needed for the llama stack server \n",
-          "!uv run --with llama-stack llama stack build --template llama_api --image-type venv \n",
+          "!uv run --with llama-stack llama stack build --distro llama_api --image-type venv \n",
          "\n",
          "def run_llama_stack_server_background():\n",
          "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
@ -265,7 +265,7 @@
          "# use this helper if needed to kill the server \n",
          "def kill_llama_stack_server():\n",
          "    # Kill any existing llama stack server processes\n",
-          "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
+          "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
        ]
      },
      {
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
@ -37,7 +37,7 @@
        "\n",
        "To learn more about torchtune: https://github.com/pytorch/torchtune\n",
        "\n",
-        "We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/templates/experimental-post-training) as the distribution template\n",
+        "We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/experimental-post-training) as the distribution template\n",
        "\n",
        "####  0.0. Prerequisite: Have an OpenAI API key\n",
        "In this showcase, we will use [braintrust](https://www.braintrust.dev/) as scoring provider for eval and it uses OpenAI model as judge model for scoring. So, you need to get an API key from [OpenAI developer platform](https://platform.openai.com/docs/overview).\n",
@ -2864,7 +2864,7 @@
        }
      ],
      "source": [
-        "!llama stack build --template experimental-post-training --image-type venv --image-name __system__"
+        "!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
      ]
    },
    {
@ -3216,19 +3216,19 @@
            "INFO:datasets:Duckdb version 1.1.3 available.\n",
            "INFO:datasets:TensorFlow version 2.18.0 available.\n",
            "INFO:datasets:JAX version 0.4.33 available.\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::equality served by basic\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: basic::equality served by basic\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::subset_of served by basic\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: basic::subset_of served by basic\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
+            "INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
-            "INFO:llama_stack.distribution.stack:\n"
+            "INFO:llama_stack.core.stack:\n"
          ]
        },
        {
@ -3448,7 +3448,7 @@
        "\n",
        "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
        "\n",
-        "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+        "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
        "client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
        "_ = client.initialize()"
      ]
--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
@ -38,7 +38,7 @@
   "source": [
    "# NBVAL_SKIP\n",
    "!pip install -U llama-stack\n",
-    "!UV_SYSTEM_PYTHON=1 llama stack build --template fireworks --image-type venv"
+    "!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
   ]
  },
  {
@ -48,7 +48,7 @@
   "outputs": [],
   "source": [
    "from llama_stack_client import LlamaStackClient, Agent\n",
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -57,7 +57,7 @@
      "outputs": [],
      "source": [
        "# NBVAL_SKIP\n",
-        "!UV_SYSTEM_PYTHON=1 llama stack build --template together --image-type venv"
+        "!UV_SYSTEM_PYTHON=1 llama stack build --distro together --image-type venv"
      ]
    },
    {
@ -661,7 +661,7 @@
        "except ImportError:\n",
        "    print(\"Not in Google Colab environment\")\n",
        "\n",
-        "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+        "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
        "\n",
        "client = LlamaStackAsLibraryClient(\"together\")\n",
        "_ = client.initialize()"
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@ -35,7 +35,7 @@
   ],
   "source": [
    "from llama_stack_client import LlamaStackClient, Agent\n",
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@ -92,7 +92,7 @@
   "metadata": {},
   "source": [
    "```bash\n",
-    "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
+    "LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
    "```"
   ]
  },
@ -194,7 +194,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client =  LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/1_data_preparation.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/1_data_preparation.ipynb
@ -81,7 +81,7 @@
   "metadata": {},
   "source": [
    "```bash\n",
-    "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
+    "LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
    "```"
   ]
  },
--- a/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/3_model_evaluation.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/3_model_evaluation.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
@ -56,7 +56,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
    "\n",
    "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
    "client.initialize()"
--- a/docs/openapi_generator/README.md
+++ b/docs/openapi_generator/README.md
@ -1 +1 @@
-The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
+The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/core/server/endpoints.py` using the `generate.py` utility.
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -17,7 +17,7 @@ import fire
 import ruamel.yaml as yaml
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION  # noqa: E402
-from llama_stack.distribution.stack import LlamaStack  # noqa: E402
+from llama_stack.core.stack import LlamaStack  # noqa: E402
 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -12,7 +12,7 @@ from typing import TextIO
 from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
-from llama_stack.distribution.resolver import api_protocol_map
+from llama_stack.core.resolver import api_protocol_map
 from .generator import Generator
 from .options import Options
--- a/docs/original_rfc.md
+++ b/docs/original_rfc.md
@ -73,7 +73,7 @@ The API is defined in the [YAML](_static/llama-stack-spec.yaml) and [HTML](_stat
 To prove out the API, we implemented a handful of use cases to make things more concrete. The [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps) repository contains [6 different examples](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) ranging from very basic to a multi turn agent.
-There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/distribution/server/server.py) repository.
+There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/core/server/server.py) repository.
 ## Limitations
--- a/docs/quick_start.ipynb
+++ b/docs/quick_start.ipynb
@ -145,12 +145,12 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
-        "!uv run --with llama-stack llama stack build --template starter --image-type venv\n",
+        "!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
        "    process = subprocess.Popen(\n",
-        "        f\"uv run --with llama-stack llama stack run starter --image-type venv --env INFERENCE_MODEL=llama3.2:3b\",\n",
+        "        f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\",\n",
        "        shell=True,\n",
        "        stdout=log_file,\n",
        "        stderr=log_file,\n",
@ -187,7 +187,7 @@
        "# use this helper if needed to kill the server \n",
        "def kill_llama_stack_server():\n",
        "    # Kill any existing llama stack server processes\n",
-        "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        "    os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
      ]
    },
    {
@ -249,12 +249,6 @@
      ],
      "source": [
        "from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
        "import os\n",
        "\n",
        "os.environ[\"ENABLE_OLLAMA\"] = \"ollama\"\n",
        "os.environ[\"OLLAMA_INFERENCE_MODEL\"] = \"llama3.2:3b\"\n",
        "os.environ[\"OLLAMA_EMBEDDING_MODEL\"] = \"all-minilm:l6-v2\"\n",
        "os.environ[\"OLLAMA_EMBEDDING_DIMENSION\"] = \"384\"\n",
        "\n",
        "vector_db_id = \"my_demo_vector_db\"\n",
        "client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
--- a/docs/source/advanced_apis/eval/index.md
+++ b/docs/source/advanced_apis/eval/index.md
@ -0,0 +1,6 @@
 # Eval Providers
 This section contains documentation for all available providers for the **eval** API.
 - [inline::meta-reference](inline_meta-reference.md)
 - [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/eval/inline_meta-reference.md
+++ b/docs/source/advanced_apis/eval/inline_meta-reference.md
@ -0,0 +1,25 @@
 ---
 orphan: true
 ---
 # inline::meta-reference
 ## Description
 Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |
 ## Sample Configuration
 ```yaml
 kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
 ```
--- a/docs/source/advanced_apis/eval/remote_nvidia.md
+++ b/docs/source/advanced_apis/eval/remote_nvidia.md
@ -0,0 +1,23 @@
 ---
 orphan: true
 ---
 # remote::nvidia
 ## Description
 NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
 ## Sample Configuration
 ```yaml
 evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
 ```
--- a/docs/source/advanced_apis/evaluation_concepts.md
+++ b/docs/source/advanced_apis/evaluation_concepts.md
@ -43,7 +43,7 @@ We have built-in functionality to run the supported open-benckmarks using llama-
 Spin up llama stack server with 'open-benchmark' template
 ```
-llama stack run llama_stack/templates/open-benchmark/run.yaml
+llama stack run llama_stack/distributions/open-benchmark/run.yaml
 ```
--- a/docs/source/advanced_apis/index.md
+++ b/docs/source/advanced_apis/index.md
@ -0,0 +1,33 @@
 # Advanced APIs
 ## Post-training
 Fine-tunes a model.
 ```{toctree}
 :maxdepth: 1
 post_training/index
 ```
 ## Eval
 Generates outputs (via Inference or Agents) and performs scoring.
 ```{toctree}
 :maxdepth: 1
 eval/index
 ```
 ```{include} evaluation_concepts.md
 :start-after: ## Evaluation Concepts
 ```
 ## Scoring
 Evaluates the outputs of the system.
 ```{toctree}
 :maxdepth: 1
 scoring/index
 ```
--- a/docs/source/advanced_apis/post_training/huggingface.md
+++ b/docs/source/advanced_apis/post_training/huggingface.md
@ -23,7 +23,7 @@ To use the HF SFTTrainer in your Llama Stack project, follow these steps:
 You can access the HuggingFace trainer via the `ollama` distribution:
 ```bash
-llama stack build --template starter --image-type venv
+llama stack build --distro starter --image-type venv
 llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
 ```
--- a/docs/source/advanced_apis/post_training/index.md
+++ b/docs/source/advanced_apis/post_training/index.md
@ -0,0 +1,7 @@
 # Post_Training Providers
 This section contains documentation for all available providers for the **post_training** API.
 - [inline::huggingface](inline_huggingface.md)
 - [inline::torchtune](inline_torchtune.md)
 - [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/post_training/inline_huggingface.md
+++ b/docs/source/advanced_apis/post_training/inline_huggingface.md
@ -0,0 +1,37 @@
 ---
 orphan: true
 ---
 # inline::huggingface
 ## Description
 HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `device` | `<class 'str'>` | No | cuda |  |
 | `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No |  |  |
 | `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface |  |
 | `chat_template` | `<class 'str'>` | No |  |  |
 | `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} |  |
 | `max_seq_length` | `<class 'int'>` | No | 2048 |  |
 | `gradient_checkpointing` | `<class 'bool'>` | No | False |  |
 | `save_total_limit` | `<class 'int'>` | No | 3 |  |
 | `logging_steps` | `<class 'int'>` | No | 10 |  |
 | `warmup_ratio` | `<class 'float'>` | No | 0.1 |  |
 | `weight_decay` | `<class 'float'>` | No | 0.01 |  |
 | `dataloader_num_workers` | `<class 'int'>` | No | 4 |  |
 | `dataloader_pin_memory` | `<class 'bool'>` | No | True |  |
 ## Sample Configuration
 ```yaml
 checkpoint_format: huggingface
 distributed_backend: null
 device: cpu
 ```
--- a/docs/source/advanced_apis/post_training/inline_torchtune.md
+++ b/docs/source/advanced_apis/post_training/inline_torchtune.md
@ -0,0 +1,24 @@
 ---
 orphan: true
 ---
 # inline::torchtune
 ## Description
 TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `torch_seed` | `int \| None` | No |  |  |
 | `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta |  |
 ## Sample Configuration
 ```yaml
 checkpoint_format: meta
 ```
--- a/docs/source/advanced_apis/post_training/nvidia_nemo.md
+++ b/docs/source/advanced_apis/post_training/nvidia_nemo.md
--- a/docs/source/advanced_apis/post_training/remote_nvidia.md
+++ b/docs/source/advanced_apis/post_training/remote_nvidia.md
@ -0,0 +1,32 @@
 ---
 orphan: true
 ---
 # remote::nvidia
 ## Description
 NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `api_key` | `str \| None` | No |  | The NVIDIA API key. |
 | `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
 | `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
 | `customizer_url` | `str \| None` | No |  | Base URL for the NeMo Customizer API |
 | `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
 | `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
 | `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
 ## Sample Configuration
 ```yaml
 api_key: ${env.NVIDIA_API_KEY:=}
 dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
 project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
 customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
 ```
--- a/docs/source/advanced_apis/post_training/torchtune.md
+++ b/docs/source/advanced_apis/post_training/torchtune.md
--- a/docs/source/advanced_apis/scoring/index.md
+++ b/docs/source/advanced_apis/scoring/index.md
@ -0,0 +1,7 @@
 # Scoring Providers
 This section contains documentation for all available providers for the **scoring** API.
 - [inline::basic](inline_basic.md)
 - [inline::braintrust](inline_braintrust.md)
 - [inline::llm-as-judge](inline_llm-as-judge.md)
--- a/docs/source/advanced_apis/scoring/inline_basic.md
+++ b/docs/source/advanced_apis/scoring/inline_basic.md
@ -0,0 +1,17 @@
 ---
 orphan: true
 ---
 # inline::basic
 ## Description
 Basic scoring provider for simple evaluation metrics and scoring functions.
 ## Sample Configuration
 ```yaml
 {}
 ```
--- a/docs/source/advanced_apis/scoring/inline_braintrust.md
+++ b/docs/source/advanced_apis/scoring/inline_braintrust.md
@ -0,0 +1,23 @@
 ---
 orphan: true
 ---
 # inline::braintrust
 ## Description
 Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
 ## Configuration
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `openai_api_key` | `str \| None` | No |  | The OpenAI API Key |
 ## Sample Configuration
 ```yaml
 openai_api_key: ${env.OPENAI_API_KEY:=}
 ```
--- a/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
+++ b/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
@ -0,0 +1,17 @@
 ---
 orphan: true
 ---
 # inline::llm-as-judge
 ## Description
 LLM-as-judge scoring provider that uses language models to evaluate and score responses.
 ## Sample Configuration
 ```yaml
 {}
 ```
--- a/docs/source/apis/external.md
+++ b/docs/source/apis/external.md
@ -0,0 +1,392 @@
 # External APIs
 Llama Stack supports external APIs that live outside of the main codebase. This allows you to:
 - Create and maintain your own APIs independently
 - Share APIs with others without contributing to the main codebase
 - Keep API-specific code separate from the core Llama Stack code
 ## Configuration
 To enable external APIs, you need to configure the `external_apis_dir` in your Llama Stack configuration. This directory should contain your external API specifications:
 ```yaml
 external_apis_dir: ~/.llama/apis.d/
 ```
 ## Directory Structure
 The external APIs directory should follow this structure:
 ```
 apis.d/
  custom_api1.yaml
  custom_api2.yaml
 ```
 Each YAML file in this directory defines an API specification.
 ## API Specification
 Here's an example of an external API specification for a weather API:
 ```yaml
 module: weather
 api_dependencies:
  - inference
 protocol: WeatherAPI
 name: weather
 pip_packages:
  - llama-stack-api-weather
 ```
 ### API Specification Fields
 - `module`: Python module containing the API implementation
 - `protocol`: Name of the protocol class for the API
 - `name`: Name of the API
 - `pip_packages`: List of pip packages to install the API, typically a single package
 ## Required Implementation
 External APIs must expose an `available_providers()` function in their module that returns a list of provider specifications (`ProviderSpec` objects):
 ```python
 # llama_stack_api_weather/api.py
 from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
 def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.weather,
            provider_type="inline::darksky",
            pip_packages=[],
            module="llama_stack_provider_darksky",
            config_class="llama_stack_provider_darksky.DarkSkyWeatherImplConfig",
        ),
    ]
 ```
 The module must also define a Protocol class for the API, like so:
 ```python
 # llama_stack_api_weather/api.py
 from typing import Protocol
 from llama_stack.schema_utils import webmethod
 class WeatherAPI(Protocol):
    """
    A protocol for the Weather API.
    """
    @webmethod(route="/locations", method="GET")
    async def get_available_locations() -> dict[str, list[str]]:
        """
        Get the available locations.
        """
        ...
 ```
 ## Example: Custom API
 Here's a complete example of creating and using a custom API:
 1. First, create the API package:
 ```bash
 mkdir -p llama-stack-api-weather
 cd llama-stack-api-weather
 mkdir src/llama_stack_api_weather
 git init
 uv init
 ```
 2. Edit `pyproject.toml`:
 ```toml
 [project]
 name = "llama-stack-api-weather"
 version = "0.1.0"
 description = "Weather API for Llama Stack"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic"]
 [build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["src"]
 include = ["llama_stack_api_weather", "llama_stack_api_weather.*"]
 ```
 3. Create the initial files:
 ```bash
 touch src/llama_stack_api_weather/__init__.py
 touch src/llama_stack_api_weather/api.py
 ```
 ```python
 # llama-stack-api-weather/src/llama_stack_api_weather/__init__.py
 """Weather API for Llama Stack."""
 from .api import WeatherAPI, available_providers
 __all__ = ["WeatherAPI", "available_providers"]
 ```
 4. Create the API implementation:
 ```python
 # llama-stack-api-weather/src/llama_stack_api_weather/api.py
 from typing import Protocol
 from llama_stack.providers.datatypes import (
    AdapterSpec,
    Api,
    ProviderSpec,
    RemoteProviderSpec,
 )
 from llama_stack.schema_utils import webmethod
 def available_providers() -> list[ProviderSpec]:
    return [
        RemoteProviderSpec(
            api=Api.weather,
            provider_type="remote::kaze",
            config_class="llama_stack_provider_kaze.KazeProviderConfig",
            adapter=AdapterSpec(
                adapter_type="kaze",
                module="llama_stack_provider_kaze",
                pip_packages=["llama_stack_provider_kaze"],
                config_class="llama_stack_provider_kaze.KazeProviderConfig",
            ),
        ),
    ]
 class WeatherProvider(Protocol):
    """
    A protocol for the Weather API.
    """
    @webmethod(route="/weather/locations", method="GET")
    async def get_available_locations() -> dict[str, list[str]]:
        """
        Get the available locations.
        """
        ...
 ```
 5. Create the API specification:
 ```yaml
 # ~/.llama/apis.d/weather.yaml
 module: llama_stack_api_weather
 name: weather
 pip_packages: ["llama-stack-api-weather"]
 protocol: WeatherProvider
 ```
 6. Install the API package:
 ```bash
 uv pip install -e .
 ```
 7. Configure Llama Stack to use external APIs:
 ```yaml
 version: "2"
 image_name: "llama-stack-api-weather"
 apis:
  - weather
 providers: {}
 external_apis_dir: ~/.llama/apis.d
 ```
 The API will now be available at `/v1/weather/locations`.
 ## Example: custom provider for the weather API
 1. Create the provider package:
 ```bash
 mkdir -p llama-stack-provider-kaze
 cd llama-stack-provider-kaze
 uv init
 ```
 2. Edit `pyproject.toml`:
 ```toml
 [project]
 name = "llama-stack-provider-kaze"
 version = "0.1.0"
 description = "Kaze weather provider for Llama Stack"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic", "aiohttp"]
 [build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["src"]
 include = ["llama_stack_provider_kaze", "llama_stack_provider_kaze.*"]
 ```
 3. Create the initial files:
 ```bash
 touch src/llama_stack_provider_kaze/__init__.py
 touch src/llama_stack_provider_kaze/kaze.py
 ```
 4. Create the provider implementation:
 Initialization function:
 ```python
 # llama-stack-provider-kaze/src/llama_stack_provider_kaze/__init__.py
 """Kaze weather provider for Llama Stack."""
 from .config import KazeProviderConfig
 from .kaze import WeatherKazeAdapter
 __all__ = ["KazeProviderConfig", "WeatherKazeAdapter"]
 async def get_adapter_impl(config: KazeProviderConfig, _deps):
    from .kaze import WeatherKazeAdapter
    impl = WeatherKazeAdapter(config)
    await impl.initialize()
    return impl
 ```
 Configuration:
 ```python
 # llama-stack-provider-kaze/src/llama_stack_provider_kaze/config.py
 from pydantic import BaseModel, Field
 class KazeProviderConfig(BaseModel):
    """Configuration for the Kaze weather provider."""
    base_url: str = Field(
        "https://api.kaze.io/v1",
        description="Base URL for the Kaze weather API",
    )
 ```
 Main implementation:
 ```python
 # llama-stack-provider-kaze/src/llama_stack_provider_kaze/kaze.py
 from llama_stack_api_weather.api import WeatherProvider
 from .config import KazeProviderConfig
 class WeatherKazeAdapter(WeatherProvider):
    """Kaze weather provider implementation."""
    def __init__(
        self,
        config: KazeProviderConfig,
    ) -> None:
        self.config = config
    async def initialize(self) -> None:
        pass
    async def get_available_locations(self) -> dict[str, list[str]]:
        """Get available weather locations."""
        return {"locations": ["Paris", "Tokyo"]}
 ```
 5. Create the provider specification:
 ```yaml
 # ~/.llama/providers.d/remote/weather/kaze.yaml
 adapter:
  adapter_type: kaze
  pip_packages: ["llama_stack_provider_kaze"]
  config_class: llama_stack_provider_kaze.config.KazeProviderConfig
  module: llama_stack_provider_kaze
 optional_api_dependencies: []
 ```
 6. Install the provider package:
 ```bash
 uv pip install -e .
 ```
 7. Configure Llama Stack to use the provider:
 ```yaml
 # ~/.llama/run-byoa.yaml
 version: "2"
 image_name: "llama-stack-api-weather"
 apis:
  - weather
 providers:
  weather:
  - provider_id: kaze
    provider_type: remote::kaze
    config: {}
 external_apis_dir: ~/.llama/apis.d
 external_providers_dir: ~/.llama/providers.d
 server:
  port: 8321
 ```
 8. Run the server:
 ```bash
 python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
 ```
 9. Test the API:
 ```bash
 curl -sSf http://127.0.0.1:8321/v1/weather/locations
 {"locations":["Paris","Tokyo"]}%
 ```
 ## Best Practices
 1. **Package Naming**: Use a clear and descriptive name for your API package.
 2. **Version Management**: Keep your API package versioned and compatible with the Llama Stack version you're using.
 3. **Dependencies**: Only include the minimum required dependencies in your API package.
 4. **Documentation**: Include clear documentation in your API package about:
   - Installation requirements
   - Configuration options
   - API endpoints and usage
   - Any limitations or known issues
 5. **Testing**: Include tests in your API package to ensure it works correctly with Llama Stack.
 ## Troubleshooting
 If your external API isn't being loaded:
 1. Check that the `external_apis_dir` path is correct and accessible.
 2. Verify that the YAML files are properly formatted.
 3. Ensure all required Python packages are installed.
 4. Check the Llama Stack server logs for any error messages - turn on debug logging to get more information using `LLAMA_STACK_LOGGING=all=debug`.
 5. Verify that the API package is installed in your Python environment.
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# Building AI Applications (Examples)
+# AI Application Examples
 Llama Stack provides all the building blocks needed to create sophisticated AI applications.
@ -11,6 +11,7 @@ Here are some key topics that will help you build effective agents:
 - **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
 - **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
 - **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
 - **[Agents vs Responses API](responses_vs_agents)**: Learn the differences between the Agents API and Responses API, and when to use each one.
 - **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
 - **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
 - **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
@ -23,8 +24,10 @@ Here are some key topics that will help you build effective agents:
 rag
 agent
 agent_execution_loop
 responses_vs_agents
 tools
 evals
 telemetry
 safety
-```
+playground/index
 ```
--- a/docs/source/building_applications/playground/index.md
+++ b/docs/source/building_applications/playground/index.md
@ -1,4 +1,4 @@
-# Llama Stack Playground
+## Llama Stack Playground
 ```{note}
 The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
 - Demo **end-to-end** application code to help users get started to build their own applications
 - Provide an **UI** to help users inspect and understand Llama Stack API providers and resources
-## Key Features
+### Key Features
 #### Playground
 Interactive pages for users to play with and explore Llama Stack API capabilities.
@ -90,18 +90,18 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
  - Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
  - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
-## Starting the Llama Stack Playground
+### Starting the Llama Stack Playground
 To start the Llama Stack Playground, run the following commands:
 1. Start up the Llama Stack API server
 ```bash
-llama stack build --template together --image-type conda
+llama stack build --distro together --image-type venv
 llama stack run together
 ```
 2. Start Streamlit UI
 ```bash
-uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
+uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
 ```
--- a/docs/source/building_applications/responses_vs_agents.md
+++ b/docs/source/building_applications/responses_vs_agents.md
@ -0,0 +1,177 @@
 # Agents vs OpenAI Responses API
 Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools and maintain full conversation history, they serve different use cases and have distinct characteristics.
 > **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
 ## Overview
 ### LLS Agents API
 The Agents API is a full-featured, stateful system designed for complex, multi-turn conversations. It maintains conversation state through persistent sessions identified by a unique session ID. The API supports comprehensive agent lifecycle management, detailed execution tracking, and rich metadata about each interaction through a structured session/turn/step hierarchy. The API can orchestrate multiple tool calls within a single turn.
 ### OpenAI Responses API
 The OpenAI Responses API is a full-featured, stateful system designed for complex, multi-turn conversations, with direct compatibility with OpenAI's conversational patterns enhanced by Llama Stack's tool calling capabilities. It maintains conversation state by chaining responses through a `previous_response_id`, allowing interactions to branch or continue from any prior point. Each response can perform multiple tool calls within a single turn.
 ### Key Differences
 The LLS Agents API uses the Chat Completions API on the backend for inference as it's the industry standard for building AI applications and most LLM providers are compatible with this API. For a detailed comparison between Responses and Chat Completions, see [OpenAI's documentation](https://platform.openai.com/docs/guides/responses-vs-chat-completions).
 Additionally, Agents let you specify input/output shields whereas Responses do not (though support is planned). Agents use a linear conversation model referenced by a single session ID. Responses, on the other hand, support branching, where each response can serve as a fork point, and conversations are tracked by the latest response ID. Responses also let you dynamically choose the model, vector store, files, MCP servers, and more on each inference call, enabling more complex workflows. Agents require a static configuration for these components at the start of the session.
 Today the Agents and Responses APIs can be used independently depending on the use case. But, it is also productive to treat the APIs as complementary. It is not currently supported, but it is planned for the LLS Agents API to alternatively use the Responses API as its backend instead of the default Chat Completions API, i.e., enabling a combination of the safety features of Agents with the dynamic configuration and branching capabilities of Responses.
 | Feature | LLS Agents API | OpenAI Responses API |
 |---------|------------|---------------------|
 | **Conversation Management** | Linear persistent sessions | Can branch from any previous response ID |
 | **Input/Output Safety Shields** | Supported | Not yet supported |
 | **Per-call Flexibility** | Static per-session configuration | Dynamic per-call configuration |
 ## Use Case Example: Research with Multiple Search Methods
 Let's compare how both APIs handle a research task where we need to:
 1. Search for current information and examples
 2. Access different information sources dynamically
 3. Continue the conversation based on search results
 ### Agents API: Session-based configuration with safety shields
 ```python
 # Create agent with static session configuration
 agent = Agent(
    client,
    model="Llama3.2-3B-Instruct",
    instructions="You are a helpful coding assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["code_docs"]},
        },
        "builtin::code_interpreter",
    ],
    input_shields=["llama_guard"],
    output_shields=["llama_guard"],
 )
 session_id = agent.create_session("code_session")
 # First turn: Search and execute
 response1 = agent.create_turn(
    messages=[
        {
            "role": "user",
            "content": "Find examples of sorting algorithms and run a bubble sort on [3,1,4,1,5]",
        },
    ],
    session_id=session_id,
 )
 # Continue conversation in same session
 response2 = agent.create_turn(
    messages=[
        {
            "role": "user",
            "content": "Now optimize that code and test it with a larger dataset",
        },
    ],
    session_id=session_id,  # Same session, maintains full context
 )
 # Agents API benefits:
 # ✅ Safety shields protect against malicious code execution
 # ✅ Session maintains context between code executions
 # ✅ Consistent tool configuration throughout conversation
 print(f"First result: {response1.output_message.content}")
 print(f"Optimization: {response2.output_message.content}")
 ```
 ### Responses API: Dynamic per-call configuration with branching
 ```python
 # First response: Use web search for latest algorithms
 response1 = client.responses.create(
    model="Llama3.2-3B-Instruct",
    input="Search for the latest efficient sorting algorithms and their performance comparisons",
    tools=[
        {
            "type": "web_search",
        },
    ],  # Web search for current information
 )
 # Continue conversation: Switch to file search for local docs
 response2 = client.responses.create(
    model="Llama3.2-1B-Instruct",  # Switch to faster model
    input="Now search my uploaded files for existing sorting implementations",
    tools=[
        {  # Using Responses API built-in tools
            "type": "file_search",
            "vector_store_ids": ["vs_abc123"],  # Vector store containing uploaded files
        },
    ],
    previous_response_id=response1.id,
 )
 # Branch from first response: Try different search approach
 response3 = client.responses.create(
    model="Llama3.2-3B-Instruct",
    input="Instead, search the web for Python-specific sorting best practices",
    tools=[{"type": "web_search"}],  # Different web search query
    previous_response_id=response1.id,  # Branch from response1
 )
 # Responses API benefits:
 # ✅ Dynamic tool switching (web search ↔ file search per call)
 # ✅ OpenAI-compatible tool patterns (web_search, file_search)
 # ✅ Branch conversations to explore different information sources
 # ✅ Model flexibility per search type
 print(f"Web search results: {response1.output_message.content}")
 print(f"File search results: {response2.output_message.content}")
 print(f"Alternative web search: {response3.output_message.content}")
 ```
 Both APIs demonstrate distinct strengths that make them valuable on their own for different scenarios. The Agents API excels in providing structured, safety-conscious workflows with persistent session management, while the Responses API offers flexibility through dynamic configuration and OpenAI-compatible tool patterns.
 ## Use Case Examples
 ### 1. **Research and Analysis with Safety Controls**
 **Best Choice: Agents API**
 **Scenario:** You're building a research assistant for a financial institution that needs to analyze market data, execute code to process financial models, and search through internal compliance documents. The system must ensure all interactions are logged for regulatory compliance and protected by safety shields to prevent malicious code execution or data leaks.
 **Why Agents API?** The Agents API provides persistent session management for iterative research workflows, built-in safety shields to protect against malicious code in financial models, and structured execution logs (session/turn/step) required for regulatory compliance. The static tool configuration ensures consistent access to your knowledge base and code interpreter throughout the entire research session.
 ### 2. **Dynamic Information Gathering with Branching Exploration**
 **Best Choice: Responses API**
 **Scenario:** You're building a competitive intelligence tool that helps businesses research market trends. Users need to dynamically switch between web search for current market data and file search through uploaded industry reports. They also want to branch conversations to explore different market segments simultaneously and experiment with different models for various analysis types.
 **Why Responses API?** The Responses API's branching capability lets users explore multiple market segments from any research point. Dynamic per-call configuration allows switching between web search and file search as needed, while experimenting with different models (faster models for quick searches, more powerful models for deep analysis). The OpenAI-compatible tool patterns make integration straightforward.
 ### 3. **OpenAI Migration with Advanced Tool Capabilities**
 **Best Choice: Responses API**
 **Scenario:** You have an existing application built with OpenAI's Assistants API that uses file search and web search capabilities. You want to migrate to Llama Stack for better performance and cost control while maintaining the same tool calling patterns and adding new capabilities like dynamic vector store selection.
 **Why Responses API?** The Responses API provides full OpenAI tool compatibility (`web_search`, `file_search`) with identical syntax, making migration seamless. The dynamic per-call configuration enables advanced features like switching vector stores per query or changing models based on query complexity - capabilities that extend beyond basic OpenAI functionality while maintaining compatibility.
 ### 4. **Educational Programming Tutor**
 **Best Choice: Agents API**
 **Scenario:** You're building a programming tutor that maintains student context across multiple sessions, safely executes code exercises, and tracks learning progress with audit trails for educators.
 **Why Agents API?** Persistent sessions remember student progress across multiple interactions, safety shields prevent malicious code execution while allowing legitimate programming exercises, and structured execution logs help educators track learning patterns.
 ### 5. **Advanced Software Debugging Assistant**
 **Best Choice: Agents API with Responses Backend**
 **Scenario:** You're building a debugging assistant that helps developers troubleshoot complex issues. It needs to maintain context throughout a debugging session, safely execute diagnostic code, switch between different analysis tools dynamically, and branch conversations to explore multiple potential causes simultaneously.
 **Why Agents + Responses?** The Agent provides safety shields for code execution and session management for the overall debugging workflow. The underlying Responses API enables dynamic model selection and flexible tool configuration per query, while branching lets you explore different theories (memory leak vs. concurrency issue) from the same debugging point and compare results.
 > **Note:** The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default.
 ## For More Information
 - **LLS Agents API**: For detailed information on creating and managing agents, see the [Agents documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html)
 - **OpenAI Responses API**: For information on using the OpenAI-compatible responses API, see the [OpenAI API documentation](https://platform.openai.com/docs/api-reference/responses)
 - **Chat Completions API**: For the default backend API used by Agents, see the [Chat Completions providers documentation](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions)
 - **Agent Execution Loop**: For understanding how agents process turns and steps in their execution, see the [Agent Execution Loop documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent_execution_loop.html)
--- a/docs/source/concepts/apis.md
+++ b/docs/source/concepts/apis.md
@ -10,9 +10,11 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring
 - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
 - **Telemetry**: collect telemetry data from the system
 - **Post Training**: fine-tune a model
 - **Tool Runtime**: interact with various tools and protocols
 - **Responses**: generate responses from an LLM using this OpenAI compatible API.
 We are working on adding a few more APIs to complete the application lifecycle. These will include:
 - **Batch Inference**: run inference on a dataset of inputs
 - **Batch Agents**: run agents on a dataset of inputs
 - **Post Training**: fine-tune a model
 - **Synthetic Data Generation**: generate synthetic data for model development
--- a/docs/source/concepts/architecture.md
+++ b/docs/source/concepts/architecture.md
@ -1,31 +1,39 @@
-# Why Llama Stack?
+## Llama Stack architecture
-Building production AI applications today requires solving multiple challenges:
+Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
 **Infrastructure Complexity**
 - Running large language models efficiently requires specialized infrastructure.
 - Different deployment scenarios (local development, cloud, edge) need different solutions.
 - Moving from development to production often requires significant rework.
 **Essential Capabilities**
 - Safety guardrails and content filtering are necessary in an enterprise setting.
 - Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
 - Nearly any application needs composable multi-step workflows.
 - Finally, without monitoring, observability and evaluation, you end up operating in the dark.
 **Lack of Flexibility and Choice**
 - Directly integrating with multiple providers creates tight coupling.
 - Different providers have different APIs and abstractions.
 - Changing providers requires significant code changes.
 ### Our Solution: A Universal Stack
 ```{image} ../../_static/llama-stack.png
 :alt: Llama Stack
 :width: 400px
 ```
 ### Benefits of Llama Stack
 #### Current challenges in custom AI applications
 Building production AI applications today requires solving multiple challenges:
 **Infrastructure Complexity**
 - Running large language models efficiently requires specialized infrastructure.
 - Different deployment scenarios (local development, cloud, edge) need different solutions.
 - Moving from development to production often requires significant rework.
 **Essential Capabilities**
 - Safety guardrails and content filtering are necessary in an enterprise setting.
 - Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
 - Nearly any application needs composable multi-step workflows.
 - Without monitoring, observability and evaluation, you end up operating in the dark.
 **Lack of Flexibility and Choice**
 - Directly integrating with multiple providers creates tight coupling.
 - Different providers have different APIs and abstractions.
 - Changing providers requires significant code changes.
 #### Our Solution: A Universal Stack
 Llama Stack addresses these challenges through a service-oriented, API-first approach:
 **Develop Anywhere, Deploy Everywhere**
@ -59,4 +67,4 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
 - **Turnkey Solutions**: Easy-to-deploy, built-in solutions for popular deployment scenarios
-With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
+With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -2,6 +2,10 @@
 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
 ```{include} architecture.md
 :start-after: ## Llama Stack architecture
 ```
 ```{include} apis.md
 :start-after: ## APIs
 ```
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
 :start-after: ## API Providers
 ```
 ```{include} resources.md
 :start-after: ## Resources
 ```
 ```{include} distributions.md
 :start-after: ## Distributions
 ```
-```{include} evaluation_concepts.md
+```{include} resources.md
-:start-after: ## Evaluation Concepts
+:start-after: ## Resources
 ```
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -52,7 +52,18 @@ extensions = [
    "sphinxcontrib.redoc",
    "sphinxcontrib.mermaid",
    "sphinxcontrib.video",
    "sphinx_reredirects"
 ]
 redirects = {
    "providers/post_training/index": "../../advanced_apis/post_training/index.html",
    "providers/eval/index": "../../advanced_apis/eval/index.html",
    "providers/scoring/index": "../../advanced_apis/scoring/index.html",
    "playground/index": "../../building_applications/playground/index.html",
    "openai/index": "../../providers/index.html#openai-api-compatibility",
    "introduction/index": "../concepts/index.html#llama-stack-architecture"
 }
 myst_enable_extensions = ["colon_fence"]
 html_theme = "sphinx_rtd_theme"
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -11,4 +11,5 @@ See the [Adding a New API Provider](new_api_provider.md) which describes how to
 :hidden:
 new_api_provider
 testing
 ```
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
 Here are some example PRs to help you get started:
@ -14,10 +14,45 @@ Here are some example PRs to help you get started:
   - [Nvidia Inference Implementation](https://github.com/meta-llama/llama-stack/pull/355)
   - [Model context protocol Tool Runtime](https://github.com/meta-llama/llama-stack/pull/665)
 ## Inference Provider Patterns
 When implementing Inference providers for OpenAI-compatible APIs, Llama Stack provides several mixin classes to simplify development and ensure consistent behavior across providers.
 ### OpenAIMixin
 The `OpenAIMixin` class provides direct OpenAI API functionality for providers that work with OpenAI-compatible endpoints. It includes:
 #### Direct API Methods
 - **`openai_completion()`**: Legacy text completion API with full parameter support
 - **`openai_chat_completion()`**: Chat completion API supporting streaming, tools, and function calling
 - **`openai_embeddings()`**: Text embeddings generation with customizable encoding and dimensions
 #### Model Management
 - **`check_model_availability()`**: Queries the API endpoint to verify if a model exists and is accessible
 #### Client Management
 - **`client` property**: Automatically creates and configures AsyncOpenAI client instances using your provider's credentials
 #### Required Implementation
 To use `OpenAIMixin`, your provider must implement these abstract methods:
 ```python
@abstractmethod
 def get_api_key(self) -> str:
    """Return the API key for authentication"""
    pass
@abstractmethod
 def get_base_url(self) -> str:
    """Return the OpenAI-compatible API base URL"""
    pass
 ```
 ## Testing the Provider
-Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
+Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
 ### 1. Integration Testing
--- a/docs/source/deploying/index.md
+++ b/docs/source/deploying/index.md
@ -0,0 +1,4 @@
 # Deployment Examples
 ```{include} kubernetes_deployment.md
 ```
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@ -1,4 +1,4 @@
-# Kubernetes Deployment Guide
+## Kubernetes Deployment Guide
 Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
@ -174,7 +174,7 @@ spec:
      - name: llama-stack
        image: localhost/llama-stack-run-k8s:latest
        imagePullPolicy: IfNotPresent
-        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
+        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
        ports:
          - containerPort: 5000
        volumeMounts:
@ -222,10 +222,21 @@ llama-stack-client --endpoint http://localhost:5000 inference chat-completion --
 ## Deploying Llama Stack Server in AWS EKS
-We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster. Once you have an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html), you can run the following script to deploy the Llama Stack server.
+We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster.
 Prerequisites:
 - Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html).
 - Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) and get the client ID and client secret.
  - Set the `Authorization callback URL` to `http://<your-llama-stack-ui-url>/api/auth/callback/`
 Run the following script to deploy the Llama Stack server:
 ```
 export HF_TOKEN=<your-huggingface-token>
 export GITHUB_CLIENT_ID=<your-github-client-id>
 export GITHUB_CLIENT_SECRET=<your-github-client-secret>
 export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
 cd docs/source/distributions/eks
 ./apply.sh
 ```
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -47,30 +47,37 @@ pip install -e .
 ```
 Use the CLI to build your distribution.
 The main points to consider are:
-1. **Image Type** - Do you want a Conda / venv environment or a Container (eg. Docker)
+1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
 2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
 3. **Config** - Do you want to use a pre-existing config file to build your distribution?
 ```
 llama stack build -h
-usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
+usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
                         [--run] [--providers PROVIDERS]
 Build a Llama stack container
 options:
  -h, --help            show this help message and exit
-  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
+  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will be prompted to
-                        be prompted to enter information interactively (default: None)
+                        enter information interactively (default: None)
-  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
+  --template TEMPLATE   (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
-  --list-templates      Show the available templates for building a Llama Stack distribution (default: False)
+                        None)
-  --image-type {conda,container,venv}
+  --distro DISTRIBUTION, --distribution DISTRIBUTION
                        Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
  --list-distros, --list-distributions
                        Show the available distributions for building a Llama Stack distribution (default: False)
  --image-type {container,venv}
                        Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
  --image-name IMAGE_NAME
-                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active environment will be used if
+                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
-                        found. (default: None)
+                        None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
-
+  --providers PROVIDERS
                        Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
                        API. (default: None)
 ```
 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
@ -141,10 +148,14 @@ You may then pick a template to build your distribution with providers fitted to
 For example, to build a distribution with TGI as the inference provider, you can run:
 ```
-$ llama stack build --template starter
+$ llama stack build --distro starter
 ...
 You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
 ```
 ```{tip}
 The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
 ```
 :::
 :::{tab-item} Building from Scratch
@ -155,7 +166,7 @@ It would be best to start with a template and understand the structure of the co
 llama stack build
 > Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
-> Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda
+> Enter the image type you want your Llama Stack to be built as (container or venv): venv
 Llama Stack is composed of several APIs working together. Let's select
 the provider types (implementations) you want to use for these APIs.
@ -180,10 +191,10 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
 :::{tab-item} Building from a pre-existing build config file
 - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
- The config file will be of contents like the ones in `llama_stack/templates/*build.yaml`.
+- The config file should have contents like those in `llama_stack/distributions/*/build.yaml`.
 ```
-llama stack build --config llama_stack/templates/starter/build.yaml
+llama stack build --config llama_stack/distributions/starter/build.yaml
 ```
 :::
@ -249,11 +260,11 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm
 To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
 ```
-llama stack build --template starter --image-type container
+llama stack build --distro starter --image-type container
 ```
 ```
-$ llama stack build --template starter --image-type container
+$ llama stack build --distro starter --image-type container
 ...
 Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
 ...
@ -308,7 +319,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
 ```
 llama stack run -h
 usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
-                       [--image-type {conda,venv}] [--enable-ui]
+                       [--image-type {venv}] [--enable-ui]
                       [config | template]
 Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
@ -322,8 +333,8 @@ options:
  --image-name IMAGE_NAME
                        Name of the image to run. Defaults to the current environment (default: None)
  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
-  --image-type {conda,venv}
+  --image-type {venv}
-                        Image Type used during the build. This can be either conda or venv. (default: None)
+                        Image Type used during the build. This should be venv. (default: None)
  --enable-ui           Start the UI server (default: False)
 ```
@ -338,9 +349,6 @@ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-
 # Start using a venv
 llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 # Start using a conda environment
 llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 ```
 ```
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -2,11 +2,14 @@
 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
 ```{note}
 The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
 ```
 ```{dropdown} 👋 Click here for a Sample Configuration File
 ```yaml
 version: 2
 conda_env: ollama
 apis:
 - agents
 - inference
@ -381,6 +384,166 @@ And must respond with:
 If no access attributes are returned, the token is used as a namespace.
 ### Access control
 When authentication is enabled, access to resources is controlled
 through the `access_policy` attribute of the auth config section under
 server. The value for this is a list of access rules.
 Each access rule defines a list of actions either to permit or to
 forbid. It may specify a principal or a resource that must match for
 the rule to take effect.
 Valid actions are create, read, update, and delete. The resource to
 match should be specified in the form of a type qualified identifier,
 e.g.  model::my-model or vector_db::some-db, or a wildcard for all
 resources of a type, e.g. model::*. If the principal or resource are
 not specified, they will match all requests.
 The valid resource types are model, shield, vector_db, dataset,
 scoring_function, benchmark, tool, tool_group and session.
 A rule may also specify a condition, either a 'when' or an 'unless',
 with additional constraints as to where the rule applies. The
 constraints supported at present are:
 - 'user with <attr-value> in <attr-name>'
 - 'user with <attr-value> not in <attr-name>'
 - 'user is owner'
 - 'user is not owner'
 - 'user in owners <attr-name>'
 - 'user not in owners <attr-name>'
 The attributes defined for a user will depend on how the auth
 configuration is defined.
 When checking whether a particular action is allowed by the current
 user for a resource, all the defined rules are tested in order to find
 a match. If a match is found, the request is permitted or forbidden
 depending on the type of rule. If no match is found, the request is
 denied.
 If no explicit rules are specified, a default policy is defined with
 which all users can access all resources defined in config but
 resources created dynamically can only be accessed by the user that
 created them.
 Examples:
 The following restricts access to particular github users:
 ```yaml
 server:
  auth:
    provider_config:
      type: "github_token"
      github_api_base_url: "https://api.github.com"
    access_policy:
    - permit:
        principal: user-1
        actions: [create, read, delete]
      description: user-1 has full access to all resources
    - permit:
        principal: user-2
        actions: [read]
        resource: model::model-1
      description: user-2 has read access to model-1 only
 ```
 Similarly, the following restricts access to particular kubernetes
 service accounts:
 ```yaml
 server:
  auth:
    provider_config:
      type: "oauth2_token"
      audience: https://kubernetes.default.svc.cluster.local
      issuer: https://kubernetes.default.svc.cluster.local
      tls_cafile: /home/gsim/.minikube/ca.crt
      jwks:
        uri: https://kubernetes.default.svc.cluster.local:8443/openid/v1/jwks
        token: ${env.TOKEN}
    access_policy:
    - permit:
        principal: system:serviceaccount:my-namespace:my-serviceaccount
        actions: [create, read, delete]
      description: specific serviceaccount has full access to all resources
    - permit:
        principal: system:serviceaccount:default:default
        actions: [read]
        resource: model::model-1
      description: default account has read access to model-1 only
 ```
 The following policy, which assumes that users are defined with roles
 and teams by whichever authentication system is in use, allows any
 user with a valid token to use models, create resources other than
 models, read and delete resources they created and read resources
 created by users sharing a team with them:
 ```
    access_policy:
    - permit:
        actions: [read]
        resource: model::*
      description: all users have read access to models
    - forbid:
        actions: [create, delete]
        resource: model::*
      unless: user with admin in roles
      description: only user with admin role can create or delete models
    - permit:
        actions: [create, read, delete]
      when: user is owner
      description: users can create resources other than models and read and delete those they own
    - permit:
        actions: [read]
      when: user in owners teams
      description: any user has read access to any resource created by a user with the same team
 ```
 #### API Endpoint Authorization with Scopes
 In addition to resource-based access control, Llama Stack supports endpoint-level authorization using OAuth 2.0 style scopes. When authentication is enabled, specific API endpoints require users to have particular scopes in their authentication token.
 **Scope-Gated APIs:**
 The following APIs are currently gated by scopes:
 - **Telemetry API** (scope: `telemetry.read`):
  - `POST /telemetry/traces` - Query traces
  - `GET /telemetry/traces/{trace_id}` - Get trace by ID
  - `GET /telemetry/traces/{trace_id}/spans/{span_id}` - Get span by ID
  - `POST /telemetry/spans/{span_id}/tree` - Get span tree
  - `POST /telemetry/spans` - Query spans
  - `POST /telemetry/metrics/{metric_name}` - Query metrics
 **Authentication Configuration:**
 For **JWT/OAuth2 providers**, scopes should be included in the JWT's claims:
 ```json
 {
  "sub": "user123",
  "scope": "telemetry.read",
  "aud": "llama-stack"
 }
 ```
 For **custom authentication providers**, the endpoint must return user attributes including the `scopes` array:
 ```json
 {
  "principal": "user123",
  "attributes": {
    "scopes": ["telemetry.read"]
  }
 }
 ```
 **Behavior:**
 - Users without the required scope receive a 403 Forbidden response
 - When authentication is disabled, scope checks are bypassed
 - Endpoints without `required_scope` work normally for all authenticated users
 ### Quota Configuration
 The `quota` section allows you to enable server-side request throttling for both
--- a/docs/source/distributions/customizing_run_yaml.md
+++ b/docs/source/distributions/customizing_run_yaml.md
@ -0,0 +1,40 @@
 # Customizing run.yaml Files
 The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
 ## Key Points
 - **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
 - **Customization expected**: Update URLs, credentials, models, and settings for your environment
 - **Version control separately**: Keep customized configs in your own repository
 - **Environment-specific**: Create different configurations for dev, staging, production
 ## What You Can Customize
 You can customize:
 - **Provider endpoints**: Change `http://localhost:8000` to your actual servers
 - **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
 - **Storage paths**: Move from `/tmp/` to production directories
 - **Authentication**: Add API keys, SSL, timeouts
 - **Models**: Different model sizes for dev vs prod
 - **Database settings**: Switch from SQLite to PostgreSQL
 - **Tool configurations**: Add custom tools and integrations
 ## Best Practices
 - Use environment variables for secrets and environment-specific values
 - Create separate `run.yaml` files for different environments (dev, staging, prod)
 - Document your changes with comments
 - Test configurations before deployment
 - Keep your customized configs in version control
 Example structure:
 ```
 your-project/
 ├── configs/
 │   ├── dev-run.yaml
 │   └── prod-run.yaml
 └── README.md
 ```
 The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -6,14 +6,14 @@ This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template starter --image-type venv
+llama stack build --distro starter --image-type venv
 ```
 ```python
-from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+from llama_stack.core.library_client import LlamaStackAsLibraryClient
 client = LlamaStackAsLibraryClient(
-    "ollama",
+    "starter",
    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
 )
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -6,13 +6,10 @@ This section provides an overview of the distributions available in Llama Stack.
 ```{toctree}
 :maxdepth: 3
-
+list_of_distributions
 building_distro
 customizing_run_yaml
 starting_llama_stack_server
 importing_as_library
 configuration
 list_of_distributions
 kubernetes_deployment
 building_distro
 on_device_distro
 remote_hosted_distro
 self_hosted_distro
 ```
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -21,6 +21,24 @@ else
  exit 1
 fi
 if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
  exit 1
 fi
 if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
  exit 1
 fi
 if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
  echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
  exit 1
 fi
 set -euo pipefail
 set -x
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@ -34,6 +34,13 @@ data:
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
@ -122,6 +129,9 @@ data:
      provider_id: rag-runtime
    server:
      port: 8321
      auth:
        provider_config:
          type: github_token
 kind: ConfigMap
 metadata:
  creationTimestamp: null
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -27,7 +27,7 @@ spec:
    spec:
      containers:
      - name: llama-stack
-        image: llamastack/distribution-remote-vllm:latest
+        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
@ -52,7 +52,7 @@ spec:
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
-        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        ports:
          - containerPort: 8321
        volumeMounts:
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@ -31,6 +31,13 @@ providers:
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
@ -119,3 +126,6 @@ tool_groups:
  provider_id: rag-runtime
 server:
  port: 8321
  auth:
    provider_config:
      type: github_token
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@ -26,6 +26,12 @@ spec:
          value: "http://llama-stack-service:8321"
        - name: LLAMA_STACK_UI_PORT
          value: "8322"
        - name: GITHUB_CLIENT_ID
          value: "${GITHUB_CLIENT_ID}"
        - name: GITHUB_CLIENT_SECRET
          value: "${GITHUB_CLIENT_SECRET}"
        - name: NEXTAUTH_URL
          value: "${LLAMA_STACK_UI_URL}:8322"
        args:
          - -c
          - |
--- a/docs/source/distributions/ondevice_distro/android_sdk.md
+++ b/docs/source/distributions/ondevice_distro/android_sdk.md
@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used
 ### Setup Remote Inferencing
 Start a Llama Stack server on localhost. Here is an example of how you can do this using the starter distribution with Fireworks as the inference provider:
 ```
-conda create -n stack-fireworks python=3.10
+uv venv starter --python 3.12
-conda activate stack-fireworks
+source starter/bin/activate  # On Windows: starter\Scripts\activate
 pip install --no-cache llama-stack==0.2.2
-llama stack build --template fireworks --image-type conda
+llama stack build --distro starter --image-type venv
 export FIREWORKS_API_KEY=<SOME_KEY>
-llama stack run fireworks --port 5050
+llama stack run starter --port 5050
 ```
 Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
--- a/docs/source/distributions/remote_hosted_distro/watsonx.md
+++ b/docs/source/distributions/remote_hosted_distro/watsonx.md
@ -57,7 +57,7 @@ Make sure you have access to a watsonx API Key. You can get one by referring [wa
 ## Running Llama Stack with watsonx
-You can do this via Conda (build code), venv or Docker which has a pre-built image.
+You can do this via venv or Docker which has a pre-built image.
 ### Via Docker
@ -76,13 +76,3 @@ docker run \
  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
  --env WATSONX_BASE_URL=$WATSONX_BASE_URL
 ```
 ### Via Conda
 ```bash
 llama stack build --template watsonx --image-type conda
 llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env WATSONX_API_KEY=$WATSONX_API_KEY \
  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
 ```
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@ -114,7 +114,7 @@ podman run --rm -it \
 ## Running Llama Stack
-Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via venv or Docker which has a pre-built image.
 ### Via Docker
@ -153,7 +153,7 @@ docker run \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
-  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-dell \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
@ -164,12 +164,12 @@ docker run \
  --env CHROMA_URL=$CHROMA_URL
 ```
-### Via Conda
+### Via venv
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
-llama stack build --template dell --image-type conda
+llama stack build --distro dell --image-type venv
 llama stack run dell \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@ -70,7 +70,7 @@ $ llama model list --downloaded
 ## Running the Distribution
-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via venv or Docker which has a pre-built image.
 ### Via Docker
@ -104,12 +104,12 @@ docker run \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```
-### Via Conda
+### Via venv
 Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
-llama stack build --template meta-reference-gpu --image-type conda
+llama stack build --distro meta-reference-gpu --image-type venv
 llama stack run distributions/meta-reference-gpu/run.yaml \
  --port 8321 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -1,3 +1,6 @@
 ---
 orphan: true
 ---
 <!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # NVIDIA Distribution
@ -37,16 +40,16 @@ The following environment variables can be configured:
 The following models are available by default:
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
+- `meta/llama3-8b-instruct `
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
+- `meta/llama3-70b-instruct `
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `meta/llama-3.1-8b-instruct `
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
+- `meta/llama-3.1-70b-instruct `
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
+- `meta/llama-3.1-405b-instruct `
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `meta/llama-3.2-1b-instruct `
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `meta/llama-3.2-3b-instruct `
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `meta/llama-3.2-11b-vision-instruct `
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `meta/llama-3.2-90b-vision-instruct `
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `meta/llama-3.3-70b-instruct `
 - `nvidia/llama-3.2-nv-embedqa-1b-v2 `
 - `nvidia/nv-embedqa-e5-v5 `
 - `nvidia/nv-embedqa-mistral-7b-v2 `
@ -130,7 +133,7 @@ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-inst
 ## Running Llama Stack with NVIDIA
-You can do this via Conda or venv (build code), or Docker which has a pre-built image.
+You can do this via venv (build code), or Docker which has a pre-built image.
 ### Via Docker
@ -149,24 +152,13 @@ docker run \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
 ### Via Conda
 ```bash
 INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
 ### Via venv
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-llama stack build --template nvidia --image-type venv
+llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--- a/docs/source/distributions/self_hosted_distro/starter.md
+++ b/docs/source/distributions/self_hosted_distro/starter.md
@ -100,10 +100,6 @@ The following environment variables can be configured:
 ### Model Configuration
 - `INFERENCE_MODEL`: HuggingFace model for serverless inference
 - `INFERENCE_ENDPOINT_NAME`: HuggingFace endpoint name
 - `OLLAMA_INFERENCE_MODEL`: Ollama model name
 - `OLLAMA_EMBEDDING_MODEL`: Ollama embedding model name
 - `OLLAMA_EMBEDDING_DIMENSION`: Ollama embedding dimension (default: `384`)
 - `VLLM_INFERENCE_MODEL`: vLLM model name
 ### Vector Database Configuration
 - `SQLITE_STORE_DIR`: SQLite store directory (default: `~/.llama/distributions/starter`)
@ -127,47 +123,29 @@ The following environment variables can be configured:
 ## Enabling Providers
-You can enable specific providers by setting their provider ID to a valid value using environment variables. This is useful when you want to use certain providers or don't have the required API keys.
+You can enable specific providers by setting appropriate environment variables. For example,
 ### Examples of Enabling Providers
 #### Enable FAISS Vector Provider
 ```bash
-export ENABLE_FAISS=faiss
+# self-hosted
 export OLLAMA_URL=http://localhost:11434   # enables the Ollama inference provider
 export VLLM_URL=http://localhost:8000/v1   # enables the vLLM inference provider
 export TGI_URL=http://localhost:8000/v1   # enables the TGI inference provider
 # cloud-hosted requiring API key configuration on the server
 export CEREBRAS_API_KEY=your_cerebras_api_key   # enables the Cerebras inference provider
 export NVIDIA_API_KEY=your_nvidia_api_key   # enables the NVIDIA inference provider
 # vector providers
 export MILVUS_URL=http://localhost:19530   # enables the Milvus vector provider
 export CHROMADB_URL=http://localhost:8000/v1   # enables the ChromaDB vector provider
 export PGVECTOR_DB=llama_stack_db   # enables the PGVector vector provider
 ```
-#### Enable Ollama Models
+This distribution comes with a default "llama-guard" shield that can be enabled by setting the `SAFETY_MODEL` environment variable to point to an appropriate Llama Guard model id. Use `llama-stack-client models list` to see the list of available models.
 ```bash
 export ENABLE_OLLAMA=ollama
 ```
 #### Disable vLLM Models
 ```bash
 export VLLM_INFERENCE_MODEL=__disabled__
 ```
 #### Disable Optional Vector Providers
 ```bash
 export ENABLE_SQLITE_VEC=__disabled__
 export ENABLE_CHROMADB=__disabled__
 export ENABLE_PGVECTOR=__disabled__
 ```
 ### Provider ID Patterns
 The starter distribution uses several patterns for provider IDs:
 1. **Direct provider IDs**: `faiss`, `ollama`, `vllm`
 2. **Environment-based provider IDs**: `${env.ENABLE_SQLITE_VEC+sqlite-vec}`
 3. **Model-based provider IDs**: `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`
 When using the `+` pattern (like `${env.ENABLE_SQLITE_VEC+sqlite-vec}`), the provider is enabled by default and can be disabled by setting the environment variable to `__disabled__`.
 When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`), the provider is disabled by default and can be enabled by setting the environment variable to a valid value.
 ## Running the Distribution
-You can run the starter distribution via Docker or Conda.
+You can run the starter distribution via Docker or venv.
 ### Via Docker
@ -186,17 +164,12 @@ docker run \
  --port $LLAMA_STACK_PORT
 ```
-### Via Conda
+### Via venv
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
+Ensure you have configured the starter distribution using the environment variables explained above.
 ```bash
-llama stack build --template starter --image-type conda
+uv run --with llama-stack llama stack build --distro starter --image-type venv --run
 llama stack run distributions/starter/run.yaml \
  --port 8321 \
  --env OPENAI_API_KEY=your_openai_key \
  --env FIREWORKS_API_KEY=your_fireworks_key \
  --env TOGETHER_API_KEY=your_together_key
 ```
 ## Example Usage
--- a/docs/source/distributions/starting_llama_stack_server.md
+++ b/docs/source/distributions/starting_llama_stack_server.md
@ -11,12 +11,6 @@ This is the simplest way to get started. Using Llama Stack as a library means yo
 Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
 ## Conda:
 If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
 ## Kubernetes:
 If you have built a container image, you can deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
@ -28,5 +22,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste
 importing_as_library
 configuration
 kubernetes_deployment
 ```
--- a/docs/source/getting_started/demo_script.py
+++ b/docs/source/getting_started/demo_script.py
@ -0,0 +1,67 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
 # End-to-end RAG demo: register a vector DB, ingest one web page into it, then
 # ask an agent a question that it can answer with the knowledge_search tool.
 vector_db_id = "my_demo_vector_db"
 # Talks to a Llama Stack server at this address; the server must already be running.
 client = LlamaStackClient(base_url="http://localhost:8321")
 models = client.models.list()
 # Select the first LLM and first embedding models
 # NOTE: next() is called without a default, so StopIteration is raised if the
 # server exposes no model of the requested type.
 model_id = next(m for m in models if m.model_type == "llm").identifier
 # The walrus binds the selected embedding model to `em` so that its metadata
 # can be read right after taking its identifier.
 embedding_model_id = (
    em := next(m for m in models if m.model_type == "embedding")
 ).identifier
 embedding_dimension = em.metadata["embedding_dimension"]
 # Register the vector DB with the "faiss" provider; the return value is unused.
 _ = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="faiss",
 )
 source = "https://www.paulgraham.com/greatwork.html"
 print("rag_tool> Ingesting document:", source)
 # The document content is the URL itself, not the page body
 # (presumably fetched during ingestion -- TODO confirm against the rag_tool API).
 document = RAGDocument(
    document_id="document_1",
    content=source,
    mime_type="text/html",
    metadata={},
 )
 # Insert the document into the vector DB, requesting 50-token chunks.
 client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=50,
 )
 # Build an agent whose only tool is knowledge_search over the vector DB above.
 agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }
    ],
 )
 prompt = "How do you do great work?"
 print("prompt>", prompt)
 # Toggle between a streamed and a non-streamed turn.
 use_stream = True
 # A fresh session named "rag_session" is created for this single turn.
 response = agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=agent.create_session("rag_session"),
    stream=use_stream,
 )
 # Only call `AgentEventLogger().log(response)` for streaming responses.
 if use_stream:
    for log in AgentEventLogger().log(response):
        log.print()
 else:
    print(response)
--- a/Show more
+++ b/Show more
`@ -1 +1 @@`
	The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.	The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/core/server/endpoints.py` using the `generate.py` utility.