Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-15 14:08:00 +00:00)

Commit f62e6cb063: Merge branch 'main' into make-kvstore-optional

554 changed files with 63,962 additions and 4,870 deletions
.github/actions/run-and-record-tests/action.yml (new file, 82 lines)

@@ -0,0 +1,82 @@
name: 'Run and Record Tests'
description: 'Run integration tests and handle recording/artifact upload'

inputs:
  test-types:
    description: 'JSON array of test types to run'
    required: true
  stack-config:
    description: 'Stack configuration to use'
    required: true
  provider:
    description: 'Provider to use for tests'
    required: true
  inference-mode:
    description: 'Inference mode (record or replay)'
    required: true
  run-vision-tests:
    description: 'Whether to run vision tests'
    required: false
    default: 'false'

runs:
  using: 'composite'
  steps:
    - name: Check Storage and Memory Available Before Tests
      if: ${{ always() }}
      shell: bash
      run: |
        free -h
        df -h

    - name: Run Integration Tests
      shell: bash
      run: |
        ./scripts/integration-tests.sh \
          --stack-config '${{ inputs.stack-config }}' \
          --provider '${{ inputs.provider }}' \
          --test-types '${{ inputs.test-types }}' \
          --inference-mode '${{ inputs.inference-mode }}' \
          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}

    - name: Commit and push recordings
      if: ${{ inputs.inference-mode == 'record' }}
      shell: bash
      run: |
        echo "Checking for recording changes"
        git status --porcelain tests/integration/recordings/

        if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
          echo "New recordings detected, committing and pushing"
          git add tests/integration/recordings/

          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
            git commit -m "Recordings update from CI (vision)"
          else
            git commit -m "Recordings update from CI"
          fi

          git fetch origin ${{ github.event.pull_request.head.ref }}
          git rebase origin/${{ github.event.pull_request.head.ref }}
          echo "Rebased successfully"
          git push origin HEAD:${{ github.event.pull_request.head.ref }}
          echo "Pushed successfully"
        else
          echo "No recording changes"
        fi

    - name: Write inference logs to file
      if: ${{ always() }}
      shell: bash
      run: |
        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true

    - name: Upload logs
      if: ${{ always() }}
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      with:
        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
        path: |
          *.log
        retention-days: 1
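For orientation, a caller workflow wires this composite action in with `uses:` and the inputs defined above. A minimal sketch of such a job follows; the job name and the literal input values are illustrative assumptions, while the action path and input names come from the file above (the real workflows in this diff pin actions to full commit SHAs):

```yaml
jobs:
  integration-replay:                         # hypothetical job name
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4             # illustrative ref

      - name: Run and record tests
        uses: ./.github/actions/run-and-record-tests
        with:
          test-types: '["inference"]'         # illustrative JSON array of test directories
          stack-config: 'server:ci-tests'
          provider: 'ollama'
          inference-mode: 'replay'
          run-vision-tests: 'false'
```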
.github/actions/setup-ollama/action.yml (14 lines changed)

@@ -1,11 +1,23 @@
 name: Setup Ollama
 description: Start Ollama
+inputs:
+  run-vision-tests:
+    description: 'Run vision tests: "true" or "false"'
+    required: false
+    default: 'false'
 runs:
   using: "composite"
   steps:
     - name: Start Ollama
       shell: bash
       run: |
-        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
+        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
+          image="ollama-with-vision-model"
+        else
+          image="ollama-with-models"
+        fi
+
+        echo "Starting Ollama with image: $image"
+        docker run -d --name ollama -p 11434:11434 docker.io/llamastack/$image
         echo "Verifying Ollama status..."
         timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
.github/actions/setup-test-environment/action.yml (new file, 51 lines)

@@ -0,0 +1,51 @@
name: 'Setup Test Environment'
description: 'Common setup steps for integration tests including dependencies, providers, and build'

inputs:
  python-version:
    description: 'Python version to use'
    required: true
  client-version:
    description: 'Client version (latest or published)'
    required: true
  provider:
    description: 'Provider to setup (ollama or vllm)'
    required: true
    default: 'ollama'
  run-vision-tests:
    description: 'Whether to setup provider for vision tests'
    required: false
    default: 'false'
  inference-mode:
    description: 'Inference mode (record or replay)'
    required: true

runs:
  using: 'composite'
  steps:
    - name: Install dependencies
      uses: ./.github/actions/setup-runner
      with:
        python-version: ${{ inputs.python-version }}
        client-version: ${{ inputs.client-version }}

    - name: Setup ollama
      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
      uses: ./.github/actions/setup-ollama
      with:
        run-vision-tests: ${{ inputs.run-vision-tests }}

    - name: Setup vllm
      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
      uses: ./.github/actions/setup-vllm

    - name: Build Llama Stack
      shell: bash
      run: |
        uv run llama stack build --template ci-tests --image-type venv

    - name: Configure git for commits
      shell: bash
      run: |
        git config --local user.email "github-actions[bot]@users.noreply.github.com"
        git config --local user.name "github-actions[bot]"
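Note that both provider setup steps above are gated on `inference-mode == 'record'`, so a replay run never starts Ollama or vLLM. A hedged sketch of the two call patterns, with illustrative values (the action path and input names come from the file above):

```yaml
# Replay on CI: no provider container is started
- uses: ./.github/actions/setup-test-environment
  with:
    python-version: '3.12'
    client-version: 'latest'
    provider: 'ollama'
    run-vision-tests: 'false'
    inference-mode: 'replay'

# Recording run: the Ollama (or vLLM) provider is brought up first
- uses: ./.github/actions/setup-test-environment
  with:
    python-version: '3.12'
    client-version: 'latest'
    provider: 'ollama'
    run-vision-tests: 'true'
    inference-mode: 'record'
```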
.github/workflows/README.md (6 lines changed)

@@ -1,19 +1,19 @@
 # Llama Stack CI

-Llama Stack uses GitHub Actions for Continous Integration (CI). Below is a table detailing what CI the project includes and the purpose.
+Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table detailing what CI the project includes and the purpose.

 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
-| Coverage Badge | [coverage-badge.yml](coverage-badge.yml) | Creates PR for updating the code coverage badge |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests | [integration-tests.yml](integration-tests.yml) | Run the integration test suite with Ollama |
+| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
+| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
.github/workflows/coverage-badge.yml (deleted, 62 lines)

@@ -1,62 +0,0 @@
name: Coverage Badge

run-name: Creates PR for updating the code coverage badge

on:
  push:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/unit/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/unit-tests.yml'
      - '.github/workflows/coverage-badge.yml' # This workflow
  workflow_dispatch:

jobs:
  unit-tests:
    permissions:
      contents: write # for peter-evans/create-pull-request to create branch
      pull-requests: write # for peter-evans/create-pull-request to create a PR
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Run unit tests
        run: |
          ./scripts/unit-tests.sh

      - name: Coverage Badge
        uses: tj-actions/coverage-badge-py@1788babcb24544eb5bbb6e0d374df5d1e54e670f # v2.0.4

      - name: Verify Changed files
        uses: tj-actions/verify-changed-files@a1c6acee9df209257a246f2cc6ae8cb6581c1edf # v20.0.4
        id: verify-changed-files
        with:
          files: coverage.svg

      - name: Commit files
        if: steps.verify-changed-files.outputs.files_changed == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add coverage.svg
          git commit -m "Updated coverage.svg"

      - name: Create Pull Request
        if: steps.verify-changed-files.outputs.files_changed == 'true'
        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          title: "ci: [Automatic] Coverage Badge Update"
          body: |
            This PR updates the coverage badge based on the latest coverage report.

            Automatically generated by the [workflow coverage-badge.yaml](.github/workflows/coverage-badge.yaml)
          delete-branch: true
.github/workflows/integration-tests.yml (127 lines changed)

@@ -1,20 +1,22 @@
-name: Integration Tests
+name: Integration Tests (Replay)

-run-name: Run the integration test suite with Ollama
+run-name: Run the integration test suite from tests/integration in replay mode

 on:
   push:
     branches: [ main ]
   pull_request:
     branches: [ main ]
+    types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
       - 'tests/**'
       - 'uv.lock'
       - 'pyproject.toml'
-      - 'requirements.txt'
       - '.github/workflows/integration-tests.yml' # This workflow
       - '.github/actions/setup-ollama/action.yml'
+      - '.github/actions/setup-test-environment/action.yml'
+      - '.github/actions/run-and-record-tests/action.yml'
   schedule:
     # If changing the cron schedule, update the provider in the test-matrix job
     - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC

@@ -31,129 +33,64 @@ on:
         default: 'ollama'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  # Skip concurrency for pushes to main - each commit should be tested independently
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
   discover-tests:
     runs-on: ubuntu-latest
     outputs:
-      test-type: ${{ steps.generate-matrix.outputs.test-type }}
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}

     steps:
       - name: Checkout repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Generate test matrix
-        id: generate-matrix
+      - name: Generate test types
+        id: generate-test-types
         run: |
           # Get test directories dynamically, excluding non-test directories
+          # NOTE: we are excluding post_training since the tests take too long
           TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases)$" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
             sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT

-  test-matrix:
+  run-replay-mode-tests:
     needs: discover-tests
     runs-on: ubuntu-latest
+    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}

     strategy:
       fail-fast: false
       matrix:
-        test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
         client-type: [library, server]
         # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
         provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
-        python-version: ["3.12", "3.13"]
-        client-version: ${{ (github.event.schedule == '0 0 * * 0' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        exclude: # TODO: look into why these tests are failing and fix them
-          - provider: vllm
-            test-type: safety
-          - provider: vllm
-            test-type: post_training
-          - provider: vllm
-            test-type: tool_runtime
+        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
+        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
+        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
+        run-vision-tests: [true, false]

     steps:
       - name: Checkout repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Install dependencies
-        uses: ./.github/actions/setup-runner
+      - name: Setup test environment
+        uses: ./.github/actions/setup-test-environment
         with:
           python-version: ${{ matrix.python-version }}
           client-version: ${{ matrix.client-version }}
+          provider: ${{ matrix.provider }}
+          run-vision-tests: ${{ matrix.run-vision-tests }}
+          inference-mode: 'replay'

-      - name: Setup ollama
-        if: ${{ matrix.provider == 'ollama' }}
-        uses: ./.github/actions/setup-ollama
-
-      - name: Setup vllm
-        if: ${{ matrix.provider == 'vllm' }}
-        uses: ./.github/actions/setup-vllm
-
-      - name: Build Llama Stack
-        run: |
-          uv run llama stack build --template ci-tests --image-type venv
-
-      - name: Check Storage and Memory Available Before Tests
-        if: ${{ always() }}
-        run: |
-          free -h
-          df -h
-
-      - name: Run Integration Tests
-        env:
-          LLAMA_STACK_CLIENT_TIMEOUT: "300" # Increased timeout for eval operations
-        # Use 'shell' to get pipefail behavior
-        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
-        # TODO: write a precommit hook to detect if a test contains a pipe but does not use 'shell: bash'
-        shell: bash
-        run: |
-          if [ "${{ matrix.client-type }}" == "library" ]; then
-            stack_config="ci-tests"
-          else
-            stack_config="server:ci-tests"
-          fi
-
-          EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
-          if [ "${{ matrix.provider }}" == "ollama" ]; then
-            export OLLAMA_URL="http://0.0.0.0:11434"
-            export TEXT_MODEL=ollama/llama3.2:3b-instruct-fp16
-            export SAFETY_MODEL="ollama/llama-guard3:1b"
-            EXTRA_PARAMS="--safety-shield=llama-guard"
-          else
-            export VLLM_URL="http://localhost:8000/v1"
-            export TEXT_MODEL=vllm/meta-llama/Llama-3.2-1B-Instruct
-            # TODO: remove the not(test_inference_store_tool_calls) once we can get the tool called consistently
-            EXTRA_PARAMS=
-            EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
-          fi
-
-          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-            -k "not( ${EXCLUDE_TESTS} )" \
-            --text-model=$TEXT_MODEL \
-            --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
-            --color=yes ${EXTRA_PARAMS} \
-            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
-
-      - name: Check Storage and Memory Available After Tests
-        if: ${{ always() }}
-        run: |
-          free -h
-          df -h
-
-      - name: Write inference logs to file
-        if: ${{ always() }}
-        run: |
-          sudo docker logs ollama > ollama.log || true
-          sudo docker logs vllm > vllm.log || true
-
-      - name: Upload all logs to artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+      - name: Run tests
+        uses: ./.github/actions/run-and-record-tests
         with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.provider }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
-          path: |
-            *.log
-          retention-days: 1
+          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
+          provider: ${{ matrix.provider }}
+          inference-mode: 'replay'
+          run-vision-tests: ${{ matrix.run-vision-tests }}

.github/workflows/integration-vector-io-tests.yml

@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector"]
+        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
         python-version: ["3.12", "3.13"]
       fail-fast: false # we want to run all tests regardless of failure

@@ -48,6 +48,14 @@
             -e ANONYMIZED_TELEMETRY=FALSE \
             chromadb/chroma:latest

+      - name: Setup Weaviate
+        if: matrix.vector-io-provider == 'remote::weaviate'
+        run: |
+          docker run --rm -d --pull always \
+            --name weaviate \
+            -p 8080:8080 -p 50051:50051 \
+            cr.weaviate.io/semitechnologies/weaviate:1.32.0
+
       - name: Start PGVector DB
         if: matrix.vector-io-provider == 'remote::pgvector'
         run: |

@@ -78,6 +86,29 @@
           PGPASSWORD=llamastack psql -h localhost -U llamastack -d llamastack \
             -c "CREATE EXTENSION IF NOT EXISTS vector;"

+      - name: Setup Qdrant
+        if: matrix.vector-io-provider == 'remote::qdrant'
+        run: |
+          docker run --rm -d --pull always \
+            --name qdrant \
+            -p 6333:6333 \
+            qdrant/qdrant
+
+      - name: Wait for Qdrant to be ready
+        if: matrix.vector-io-provider == 'remote::qdrant'
+        run: |
+          echo "Waiting for Qdrant to be ready..."
+          for i in {1..30}; do
+            if curl -s http://localhost:6333/collections | grep -q '"status":"ok"'; then
+              echo "Qdrant is ready!"
+              exit 0
+            fi
+            sleep 2
+          done
+          echo "Qdrant failed to start"
+          docker logs qdrant
+          exit 1
+
       - name: Wait for ChromaDB to be ready
         if: matrix.vector-io-provider == 'remote::chromadb'
         run: |

@@ -93,6 +124,21 @@
           docker logs chromadb
           exit 1

+      - name: Wait for Weaviate to be ready
+        if: matrix.vector-io-provider == 'remote::weaviate'
+        run: |
+          echo "Waiting for Weaviate to be ready..."
+          for i in {1..30}; do
+            if curl -s http://localhost:8080 | grep -q "https://weaviate.io/developers/weaviate/current/"; then
+              echo "Weaviate is ready!"
+              exit 0
+            fi
+            sleep 2
+          done
+          echo "Weaviate failed to start"
+          docker logs weaviate
+          exit 1
+
       - name: Build Llama Stack
         run: |
           uv run llama stack build --template ci-tests --image-type venv

@@ -113,6 +159,10 @@
           PGVECTOR_DB: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
           PGVECTOR_USER: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
           PGVECTOR_PASSWORD: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
+          ENABLE_QDRANT: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'true' || '' }}
+          QDRANT_URL: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'http://localhost:6333' || '' }}
+          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
+          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
         run: |
           uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
             tests/integration/vector_io \

@@ -134,6 +184,11 @@
         run: |
           docker logs chromadb > chromadb.log

+      - name: Write Qdrant logs to file
+        if: ${{ always() && matrix.vector-io-provider == 'remote::qdrant' }}
+        run: |
+          docker logs qdrant > qdrant.log
+
       - name: Upload all logs to artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
.github/workflows/pre-commit.yml (38 lines changed)

@@ -14,10 +14,18 @@ concurrency:
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
     steps:
       - name: Checkout code
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # For dependabot PRs, we need to checkout with a token that can push changes
+          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
+          # Fetch full history for dependabot PRs to allow commits
+          fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}

       - name: Set up Python
         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0

@@ -29,15 +37,45 @@ jobs:
           .pre-commit-config.yaml

       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github

+      - name: Debug
+        run: |
+          echo "github.ref: ${{ github.ref }}"
+          echo "github.actor: ${{ github.actor }}"
+
+      - name: Commit changes for dependabot PRs
+        if: github.actor == 'dependabot[bot]'
+        run: |
+          if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
+            git config --local user.email "github-actions[bot]@users.noreply.github.com"
+            git config --local user.name "github-actions[bot]"
+
+            # Ensure we're on the correct branch
+            git checkout -B ${{ github.head_ref }}
+            git add -A
+            git commit -m "Apply pre-commit fixes"
+
+            # Pull latest changes from the PR branch and rebase our commit on top
+            git pull --rebase origin ${{ github.head_ref }}
+
+            # Push to the PR branch
+            git push origin ${{ github.head_ref }}
+            echo "Pre-commit fixes committed and pushed"
+          else
+            echo "No changes to commit"
+          fi
+
       - name: Verify if there are any diff files after pre-commit
+        if: github.actor != 'dependabot[bot]'
         run: |
           git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

       - name: Verify if there are any new files after pre-commit
+        if: github.actor != 'dependabot[bot]'
         run: |
           unstaged_files=$(git ls-files --others --exclude-standard)
           if [ -n "$unstaged_files" ]; then
.github/workflows/providers-build.yml (42 lines changed)

@@ -9,20 +9,20 @@ on:
     paths:
       - 'llama_stack/cli/stack/build.py'
       - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/distribution/build.*'
-      - 'llama_stack/distribution/*.sh'
+      - 'llama_stack/core/build.*'
+      - 'llama_stack/core/*.sh'
       - '.github/workflows/providers-build.yml'
-      - 'llama_stack/templates/**'
+      - 'llama_stack/distributions/**'
       - 'pyproject.toml'

   pull_request:
     paths:
       - 'llama_stack/cli/stack/build.py'
       - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/distribution/build.*'
-      - 'llama_stack/distribution/*.sh'
+      - 'llama_stack/core/build.*'
+      - 'llama_stack/core/*.sh'
      - '.github/workflows/providers-build.yml'
-      - 'llama_stack/templates/**'
+      - 'llama_stack/distributions/**'
       - 'pyproject.toml'

 concurrency:

@@ -33,23 +33,23 @@ jobs:
   generate-matrix:
     runs-on: ubuntu-latest
     outputs:
-      templates: ${{ steps.set-matrix.outputs.templates }}
+      distros: ${{ steps.set-matrix.outputs.distros }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Generate Template List
+      - name: Generate Distribution List
         id: set-matrix
         run: |
-          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
-          echo "templates=$templates" >> "$GITHUB_OUTPUT"
+          distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+          echo "distros=$distros" >> "$GITHUB_OUTPUT"

   build:
     needs: generate-matrix
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
+        distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
         image-type: [venv, container]
       fail-fast: false # We want to run all jobs even if some fail

@@ -62,13 +62,13 @@ jobs:
       - name: Print build dependencies
         run: |
-          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+          uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

       - name: Run Llama Stack Build
         run: |
           # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
           # LLAMA_STACK_DIR is set to the current directory so we are building from the source
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test

       - name: Print dependencies in the image
         if: matrix.image-type == 'venv'

@@ -99,16 +99,16 @@ jobs:
       - name: Build a single provider
         run: |
-          yq -i '.image_type = "container"' llama_stack/templates/ci-tests/build.yaml
-          yq -i '.image_name = "test"' llama_stack/templates/ci-tests/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml
+          yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
+          yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

       - name: Inspect the container image entrypoint
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
             echo "Entrypoint is not correct"
             exit 1
           fi

@@ -122,27 +122,27 @@ jobs:
       - name: Install dependencies
         uses: ./.github/actions/setup-runner

-      - name: Pin template to UBI9 base
+      - name: Pin distribution to UBI9 base
         run: |
           yq -i '
           .image_type = "container" |
           .image_name = "ubi9-test" |
           .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/templates/ci-tests/build.yaml
+          ' llama_stack/distributions/ci-tests/build.yaml

       - name: Build dev container (UBI9)
         env:
           USE_COPY_NOT_MOUNT: "true"
           LLAMA_STACK_DIR: "."
         run: |
-          uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml
+          uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

       - name: Inspect UBI9 image
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
             echo "Entrypoint is not correct"
             exit 1
           fi
.github/workflows/record-integration-tests.yml (new file, 109 lines)

@@ -0,0 +1,109 @@
name: Integration Tests (Record)

run-name: Run the integration test suite from tests/integration

on:
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, labeled]
    paths:
      - 'llama_stack/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - '.github/workflows/record-integration-tests.yml' # This workflow
      - '.github/actions/setup-ollama/action.yml'
      - '.github/actions/setup-test-environment/action.yml'
      - '.github/actions/run-and-record-tests/action.yml'
  workflow_dispatch:
    inputs:
      test-provider:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  discover-tests:
    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
        contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
    runs-on: ubuntu-latest
    outputs:
      test-types: ${{ steps.generate-test-types.outputs.test-types }}
      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Generate test types
        id: generate-test-types
        run: |
          # Get test directories dynamically, excluding non-test directories
          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
            sort | jq -R -s -c 'split("\n")[:-1]')
          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT

          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
          echo "labels=$labels"

          modes_array=()
          if [[ $labels == *"re-record-vision-tests"* ]]; then
            modes_array+=("vision")
          fi
          if [[ $labels == *"re-record-tests"* ]]; then
            modes_array+=("non-vision")
          fi

          # Convert to JSON array
          if [ ${#modes_array[@]} -eq 0 ]; then
            matrix_modes="[]"
          else
            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
          fi
          echo "matrix_modes=$matrix_modes"
          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT

        env:
          GH_TOKEN: ${{ github.token }}

  record-tests:
    needs: discover-tests
    runs-on: ubuntu-latest

    permissions:
      contents: write

    strategy:
      fail-fast: false
      matrix:
        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 0

      - name: Setup test environment
        uses: ./.github/actions/setup-test-environment
        with:
          python-version: "3.12"  # Use single Python version for recording
          client-version: "latest"
          provider: ${{ inputs.test-provider || 'ollama' }}
          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
          inference-mode: 'record'

      - name: Run and record tests
        uses: ./.github/actions/run-and-record-tests
        with:
          test-types: ${{ needs.discover-tests.outputs.test-types }}
          stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
          provider: ${{ inputs.test-provider || 'ollama' }}
          inference-mode: 'record'
          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
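To make the discovery step above concrete, here is a hedged sketch of what it emits. The commands are the ones used in the step; the directory names and labels in the sample output are hypothetical, not captured from an actual run:

```bash
# Test types: top-level directories under tests/integration, as a compact JSON array
find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
  grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
  sort | jq -R -s -c 'split("\n")[:-1]'
# -> ["agents","inference","safety","vector_io"]   (illustrative)

# Matrix modes: derived from the PR labels; with both re-record labels applied,
# the same printf/jq conversion used in the step would yield
printf '%s\n' vision non-vision | jq -R -s -c 'split("\n")[:-1]'
# -> ["vision","non-vision"]
```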
.github/workflows/test-external-provider-module.yml

@@ -12,11 +12,13 @@ on:
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
-      - 'requirements.txt'
+      - 'tests/external/*'
       - '.github/workflows/test-external-provider-module.yml' # This workflow

 jobs:
   test-external-providers-from-module:
+    # This workflow is disabled. See https://github.com/meta-llama/llama-stack/pull/2975#issuecomment-3138702984 for details
+    if: false
     runs-on: ubuntu-latest
     strategy:
       matrix:

@@ -46,7 +48,7 @@ jobs:
       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/ramalama-stack/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'

.github/workflows/test-external.yml (5 lines changed)

@@ -13,6 +13,7 @@ on:
       - 'uv.lock'
       - 'pyproject.toml'
       - 'requirements.txt'
+      - 'tests/external/*'
       - '.github/workflows/test-external.yml' # This workflow

 jobs:

@@ -42,11 +43,11 @@ jobs:
       - name: Print distro dependencies
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/build.yaml --print-deps-only
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only

       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'

.github/workflows/unit-tests.yml (2 lines changed)

@@ -35,6 +35,8 @@ jobs:
       - name: Install dependencies
         uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python }}

       - name: Run unit tests
         run: |
.pre-commit-config.yaml

@@ -19,7 +19,6 @@ repos:
       - id: check-yaml
         args: ["--unsafe"]
       - id: detect-private-key
-      - id: requirements-txt-fixer
       - id: mixed-line-ending
         args: [--fix=lf] # Forces to replace line ending by LF (line feed)
       - id: check-executables-have-shebangs

@@ -56,14 +55,6 @@ repos:
     rev: 0.7.20
     hooks:
       - id: uv-lock
-      - id: uv-export
-        args: [
-          "--frozen",
-          "--no-hashes",
-          "--no-emit-project",
-          "--no-default-groups",
-          "--output-file=requirements.txt"
-        ]

   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.16.1

@@ -451,7 +451,7 @@ GenAI application developers need more than just an LLM - they need to integrate

 Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.

-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
+With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.

 ## Release
 After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.

CONTRIBUTING.md

@@ -164,7 +164,7 @@ Some tips about common tasks you work on while contributing to Llama Stack:

 ### Using `llama stack build`

-Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.

 Example:
 ```bash

@@ -172,7 +172,7 @@ cd work/
 git clone https://github.com/meta-llama/llama-stack.git
 git clone https://github.com/meta-llama/llama-stack-client-python.git
 cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
+LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
 ```

 ### Updating distribution configurations

MANIFEST.in

@@ -1,9 +1,9 @@
 include pyproject.toml
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/models/llama/llama4/tokenizer.model
-include llama_stack/distribution/*.sh
+include llama_stack/core/*.sh
 include llama_stack/cli/scripts/*.sh
-include llama_stack/templates/*/*.yaml
+include llama_stack/distributions/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
|
51
README.md
51
README.md
|
@ -6,7 +6,6 @@
|
||||||
[](https://discord.gg/llama-stack)
|
[](https://discord.gg/llama-stack)
|
||||||
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
|
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
|
||||||
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
|
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
|
||||||

|
|
||||||
|
|
||||||
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
|
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
|
||||||
|
|
||||||
|
@ -112,29 +111,33 @@ Here is a list of the various API providers and available distributions that can
|
||||||
Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
|
Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
|
||||||
|
|
||||||
| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
|
| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
|
||||||
|:-------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
|
|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
|
||||||
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| SambaNova | Hosted | | ✅ | | ✅ | | | | |
|
| SambaNova | Hosted | | ✅ | | ✅ | | | | |
|
||||||
| Cerebras | Hosted | | ✅ | | | | | | |
|
| Cerebras | Hosted | | ✅ | | | | | | |
|
||||||
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | | |
|
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | | |
|
||||||
| AWS Bedrock | Hosted | | ✅ | | ✅ | | | | |
|
| AWS Bedrock | Hosted | | ✅ | | ✅ | | | | |
|
||||||
| Together | Hosted | ✅ | ✅ | | ✅ | | | | |
|
| Together | Hosted | ✅ | ✅ | | ✅ | | | | |
|
||||||
| Groq | Hosted | | ✅ | | | | | | |
|
| Groq | Hosted | | ✅ | | | | | | |
|
||||||
| Ollama | Single Node | | ✅ | | | | | | |
|
| Ollama | Single Node | | ✅ | | | | | | |
|
||||||
| TGI | Hosted/Single Node | | ✅ | | | | | | |
|
| TGI | Hosted/Single Node | | ✅ | | | | | | |
|
||||||
| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | | |
|
| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | | |
|
||||||
| ChromaDB | Hosted/Single Node | | | ✅ | | | | | |
|
| ChromaDB | Hosted/Single Node | | | ✅ | | | | | |
|
||||||
| PG Vector | Single Node | | | ✅ | | | | | |
|
| Milvus | Hosted/Single Node | | | ✅ | | | | | |
|
||||||
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | | |
|
| Qdrant | Hosted/Single Node | | | ✅ | | | | | |
|
||||||
| vLLM | Single Node | | ✅ | | | | | | |
|
| Weaviate | Hosted/Single Node | | | ✅ | | | | | |
|
||||||
| OpenAI | Hosted | | ✅ | | | | | | |
|
| SQLite-vec | Single Node | | | ✅ | | | | | |
|
||||||
| Anthropic | Hosted | | ✅ | | | | | | |
|
| PG Vector | Single Node | | | ✅ | | | | | |
|
||||||
| Gemini | Hosted | | ✅ | | | | | | |
|
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | | |
|
||||||
| WatsonX | Hosted | | ✅ | | | | | | |
|
| vLLM | Single Node | | ✅ | | | | | | |
|
||||||
| HuggingFace | Single Node | | | | | | ✅ | | ✅ |
|
| OpenAI | Hosted | | ✅ | | | | | | |
|
||||||
| TorchTune | Single Node | | | | | | ✅ | | |
|
| Anthropic | Hosted | | ✅ | | | | | | |
|
||||||
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
|
| Gemini | Hosted | | ✅ | | | | | | |
|
||||||
| NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |
|
| WatsonX | Hosted | | ✅ | | | | | | |
|
||||||
|
| HuggingFace | Single Node | | | | | | ✅ | | ✅ |
|
||||||
|
| TorchTune | Single Node | | | | | | ✅ | | |
|
||||||
|
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
|
||||||
|
| NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |
|
||||||
|
|
||||||
> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
|
> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
|
||||||
|
|
||||||
|
|
2210 docs/_static/llama-stack-spec.html (vendored): File diff suppressed because it is too large
1351 docs/_static/llama-stack-spec.yaml (vendored): File diff suppressed because it is too large
|
@ -123,7 +123,7 @@
|
||||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
|
"# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
|
||||||
"!uv run --with llama-stack llama stack build --template together --image-type venv \n",
|
"!uv run --with llama-stack llama stack build --distro together --image-type venv \n",
|
||||||
"\n",
|
"\n",
|
||||||
"def run_llama_stack_server_background():\n",
|
"def run_llama_stack_server_background():\n",
|
||||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||||
|
@ -165,7 +165,7 @@
|
||||||
"# use this helper if needed to kill the server \n",
|
"# use this helper if needed to kill the server \n",
|
||||||
"def kill_llama_stack_server():\n",
|
"def kill_llama_stack_server():\n",
|
||||||
" # Kill any existing llama stack server processes\n",
|
" # Kill any existing llama stack server processes\n",
|
||||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -233,7 +233,7 @@
|
||||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# this command installs all the dependencies needed for the llama stack server \n",
|
"# this command installs all the dependencies needed for the llama stack server \n",
|
||||||
"!uv run --with llama-stack llama stack build --template meta-reference-gpu --image-type venv \n",
|
"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv \n",
|
||||||
"\n",
|
"\n",
|
||||||
"def run_llama_stack_server_background():\n",
|
"def run_llama_stack_server_background():\n",
|
||||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||||
|
@ -275,7 +275,7 @@
|
||||||
"# use this helper if needed to kill the server \n",
|
"# use this helper if needed to kill the server \n",
|
||||||
"def kill_llama_stack_server():\n",
|
"def kill_llama_stack_server():\n",
|
||||||
" # Kill any existing llama stack server processes\n",
|
" # Kill any existing llama stack server processes\n",
|
||||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -223,7 +223,7 @@
|
||||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# this command installs all the dependencies needed for the llama stack server \n",
|
"# this command installs all the dependencies needed for the llama stack server \n",
|
||||||
"!uv run --with llama-stack llama stack build --template llama_api --image-type venv \n",
|
"!uv run --with llama-stack llama stack build --distro llama_api --image-type venv \n",
|
||||||
"\n",
|
"\n",
|
||||||
"def run_llama_stack_server_background():\n",
|
"def run_llama_stack_server_background():\n",
|
||||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||||
|
@ -265,7 +265,7 @@
|
||||||
"# use this helper if needed to kill the server \n",
|
"# use this helper if needed to kill the server \n",
|
||||||
"def kill_llama_stack_server():\n",
|
"def kill_llama_stack_server():\n",
|
||||||
" # Kill any existing llama stack server processes\n",
|
" # Kill any existing llama stack server processes\n",
|
||||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -37,7 +37,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"To learn more about torchtune: https://github.com/pytorch/torchtune\n",
|
"To learn more about torchtune: https://github.com/pytorch/torchtune\n",
|
||||||
"\n",
|
"\n",
|
||||||
"We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/templates/experimental-post-training) as the distribution template\n",
|
"We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/experimental-post-training) as the distribution template\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#### 0.0. Prerequisite: Have an OpenAI API key\n",
|
"#### 0.0. Prerequisite: Have an OpenAI API key\n",
|
||||||
"In this showcase, we will use [braintrust](https://www.braintrust.dev/) as scoring provider for eval and it uses OpenAI model as judge model for scoring. So, you need to get an API key from [OpenAI developer platform](https://platform.openai.com/docs/overview).\n",
|
"In this showcase, we will use [braintrust](https://www.braintrust.dev/) as scoring provider for eval and it uses OpenAI model as judge model for scoring. So, you need to get an API key from [OpenAI developer platform](https://platform.openai.com/docs/overview).\n",
|
||||||
|
@ -2864,7 +2864,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!llama stack build --template experimental-post-training --image-type venv --image-name __system__"
|
"!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -3216,19 +3216,19 @@
|
||||||
"INFO:datasets:Duckdb version 1.1.3 available.\n",
|
"INFO:datasets:Duckdb version 1.1.3 available.\n",
|
||||||
"INFO:datasets:TensorFlow version 2.18.0 available.\n",
|
"INFO:datasets:TensorFlow version 2.18.0 available.\n",
|
||||||
"INFO:datasets:JAX version 0.4.33 available.\n",
|
"INFO:datasets:JAX version 0.4.33 available.\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::equality served by basic\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: basic::equality served by basic\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::subset_of served by basic\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: basic::subset_of served by basic\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
|
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
|
||||||
"INFO:llama_stack.distribution.stack:\n"
|
"INFO:llama_stack.core.stack:\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -3448,7 +3448,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
|
"client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
|
||||||
"_ = client.initialize()"
|
"_ = client.initialize()"
|
||||||
]
|
]
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# NBVAL_SKIP\n",
|
"# NBVAL_SKIP\n",
|
||||||
"!pip install -U llama-stack\n",
|
"!pip install -U llama-stack\n",
|
||||||
"!UV_SYSTEM_PYTHON=1 llama stack build --template fireworks --image-type venv"
|
"!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -48,7 +48,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack_client import LlamaStackClient, Agent\n",
|
"from llama_stack_client import LlamaStackClient, Agent\n",
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"from rich.pretty import pprint\n",
|
"from rich.pretty import pprint\n",
|
||||||
"import json\n",
|
"import json\n",
|
||||||
"import uuid\n",
|
"import uuid\n",
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# NBVAL_SKIP\n",
|
"# NBVAL_SKIP\n",
|
||||||
"!UV_SYSTEM_PYTHON=1 llama stack build --template together --image-type venv"
|
"!UV_SYSTEM_PYTHON=1 llama stack build --distro together --image-type venv"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -661,7 +661,7 @@
|
||||||
"except ImportError:\n",
|
"except ImportError:\n",
|
||||||
" print(\"Not in Google Colab environment\")\n",
|
" print(\"Not in Google Colab environment\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"together\")\n",
|
"client = LlamaStackAsLibraryClient(\"together\")\n",
|
||||||
"_ = client.initialize()"
|
"_ = client.initialize()"
|
||||||
|
|
|
@ -35,7 +35,7 @@
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack_client import LlamaStackClient, Agent\n",
|
"from llama_stack_client import LlamaStackClient, Agent\n",
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"from rich.pretty import pprint\n",
|
"from rich.pretty import pprint\n",
|
||||||
"import json\n",
|
"import json\n",
|
||||||
"import uuid\n",
|
"import uuid\n",
|
||||||
|
|
|
@ -92,7 +92,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"```bash\n",
|
"```bash\n",
|
||||||
"LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
|
"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
|
||||||
"```"
|
"```"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -194,7 +194,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
||||||
"client.initialize()"
|
"client.initialize()"
|
||||||
|
|
|
@ -81,7 +81,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"```bash\n",
|
"```bash\n",
|
||||||
"LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
|
"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
|
||||||
"```"
|
"```"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -56,7 +56,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
||||||
"client.initialize()"
|
"client.initialize()"
|
||||||
|
|
|
@ -56,7 +56,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
||||||
"client.initialize()"
|
"client.initialize()"
|
||||||
|
|
|
@ -56,7 +56,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
|
||||||
"client.initialize()"
|
"client.initialize()"
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
|
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/core/server/endpoints.py` using the `generate.py` utility.
|
||||||
|
|
|
@ -17,7 +17,7 @@ import fire
|
||||||
import ruamel.yaml as yaml
|
import ruamel.yaml as yaml
|
||||||
|
|
||||||
from llama_stack.apis.version import LLAMA_STACK_API_VERSION # noqa: E402
|
from llama_stack.apis.version import LLAMA_STACK_API_VERSION # noqa: E402
|
||||||
from llama_stack.distribution.stack import LlamaStack # noqa: E402
|
from llama_stack.core.stack import LlamaStack # noqa: E402
|
||||||
|
|
||||||
from .pyopenapi.options import Options # noqa: E402
|
from .pyopenapi.options import Options # noqa: E402
|
||||||
from .pyopenapi.specification import Info, Server # noqa: E402
|
from .pyopenapi.specification import Info, Server # noqa: E402
|
||||||
|
|
|
@ -12,7 +12,7 @@ from typing import TextIO
|
||||||
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
|
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
|
||||||
|
|
||||||
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
|
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
|
||||||
from llama_stack.distribution.resolver import api_protocol_map
|
from llama_stack.core.resolver import api_protocol_map
|
||||||
|
|
||||||
from .generator import Generator
|
from .generator import Generator
|
||||||
from .options import Options
|
from .options import Options
|
||||||
|
|
|
@ -73,7 +73,7 @@ The API is defined in the [YAML](_static/llama-stack-spec.yaml) and [HTML](_stat
|
||||||
|
|
||||||
To prove out the API, we implemented a handful of use cases to make things more concrete. The [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps) repository contains [6 different examples](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) ranging from very basic to a multi turn agent.
|
To prove out the API, we implemented a handful of use cases to make things more concrete. The [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps) repository contains [6 different examples](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) ranging from very basic to a multi turn agent.
|
||||||
|
|
||||||
There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/distribution/server/server.py) repository.
|
There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/core/server/server.py) repository.
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
|
|
|
@ -145,12 +145,12 @@
|
||||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
|
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
|
||||||
"!uv run --with llama-stack llama stack build --template starter --image-type venv\n",
|
"!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def run_llama_stack_server_background():\n",
|
"def run_llama_stack_server_background():\n",
|
||||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||||
" process = subprocess.Popen(\n",
|
" process = subprocess.Popen(\n",
|
||||||
" f\"uv run --with llama-stack llama stack run starter --image-type venv --env INFERENCE_MODEL=llama3.2:3b\",\n",
|
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv",
|
||||||
" shell=True,\n",
|
" shell=True,\n",
|
||||||
" stdout=log_file,\n",
|
" stdout=log_file,\n",
|
||||||
" stderr=log_file,\n",
|
" stderr=log_file,\n",
|
||||||
|
@ -187,7 +187,7 @@
|
||||||
"# use this helper if needed to kill the server \n",
|
"# use this helper if needed to kill the server \n",
|
||||||
"def kill_llama_stack_server():\n",
|
"def kill_llama_stack_server():\n",
|
||||||
" # Kill any existing llama stack server processes\n",
|
" # Kill any existing llama stack server processes\n",
|
||||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::meta-reference
|
# inline::meta-reference
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# remote::nvidia
|
# remote::nvidia
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -43,7 +43,7 @@ We have built-in functionality to run the supported open-benchmarks using llama-
|
||||||
|
|
||||||
Spin up llama stack server with 'open-benchmark' template
|
Spin up llama stack server with 'open-benchmark' template
|
||||||
```
|
```
|
||||||
llama stack run llama_stack/templates/open-benchmark/run.yaml
|
llama stack run llama_stack/distributions/open-benchmark/run.yaml
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ To use the HF SFTTrainer in your Llama Stack project, follow these steps:
|
||||||
You can access the HuggingFace trainer via the `ollama` distribution:
|
You can access the HuggingFace trainer via the `ollama` distribution:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama stack build --template starter --image-type venv
|
llama stack build --distro starter --image-type venv
|
||||||
llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
|
llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::huggingface
|
# inline::huggingface
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::torchtune
|
# inline::torchtune
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# remote::nvidia
|
# remote::nvidia
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::basic
|
# inline::basic
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::braintrust
|
# inline::braintrust
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
---
|
||||||
|
orphan: true
|
||||||
|
---
|
||||||
|
|
||||||
# inline::llm-as-judge
|
# inline::llm-as-judge
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
|
@ -355,7 +355,7 @@ server:
|
||||||
8. Run the server:
|
8. Run the server:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m llama_stack.distribution.server.server --yaml-config ~/.llama/run-byoa.yaml
|
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
9. Test the API:
|
9. Test the API:
|
||||||
|
|
|
@ -97,11 +97,11 @@ To start the Llama Stack Playground, run the following commands:
|
||||||
1. Start up the Llama Stack API server
|
1. Start up the Llama Stack API server
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama stack build --template together --image-type conda
|
llama stack build --distro together --image-type venv
|
||||||
llama stack run together
|
llama stack run together
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Start Streamlit UI
|
2. Start Streamlit UI
|
||||||
```bash
|
```bash
|
||||||
uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
|
uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
|
||||||
```
|
```
|
||||||
|
|
|
@ -11,4 +11,5 @@ See the [Adding a New API Provider](new_api_provider.md) which describes how to
|
||||||
:hidden:
|
:hidden:
|
||||||
|
|
||||||
new_api_provider
|
new_api_provider
|
||||||
|
testing
|
||||||
```
|
```
|
||||||
|
|
|
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
|
||||||
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
||||||
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
||||||
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
||||||
- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||||
|
|
||||||
|
|
||||||
Here are some example PRs to help you get started:
|
Here are some example PRs to help you get started:
|
||||||
|
@ -52,7 +52,7 @@ def get_base_url(self) -> str:
|
||||||
|
|
||||||
## Testing the Provider
|
## Testing the Provider
|
||||||
|
|
||||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
|
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
|
||||||
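As a minimal sketch of that flow (the pytest target below is an assumption; the sections that follow describe the actual test workflows):

```bash
# Install the dependencies for the distribution under test into a venv
llama stack build --distro together --image-type venv

# Then run the integration suite against it (path assumed; narrow it to the APIs you changed)
pytest -sv tests/integration
```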
|
|
||||||
### 1. Integration Testing
|
### 1. Integration Testing
|
||||||
|
|
||||||
|
|
|
@ -174,7 +174,7 @@ spec:
|
||||||
- name: llama-stack
|
- name: llama-stack
|
||||||
image: localhost/llama-stack-run-k8s:latest
|
image: localhost/llama-stack-run-k8s:latest
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
|
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 5000
|
- containerPort: 5000
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
|
|
|
@ -47,26 +47,26 @@ pip install -e .
|
||||||
```
|
```
|
||||||
Use the CLI to build your distribution.
|
Use the CLI to build your distribution.
|
||||||
The main points to consider are:
|
The main points to consider are:
|
||||||
1. **Image Type** - Do you want a Conda / venv environment or a Container (eg. Docker)
|
1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
|
||||||
2. **Template** - Do you want to use a template to build your distribution? or start from scratch ?
|
2. **Template** - Do you want to use a template to build your distribution? or start from scratch ?
|
||||||
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build -h
|
llama stack build -h
|
||||||
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
|
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
|
||||||
|
|
||||||
Build a Llama stack container
|
Build a Llama stack container
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
|
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
|
||||||
be prompted to enter information interactively (default: None)
|
be prompted to enter information interactively (default: None)
|
||||||
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
|
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
|
||||||
--list-templates Show the available templates for building a Llama Stack distribution (default: False)
|
--list-templates Show the available templates for building a Llama Stack distribution (default: False)
|
||||||
--image-type {conda,container,venv}
|
--image-type {container,venv}
|
||||||
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
|
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
|
||||||
--image-name IMAGE_NAME
|
--image-name IMAGE_NAME
|
||||||
[for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active environment will be used if
|
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if
|
||||||
found. (default: None)
|
found. (default: None)
|
||||||
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
||||||
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
||||||
|
@ -141,7 +141,7 @@ You may then pick a template to build your distribution with providers fitted to
|
||||||
|
|
||||||
For example, to build a distribution with TGI as the inference provider, you can run:
|
For example, to build a distribution with TGI as the inference provider, you can run:
|
||||||
```
|
```
|
||||||
$ llama stack build --template starter
|
$ llama stack build --distro starter
|
||||||
...
|
...
|
||||||
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
||||||
```
|
```
|
||||||
|
@ -159,7 +159,7 @@ It would be best to start with a template and understand the structure of the co
|
||||||
llama stack build
|
llama stack build
|
||||||
|
|
||||||
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
|
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
|
||||||
> Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda
|
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
|
||||||
|
|
||||||
Llama Stack is composed of several APIs working together. Let's select
|
Llama Stack is composed of several APIs working together. Let's select
|
||||||
the provider types (implementations) you want to use for these APIs.
|
the provider types (implementations) you want to use for these APIs.
|
||||||
|
@ -184,10 +184,10 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
|
||||||
:::{tab-item} Building from a pre-existing build config file
|
:::{tab-item} Building from a pre-existing build config file
|
||||||
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
|
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
|
||||||
|
|
||||||
- The config file will be of contents like the ones in `llama_stack/templates/*build.yaml`.
|
- The config file will be of contents like the ones in `llama_stack/distributions/*build.yaml`.
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build --config llama_stack/templates/starter/build.yaml
|
llama stack build --config llama_stack/distributions/starter/build.yaml
|
||||||
```
|
```
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
@ -253,11 +253,11 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm
|
||||||
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build --template starter --image-type container
|
llama stack build --distro starter --image-type container
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
$ llama stack build --template starter --image-type container
|
$ llama stack build --distro starter --image-type container
|
||||||
...
|
...
|
||||||
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
|
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
|
||||||
...
|
...
|
||||||
|
@ -312,7 +312,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
|
||||||
```
|
```
|
||||||
llama stack run -h
|
llama stack run -h
|
||||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
|
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
|
||||||
[--image-type {conda,venv}] [--enable-ui]
|
[--image-type {venv}] [--enable-ui]
|
||||||
[config | template]
|
[config | template]
|
||||||
|
|
||||||
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
||||||
|
@ -326,8 +326,8 @@ options:
|
||||||
--image-name IMAGE_NAME
|
--image-name IMAGE_NAME
|
||||||
Name of the image to run. Defaults to the current environment (default: None)
|
Name of the image to run. Defaults to the current environment (default: None)
|
||||||
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
|
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
|
||||||
--image-type {conda,venv}
|
--image-type {venv}
|
||||||
Image Type used during the build. This can be either conda or venv. (default: None)
|
Image Type used during the build. This should be venv. (default: None)
|
||||||
--enable-ui Start the UI server (default: False)
|
--enable-ui Start the UI server (default: False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -342,9 +342,6 @@ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-
|
||||||
|
|
||||||
# Start using a venv
|
# Start using a venv
|
||||||
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
||||||
|
|
||||||
# Start using a conda environment
|
|
||||||
llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -10,7 +10,6 @@ The default `run.yaml` files generated by templates are starting points for your
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
version: 2
|
version: 2
|
||||||
conda_env: ollama
|
|
||||||
apis:
|
apis:
|
||||||
- agents
|
- agents
|
||||||
- inference
|
- inference
|
||||||
|
|
|
@ -6,14 +6,14 @@ This avoids the overhead of setting up a server.
|
||||||
```bash
|
```bash
|
||||||
# setup
|
# setup
|
||||||
uv pip install llama-stack
|
uv pip install llama-stack
|
||||||
llama stack build --template starter --image-type venv
|
llama stack build --distro starter --image-type venv
|
||||||
```
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
|
from llama_stack.core.library_client import LlamaStackAsLibraryClient
|
||||||
|
|
||||||
client = LlamaStackAsLibraryClient(
|
client = LlamaStackAsLibraryClient(
|
||||||
"ollama",
|
"starter",
|
||||||
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
|
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
|
||||||
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
|
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
|
||||||
)
|
)
|
||||||
|
|
|
@ -9,6 +9,7 @@ This section provides an overview of the distributions available in Llama Stack.
|
||||||
list_of_distributions
|
list_of_distributions
|
||||||
building_distro
|
building_distro
|
||||||
customizing_run_yaml
|
customizing_run_yaml
|
||||||
|
starting_llama_stack_server
|
||||||
importing_as_library
|
importing_as_library
|
||||||
configuration
|
configuration
|
||||||
```
|
```
|
||||||
|
|
|
@ -34,6 +34,13 @@ data:
|
||||||
provider_type: remote::chromadb
|
provider_type: remote::chromadb
|
||||||
config:
|
config:
|
||||||
url: ${env.CHROMADB_URL:=}
|
url: ${env.CHROMADB_URL:=}
|
||||||
|
kvstore:
|
||||||
|
type: postgres
|
||||||
|
host: ${env.POSTGRES_HOST:=localhost}
|
||||||
|
port: ${env.POSTGRES_PORT:=5432}
|
||||||
|
db: ${env.POSTGRES_DB:=llamastack}
|
||||||
|
user: ${env.POSTGRES_USER:=llamastack}
|
||||||
|
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||||
safety:
|
safety:
|
||||||
- provider_id: llama-guard
|
- provider_id: llama-guard
|
||||||
provider_type: inline::llama-guard
|
provider_type: inline::llama-guard
|
||||||
|
|
|
@ -52,7 +52,7 @@ spec:
|
||||||
value: "${SAFETY_MODEL}"
|
value: "${SAFETY_MODEL}"
|
||||||
- name: TAVILY_SEARCH_API_KEY
|
- name: TAVILY_SEARCH_API_KEY
|
||||||
value: "${TAVILY_SEARCH_API_KEY}"
|
value: "${TAVILY_SEARCH_API_KEY}"
|
||||||
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
|
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8321
|
- containerPort: 8321
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
|
|
|
@ -31,6 +31,13 @@ providers:
|
||||||
provider_type: remote::chromadb
|
provider_type: remote::chromadb
|
||||||
config:
|
config:
|
||||||
url: ${env.CHROMADB_URL:=}
|
url: ${env.CHROMADB_URL:=}
|
||||||
|
kvstore:
|
||||||
|
type: postgres
|
||||||
|
host: ${env.POSTGRES_HOST:=localhost}
|
||||||
|
port: ${env.POSTGRES_PORT:=5432}
|
||||||
|
db: ${env.POSTGRES_DB:=llamastack}
|
||||||
|
user: ${env.POSTGRES_USER:=llamastack}
|
||||||
|
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||||
safety:
|
safety:
|
||||||
- provider_id: llama-guard
|
- provider_id: llama-guard
|
||||||
provider_type: inline::llama-guard
|
provider_type: inline::llama-guard
|
||||||
|
|
|
@ -56,10 +56,10 @@ Breaking down the demo app, this section will show the core pieces that are used
|
||||||
### Setup Remote Inferencing
|
### Setup Remote Inferencing
|
||||||
Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution:
|
Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution:
|
||||||
```
|
```
|
||||||
conda create -n stack-fireworks python=3.10
|
python -m venv stack-fireworks
|
||||||
conda activate stack-fireworks
|
source stack-fireworks/bin/activate # On Windows: stack-fireworks\Scripts\activate
|
||||||
pip install --no-cache llama-stack==0.2.2
|
pip install --no-cache llama-stack==0.2.2
|
||||||
llama stack build --template fireworks --image-type conda
|
llama stack build --distro fireworks --image-type venv
|
||||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||||
llama stack run fireworks --port 5050
|
llama stack run fireworks --port 5050
|
||||||
```
|
```
|
||||||
|
|
|
@ -57,7 +57,7 @@ Make sure you have access to a watsonx API Key. You can get one by referring [wa
|
||||||
|
|
||||||
## Running Llama Stack with watsonx
|
## Running Llama Stack with watsonx
|
||||||
|
|
||||||
You can do this via Conda (build code), venv or Docker which has a pre-built image.
|
You can do this via venv or Docker which has a pre-built image.
|
||||||
|
|
||||||
### Via Docker
|
### Via Docker
|
||||||
|
|
||||||
|
@ -76,13 +76,3 @@ docker run \
|
||||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
|
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
|
||||||
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
|
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
|
||||||
```
|
```
|
||||||
|
|
||||||
### Via Conda
|
|
||||||
|
|
||||||
```bash
|
|
||||||
llama stack build --template watsonx --image-type conda
|
|
||||||
llama stack run ./run.yaml \
|
|
||||||
--port $LLAMA_STACK_PORT \
|
|
||||||
--env WATSONX_API_KEY=$WATSONX_API_KEY \
|
|
||||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
|
|
||||||
```
|
|
||||||
|
|
|
@ -114,7 +114,7 @@ podman run --rm -it \
|
||||||
|
|
||||||
## Running Llama Stack
|
## Running Llama Stack
|
||||||
|
|
||||||
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via venv or Docker which has a pre-built image.
|
||||||
|
|
||||||
### Via Docker
|
### Via Docker
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ docker run \
|
||||||
--pull always \
|
--pull always \
|
||||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||||
-v $HOME/.llama:/root/.llama \
|
-v $HOME/.llama:/root/.llama \
|
||||||
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
|
-v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
|
||||||
llamastack/distribution-dell \
|
llamastack/distribution-dell \
|
||||||
--config /root/my-run.yaml \
|
--config /root/my-run.yaml \
|
||||||
--port $LLAMA_STACK_PORT \
|
--port $LLAMA_STACK_PORT \
|
||||||
|
@ -164,12 +164,12 @@ docker run \
|
||||||
--env CHROMA_URL=$CHROMA_URL
|
--env CHROMA_URL=$CHROMA_URL
|
||||||
```
|
```
|
||||||
|
|
||||||
### Via Conda
|
### Via venv
|
||||||
|
|
||||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama stack build --template dell --image-type conda
|
llama stack build --distro dell --image-type venv
|
||||||
llama stack run dell
|
llama stack run dell
|
||||||
--port $LLAMA_STACK_PORT \
|
--port $LLAMA_STACK_PORT \
|
||||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||||
|
|
|
@ -70,7 +70,7 @@ $ llama model list --downloaded
|
||||||
|
|
||||||
## Running the Distribution
|
## Running the Distribution
|
||||||
|
|
||||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
You can do this via venv or Docker which has a pre-built image.
|
||||||
|
|
||||||
### Via Docker
|
### Via Docker
|
||||||
|
|
||||||
|
@ -104,12 +104,12 @@ docker run \
|
||||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||||
```
|
```
|
||||||
|
|
||||||
### Via Conda
|
### Via venv
|
||||||
|
|
||||||
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama stack build --template meta-reference-gpu --image-type conda
|
llama stack build --distro meta-reference-gpu --image-type venv
|
||||||
llama stack run distributions/meta-reference-gpu/run.yaml \
|
llama stack run distributions/meta-reference-gpu/run.yaml \
|
||||||
--port 8321 \
|
--port 8321 \
|
||||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||||
|
|
|
@ -133,7 +133,7 @@ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-inst
|
||||||
|
|
||||||
## Running Llama Stack with NVIDIA
|
## Running Llama Stack with NVIDIA
|
||||||
|
|
||||||
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
|
You can do this via venv (build code), or Docker which has a pre-built image.
|
||||||
|
|
||||||
### Via Docker
|
### Via Docker
|
||||||
|
|
||||||
|
@ -152,24 +152,13 @@ docker run \
|
||||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||||
```
|
```
|
||||||
|
|
||||||
### Via Conda
|
|
||||||
|
|
||||||
```bash
|
|
||||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
|
||||||
llama stack build --template nvidia --image-type conda
|
|
||||||
llama stack run ./run.yaml \
|
|
||||||
--port 8321 \
|
|
||||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
|
||||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
|
||||||
```
|
|
||||||
|
|
||||||
### Via venv
|
### Via venv
|
||||||
|
|
||||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||||
llama stack build --template nvidia --image-type venv
|
llama stack build --distro nvidia --image-type venv
|
||||||
llama stack run ./run.yaml \
|
llama stack run ./run.yaml \
|
||||||
--port 8321 \
|
--port 8321 \
|
||||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||||
|
|
|
@ -100,10 +100,6 @@ The following environment variables can be configured:
|
||||||
### Model Configuration
|
### Model Configuration
|
||||||
- `INFERENCE_MODEL`: HuggingFace model for serverless inference
|
- `INFERENCE_MODEL`: HuggingFace model for serverless inference
|
||||||
- `INFERENCE_ENDPOINT_NAME`: HuggingFace endpoint name
|
- `INFERENCE_ENDPOINT_NAME`: HuggingFace endpoint name
|
||||||
- `OLLAMA_INFERENCE_MODEL`: Ollama model name
|
|
||||||
- `OLLAMA_EMBEDDING_MODEL`: Ollama embedding model name
|
|
||||||
- `OLLAMA_EMBEDDING_DIMENSION`: Ollama embedding dimension (default: `384`)
|
|
||||||
- `VLLM_INFERENCE_MODEL`: vLLM model name
|
|
||||||
|
|
||||||
### Vector Database Configuration
|
### Vector Database Configuration
|
||||||
- `SQLITE_STORE_DIR`: SQLite store directory (default: `~/.llama/distributions/starter`)
|
- `SQLITE_STORE_DIR`: SQLite store directory (default: `~/.llama/distributions/starter`)
|
||||||
|
@ -127,47 +123,29 @@ The following environment variables can be configured:
|
||||||
|
|
||||||
## Enabling Providers
|
## Enabling Providers
|
||||||
|
|
||||||
You can enable specific providers by setting their provider ID to a valid value using environment variables. This is useful when you want to use certain providers or don't have the required API keys.
|
You can enable specific providers by setting appropriate environment variables. For example,
|
||||||
|
|
||||||
### Examples of Enabling Providers
|
|
||||||
|
|
||||||
#### Enable FAISS Vector Provider
|
|
||||||
```bash
|
```bash
|
||||||
export ENABLE_FAISS=faiss
|
# self-hosted
|
||||||
|
export OLLAMA_URL=http://localhost:11434 # enables the Ollama inference provider
|
||||||
|
export VLLM_URL=http://localhost:8000/v1 # enables the vLLM inference provider
|
||||||
|
export TGI_URL=http://localhost:8000/v1 # enables the TGI inference provider
|
||||||
|
|
||||||
|
# cloud-hosted requiring API key configuration on the server
|
||||||
|
export CEREBRAS_API_KEY=your_cerebras_api_key # enables the Cerebras inference provider
|
||||||
|
export NVIDIA_API_KEY=your_nvidia_api_key # enables the NVIDIA inference provider
|
||||||
|
|
||||||
|
# vector providers
|
||||||
|
export MILVUS_URL=http://localhost:19530 # enables the Milvus vector provider
|
||||||
|
export CHROMADB_URL=http://localhost:8000/v1 # enables the ChromaDB vector provider
|
||||||
|
export PGVECTOR_DB=llama_stack_db # enables the PGVector vector provider
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Enable Ollama Models
|
This distribution comes with a default "llama-guard" shield that can be enabled by setting the `SAFETY_MODEL` environment variable to point to an appropriate Llama Guard model id. Use `llama-stack-client models list` to see the list of available models.
|
||||||
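As a concrete sketch, the shield can be switched on like this (the model id is an assumption taken from elsewhere in these docs; confirm it with `llama-stack-client models list`):

```bash
# Point SAFETY_MODEL at a Llama Guard model to enable the default llama-guard shield
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

# List the models the running stack serves to confirm the id
llama-stack-client models list
```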
```bash
|
|
||||||
export ENABLE_OLLAMA=ollama
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Disable vLLM Models
|
|
||||||
```bash
|
|
||||||
export VLLM_INFERENCE_MODEL=__disabled__
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Disable Optional Vector Providers
|
|
||||||
```bash
|
|
||||||
export ENABLE_SQLITE_VEC=__disabled__
|
|
||||||
export ENABLE_CHROMADB=__disabled__
|
|
||||||
export ENABLE_PGVECTOR=__disabled__
|
|
||||||
```
|
|
||||||
|
|
||||||
### Provider ID Patterns
|
|
||||||
|
|
||||||
The starter distribution uses several patterns for provider IDs:
|
|
||||||
|
|
||||||
1. **Direct provider IDs**: `faiss`, `ollama`, `vllm`
|
|
||||||
2. **Environment-based provider IDs**: `${env.ENABLE_SQLITE_VEC:+sqlite-vec}`
|
|
||||||
3. **Model-based provider IDs**: `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`
|
|
||||||
|
|
||||||
When using the `+` pattern (like `${env.ENABLE_SQLITE_VEC+sqlite-vec}`), the provider is enabled by default and can be disabled by setting the environment variable to `__disabled__`.
|
|
||||||
|
|
||||||
When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`), the provider is disabled by default and can be enabled by setting the environment variable to a valid value.
|
|
||||||
|
|
||||||
## Running the Distribution

-You can run the starter distribution via Docker, Conda, or venv.
+You can run the starter distribution via Docker or venv.

### Via Docker

@ -186,12 +164,12 @@ docker run \
  --port $LLAMA_STACK_PORT
```

-### Via Conda or venv
+### Via venv

Ensure you have configured the starter distribution using the environment variables explained above.

```bash
-uv run --with llama-stack llama stack build --template starter --image-type <conda|venv> --run
+uv run --with llama-stack llama stack build --distro starter --image-type venv --run
```

## Example Usage

@ -11,12 +11,6 @@ This is the simplest way to get started. Using Llama Stack as a library means yo
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.

-## Conda:
-
-If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.

## Kubernetes:

If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.

@ -59,10 +59,10 @@ Now let's build and run the Llama Stack config for Ollama.
We use `starter` as the template. By default all providers are disabled, so you need to enable Ollama by passing environment variables.

```bash
-llama stack build --template starter --image-type venv --run
+llama stack build --distro starter --image-type venv --run
```
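A hedged example of enabling Ollama through environment variables when building and running (the variable names come from the starter distribution notes above; the URL assumes a local Ollama on its default port):

```bash
ENABLE_OLLAMA=ollama \
OLLAMA_URL=http://localhost:11434 \
llama stack build --distro starter --image-type venv --run
```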
:::
-:::{tab-item} Using `conda`
+:::{tab-item} Using `venv`
You can use Python to build and run the Llama Stack server, which is useful for testing and development.

Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,

@ -70,7 +70,7 @@ which defines the providers and their settings.
Now let's build and run the Llama Stack config for Ollama.

```bash
-llama stack build --template starter --image-type conda --run
+llama stack build --distro starter --image-type venv --run
```
:::
:::{tab-item} Using a Container

@ -150,10 +150,10 @@ pip install llama-stack-client
```
:::

-:::{tab-item} Install with `conda`
+:::{tab-item} Install with `venv`
```bash
-yes | conda create -n stack-client python=3.12
-conda activate stack-client
+python -m venv stack-client
+source stack-client/bin/activate # On Windows: stack-client\Scripts\activate
pip install llama-stack-client
```
:::

@ -16,10 +16,13 @@ as the inference [provider](../providers/inference/index) for a Llama Model.
```bash
ollama run llama3.2:3b --keepalive 60m
```

#### Step 2: Run the Llama Stack server

We will use `uv` to run the Llama Stack server.
```bash
-uv run --with llama-stack llama stack build --template starter --image-type venv --run
+OLLAMA_URL=http://localhost:11434 \
+uv run --with llama-stack llama stack build --distro starter --image-type venv --run
```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
@ -1,5 +1,13 @@
-# Agents Providers
+# Agents
+
+## Overview

This section contains documentation for all available providers for the **agents** API.

-- [inline::meta-reference](inline_meta-reference.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_meta-reference
+```

@ -1,7 +1,15 @@
-# Datasetio Providers
+# Datasetio
+
+## Overview

This section contains documentation for all available providers for the **datasetio** API.

-- [inline::localfs](inline_localfs.md)
-- [remote::huggingface](remote_huggingface.md)
-- [remote::nvidia](remote_nvidia.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_localfs
+remote_huggingface
+remote_nvidia
+```

@ -1,6 +1,14 @@
-# Eval Providers
+# Eval
+
+## Overview

This section contains documentation for all available providers for the **eval** API.

-- [inline::meta-reference](inline_meta-reference.md)
-- [remote::nvidia](remote_nvidia.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_meta-reference
+remote_nvidia
+```
@ -1,9 +1,4 @@
-# External Providers Guide
+# Creating External Providers

-Llama Stack supports external providers that live outside of the main codebase. This allows you to:
-- Create and maintain your own providers independently
-- Share providers with others without contributing to the main codebase
-- Keep provider-specific code separate from the core Llama Stack code

## Configuration

@ -12,8 +7,7 @@ To enable external providers, you need to add `module` into your build yaml, all
an example entry in your build.yaml should look like:

```
-- provider_id: ramalama
-  provider_type: remote::ramalama
+- provider_type: remote::ramalama
  module: ramalama_stack
```

@ -56,17 +50,6 @@ Llama Stack supports two types of external providers:
1. **Remote Providers**: Providers that communicate with external services (e.g., cloud APIs)
2. **Inline Providers**: Providers that run locally within the Llama Stack process

-## Known External Providers
-
-Here's a list of known external providers that you can use with Llama Stack:
-
-| Name | Description | API | Type | Repository |
-|------|-------------|-----|------|------------|
-| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
-| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
-| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
-| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |

### Remote Provider Specification

Remote providers are used when you need to communicate with external services. Here's an example for a custom Ollama provider:

@ -120,9 +103,9 @@ container_image: custom-vector-store:latest # optional
- `provider_data_validator`: Optional validator for provider data
- `container_image`: Optional container image to use instead of pip packages

-## Required Implementation
+## Required Fields

-## All Providers
+### All Providers

All providers must contain a `get_provider_spec` function in their `provider` module. This is a standardized structure that Llama Stack expects and is necessary for getting things such as the config class. The `get_provider_spec` method returns a structure identical to the `adapter`. An example function may look like:

@ -147,7 +130,7 @@ def get_provider_spec() -> ProviderSpec:
    )
```

-### Remote Providers
+#### Remote Providers

Remote providers must expose a `get_adapter_impl()` function in their module that takes two arguments:
1. `config`: An instance of the provider's config class

@ -163,7 +146,7 @@ async def get_adapter_impl(
    return OllamaInferenceAdapter(config)
```

-### Inline Providers
+#### Inline Providers

Inline providers must expose a `get_provider_impl()` function in their module that takes two arguments:
1. `config`: An instance of the provider's config class
@ -190,7 +173,40 @@ Version: 0.1.0
Location: /path/to/venv/lib/python3.10/site-packages
```

-## Example using `external_providers_dir`: Custom Ollama Provider
+## Best Practices
+
+1. **Package Naming**: Use the prefix `llama-stack-provider-` for your provider packages to make them easily identifiable.
+
+2. **Version Management**: Keep your provider package versioned and compatible with the Llama Stack version you're using.
+
+3. **Dependencies**: Only include the minimum required dependencies in your provider package.
+
+4. **Documentation**: Include clear documentation in your provider package about:
+   - Installation requirements
+   - Configuration options
+   - Usage examples
+   - Any limitations or known issues
+
+5. **Testing**: Include tests in your provider package to ensure it works correctly with Llama Stack.
+   You can refer to the [integration tests guide](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more information. Execute the test for the Provider type you are developing.
+
+## Troubleshooting
+
+If your external provider isn't being loaded:
+
+1. Check that `module` points to a published pip package with a top level `provider` module including `get_provider_spec`.
+2. Check that the `external_providers_dir` path is correct and accessible.
+3. Verify that the YAML files are properly formatted.
+4. Ensure all required Python packages are installed.
+5. Check the Llama Stack server logs for any error messages - turn on debug logging to get more information using `LLAMA_STACK_LOGGING=all=debug` (see the sketch after this list).
+6. Verify that the provider package is installed in your Python environment if using `external_providers_dir`.
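As a quick, hedged illustration of the debug-logging tip above (the distribution name is a placeholder -- use whichever distribution you actually run):

```bash
LLAMA_STACK_LOGGING=all=debug llama stack run starter
```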
## Examples

### Example using `external_providers_dir`: Custom Ollama Provider

Here's a complete example of creating and using a custom Ollama provider:

@ -242,7 +258,7 @@ external_providers_dir: ~/.llama/providers.d/
The provider will now be available in Llama Stack with the type `remote::custom_ollama`.

-## Example using `module`: ramalama-stack
+### Example using `module`: ramalama-stack

[ramalama-stack](https://github.com/containers/ramalama-stack) is a recognized external provider that supports installation via module.

@ -255,8 +271,7 @@ distribution_spec:
  container_image: null
  providers:
    inference:
-    - provider_id: ramalama
-      provider_type: remote::ramalama
+    - provider_type: remote::ramalama
      module: ramalama_stack==0.3.0a0
image_type: venv
image_name: null

@ -268,35 +283,4 @@ additional_pip_packages:
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc.

The provider will now be available in Llama Stack with the type `remote::ramalama`.

-## Best Practices
-
-1. **Package Naming**: Use the prefix `llama-stack-provider-` for your provider packages to make them easily identifiable.
-2. **Version Management**: Keep your provider package versioned and compatible with the Llama Stack version you're using.
-3. **Dependencies**: Only include the minimum required dependencies in your provider package.
-4. **Documentation**: Include clear documentation in your provider package about:
-   - Installation requirements
-   - Configuration options
-   - Usage examples
-   - Any limitations or known issues
-5. **Testing**: Include tests in your provider package to ensure it works correctly with Llama Stack. You can refer to the [integration tests guide](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more information. Execute the test for the Provider type you are developing.
-
-## Troubleshooting
-
-If your external provider isn't being loaded:
-
-1. Check that `module` points to a published pip package with a top level `provider` module including `get_provider_spec`.
-1. Check that the `external_providers_dir` path is correct and accessible.
-2. Verify that the YAML files are properly formatted.
-3. Ensure all required Python packages are installed.
-4. Check the Llama Stack server logs for any error messages - turn on debug logging to get more information using `LLAMA_STACK_LOGGING=all=debug`.
-5. Verify that the provider package is installed in your Python environment if using `external_providers_dir`.

10 docs/source/providers/external/external-providers-list.md vendored Normal file
@ -0,0 +1,10 @@
# Known External Providers

Here's a list of known external providers that you can use with Llama Stack:

| Name | Description | API | Type | Repository |
|------|-------------|-----|------|------------|
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |

13 docs/source/providers/external/index.md vendored Normal file
@ -0,0 +1,13 @@
# External Providers

Llama Stack supports external providers that live outside of the main codebase. This allows you to:
- Create and maintain your own providers independently
- Share providers with others without contributing to the main codebase
- Keep provider-specific code separate from the core Llama Stack code

```{toctree}
:maxdepth: 1

external-providers-list
external-providers-guide
```
@ -1,5 +1,13 @@
-# Files Providers
+# Files
+
+## Overview

This section contains documentation for all available providers for the **files** API.

-- [inline::localfs](inline_localfs.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_localfs
+```
@ -1,4 +1,4 @@
-# API Providers Overview
+# API Providers

The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),

@ -12,81 +12,17 @@ Providers come in two flavors:

Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.

-## External Providers
-Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.
-
-```{toctree}
-:maxdepth: 1
-
-external.md
-```
-
-```{include} openai.md
-:start-after: ## OpenAI API Compatibility
-```
-
-## Inference
-Runs inference with an LLM.
-
```{toctree}
:maxdepth: 1

+external/index
+openai
inference/index
-```
-
-## Agents
-Run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
-
-```{toctree}
-:maxdepth: 1
-
agents/index
-```
-
-## DatasetIO
-Interfaces with datasets and data loaders.
-
-```{toctree}
-:maxdepth: 1
-
datasetio/index
-```
-
-## Safety
-Applies safety policies to the output at a Systems (not only model) level.
-
-```{toctree}
-:maxdepth: 1
-
safety/index
-```
-
-## Telemetry
-Collects telemetry data from the system.
-
-```{toctree}
-:maxdepth: 1
-
telemetry/index
-```
-
-## Vector IO
-
-Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
-Vector IO plays a crucial role in [Retreival Augmented Generation (RAG)](../..//building_applications/rag), where the vector
-io and database are used to store and retrieve documents for retrieval.
-
-```{toctree}
-:maxdepth: 1
-
vector_io/index
-```
-
-## Tool Runtime
-Is associated with the ToolGroup resources.
-
-```{toctree}
-:maxdepth: 1
-
tool_runtime/index
-```
+files/index
+```
@ -1,26 +1,34 @@
-# Inference Providers
+# Inference
+
+## Overview

This section contains documentation for all available providers for the **inference** API.

-- [inline::meta-reference](inline_meta-reference.md)
-- [inline::sentence-transformers](inline_sentence-transformers.md)
-- [remote::anthropic](remote_anthropic.md)
-- [remote::bedrock](remote_bedrock.md)
-- [remote::cerebras](remote_cerebras.md)
-- [remote::databricks](remote_databricks.md)
-- [remote::fireworks](remote_fireworks.md)
-- [remote::gemini](remote_gemini.md)
-- [remote::groq](remote_groq.md)
-- [remote::hf::endpoint](remote_hf_endpoint.md)
-- [remote::hf::serverless](remote_hf_serverless.md)
-- [remote::llama-openai-compat](remote_llama-openai-compat.md)
-- [remote::nvidia](remote_nvidia.md)
-- [remote::ollama](remote_ollama.md)
-- [remote::openai](remote_openai.md)
-- [remote::passthrough](remote_passthrough.md)
-- [remote::runpod](remote_runpod.md)
-- [remote::sambanova](remote_sambanova.md)
-- [remote::tgi](remote_tgi.md)
-- [remote::together](remote_together.md)
-- [remote::vllm](remote_vllm.md)
-- [remote::watsonx](remote_watsonx.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_meta-reference
+inline_sentence-transformers
+remote_anthropic
+remote_bedrock
+remote_cerebras
+remote_databricks
+remote_fireworks
+remote_gemini
+remote_groq
+remote_hf_endpoint
+remote_hf_serverless
+remote_llama-openai-compat
+remote_nvidia
+remote_ollama
+remote_openai
+remote_passthrough
+remote_runpod
+remote_sambanova
+remote_tgi
+remote_together
+remote_vllm
+remote_watsonx
+```
@ -1,21 +0,0 @@
-# remote::cerebras-openai-compat
-
-## Description
-
-Cerebras OpenAI-compatible provider for using Cerebras models with OpenAI API format.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `api_key` | `str \| None` | No | | The Cerebras API key |
-| `openai_compat_api_base` | `<class 'str'>` | No | https://api.cerebras.ai/v1 | The URL for the Cerebras API server |
-
-## Sample Configuration
-
-```yaml
-openai_compat_api_base: https://api.cerebras.ai/v1
-api_key: ${env.CEREBRAS_API_KEY}
-```

@ -1,21 +0,0 @@
-# remote::fireworks-openai-compat
-
-## Description
-
-Fireworks AI OpenAI-compatible provider for using Fireworks models with OpenAI API format.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `api_key` | `str \| None` | No | | The Fireworks API key |
-| `openai_compat_api_base` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks API server |
-
-## Sample Configuration
-
-```yaml
-openai_compat_api_base: https://api.fireworks.ai/inference/v1
-api_key: ${env.FIREWORKS_API_KEY}
-```

@ -1,21 +0,0 @@
-# remote::groq-openai-compat
-
-## Description
-
-Groq OpenAI-compatible provider for using Groq models with OpenAI API format.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `api_key` | `str \| None` | No | | The Groq API key |
-| `openai_compat_api_base` | `<class 'str'>` | No | https://api.groq.com/openai/v1 | The URL for the Groq API server |
-
-## Sample Configuration
-
-```yaml
-openai_compat_api_base: https://api.groq.com/openai/v1
-api_key: ${env.GROQ_API_KEY}
-```
@ -9,11 +9,13 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | API key for OpenAI models |
+| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |

## Sample Configuration

```yaml
api_key: ${env.OPENAI_API_KEY:=}
+base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
```

@ -1,21 +0,0 @@
-# remote::together-openai-compat
-
-## Description
-
-Together AI OpenAI-compatible provider for using Together models with OpenAI API format.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `api_key` | `str \| None` | No | | The Together API key |
-| `openai_compat_api_base` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together API server |
-
-## Sample Configuration
-
-```yaml
-openai_compat_api_base: https://api.together.xyz/v1
-api_key: ${env.TOGETHER_API_KEY}
-```
@ -1,7 +1,15 @@
-# Post_Training Providers
+# Post_Training
+
+## Overview

This section contains documentation for all available providers for the **post_training** API.

-- [inline::huggingface](inline_huggingface.md)
-- [inline::torchtune](inline_torchtune.md)
-- [remote::nvidia](remote_nvidia.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_huggingface
+inline_torchtune
+remote_nvidia
+```
@ -24,6 +24,10 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
+| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
+| `use_reference_model` | `<class 'bool'>` | No | True | |
+| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair']` | No | sigmoid | |
+| `dpo_output_dir` | `<class 'str'>` | No | ./checkpoints/dpo | |

## Sample Configuration
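For orientation, a hedged sketch of how the newly added DPO fields might be set in a provider config, using the defaults from the table above (illustrative only, not the distribution's actual sample configuration):

```yaml
dpo_beta: 0.1
use_reference_model: true
dpo_loss_type: sigmoid  # one of: sigmoid, hinge, ipo, kto_pair
dpo_output_dir: ./checkpoints/dpo
```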
@ -1,10 +1,18 @@
-# Safety Providers
+# Safety
+
+## Overview

This section contains documentation for all available providers for the **safety** API.

-- [inline::code-scanner](inline_code-scanner.md)
-- [inline::llama-guard](inline_llama-guard.md)
-- [inline::prompt-guard](inline_prompt-guard.md)
-- [remote::bedrock](remote_bedrock.md)
-- [remote::nvidia](remote_nvidia.md)
-- [remote::sambanova](remote_sambanova.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_code-scanner
+inline_llama-guard
+inline_prompt-guard
+remote_bedrock
+remote_nvidia
+remote_sambanova
+```
@ -1,7 +1,15 @@
-# Scoring Providers
+# Scoring
+
+## Overview

This section contains documentation for all available providers for the **scoring** API.

-- [inline::basic](inline_basic.md)
-- [inline::braintrust](inline_braintrust.md)
-- [inline::llm-as-judge](inline_llm-as-judge.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_basic
+inline_braintrust
+inline_llm-as-judge
+```

@ -1,5 +1,13 @@
-# Telemetry Providers
+# Telemetry
+
+## Overview

This section contains documentation for all available providers for the **telemetry** API.

-- [inline::meta-reference](inline_meta-reference.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_meta-reference
+```
@ -1,10 +1,18 @@
-# Tool_Runtime Providers
+# Tool_Runtime
+
+## Overview

This section contains documentation for all available providers for the **tool_runtime** API.

-- [inline::rag-runtime](inline_rag-runtime.md)
-- [remote::bing-search](remote_bing-search.md)
-- [remote::brave-search](remote_brave-search.md)
-- [remote::model-context-protocol](remote_model-context-protocol.md)
-- [remote::tavily-search](remote_tavily-search.md)
-- [remote::wolfram-alpha](remote_wolfram-alpha.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_rag-runtime
+remote_bing-search
+remote_brave-search
+remote_model-context-protocol
+remote_tavily-search
+remote_wolfram-alpha
+```

@ -1,16 +1,24 @@
-# Vector_Io Providers
+# Vector_Io
+
+## Overview

This section contains documentation for all available providers for the **vector_io** API.

-- [inline::chromadb](inline_chromadb.md)
-- [inline::faiss](inline_faiss.md)
-- [inline::meta-reference](inline_meta-reference.md)
-- [inline::milvus](inline_milvus.md)
-- [inline::qdrant](inline_qdrant.md)
-- [inline::sqlite-vec](inline_sqlite-vec.md)
-- [inline::sqlite_vec](inline_sqlite_vec.md)
-- [remote::chromadb](remote_chromadb.md)
-- [remote::milvus](remote_milvus.md)
-- [remote::pgvector](remote_pgvector.md)
-- [remote::qdrant](remote_qdrant.md)
-- [remote::weaviate](remote_weaviate.md)
+## Providers
+
+```{toctree}
+:maxdepth: 1
+
+inline_chromadb
+inline_faiss
+inline_meta-reference
+inline_milvus
+inline_qdrant
+inline_sqlite-vec
+inline_sqlite_vec
+remote_chromadb
+remote_milvus
+remote_pgvector
+remote_qdrant
+remote_weaviate
+```
@ -51,11 +51,15 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | PydanticUndefined | |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db
```

@ -20,11 +20,15 @@ Please refer to the inline provider documentation.
| `prefix` | `str \| None` | No | | |
| `timeout` | `int \| None` | No | | |
| `host` | `str \| None` | No | | |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
-api_key: ${env.QDRANT_API_KEY}
+api_key: ${env.QDRANT_API_KEY:=}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db
```
@ -33,9 +33,19 @@ To install Weaviate see the [Weaviate quickstart documentation](https://weaviate
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.

+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
+| `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |

## Sample Configuration

```yaml
+weaviate_api_key: null
+weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db
@ -366,7 +366,7 @@ The purpose of scoring function is to calculate the score for each example based
Firstly, you can see if the existing [llama stack scoring functions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/scoring) can fulfill your need. If not, you need to write a new scoring function based on what benchmark author / other open source repo describe.

### Add new benchmark into template
-Firstly, you need to add the evaluation dataset associated with your benchmark under `datasets` resource in the [open-benchmark](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/open-benchmark/run.yaml)
+Firstly, you need to add the evaluation dataset associated with your benchmark under `datasets` resource in the [open-benchmark](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/distributions/open-benchmark/run.yaml)

Secondly, you need to add the new benchmark you just created under the `benchmarks` resource in the same template. To add the new benchmark, you need to have
- `benchmark_id`: identifier of the benchmark
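A hypothetical sketch of what such a `benchmarks` entry could look like in the run.yaml (field names other than `benchmark_id` are assumptions for illustration; follow the full field list described in the template itself):

```yaml
benchmarks:
  - benchmark_id: my-new-benchmark      # identifier of the benchmark
    dataset_id: my-benchmark-dataset    # assumed field: the dataset registered above
    scoring_functions: ["<scoring-function-id>"]  # assumed field
```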
@ -378,7 +378,7 @@ Secondly, you need to add the new benchmark you just created under the `benchmar

Spin up the llama stack server with the 'open-benchmark' template:
```
-llama stack run llama_stack/templates/open-benchmark/run.yaml
+llama stack run llama_stack/distributions/open-benchmark/run.yaml
```
@ -19,11 +19,11 @@ You have two ways to install Llama Stack:
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git

-conda create -n myenv python=3.10
-conda activate myenv
+python -m venv myenv
+source myenv/bin/activate # On Windows: myenv\Scripts\activate

cd llama-stack
-$CONDA_PREFIX/bin/pip install -e .
+pip install -e .

## Downloading models via CLI

@ -19,11 +19,11 @@ You have two ways to install Llama Stack:
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git

-conda create -n myenv python=3.10
-conda activate myenv
+python -m venv myenv
+source myenv/bin/activate # On Windows: myenv\Scripts\activate

cd llama-stack
-$CONDA_PREFIX/bin/pip install -e .
+pip install -e .

## `llama` subcommands
@ -66,7 +66,7 @@
"from pydantic import BaseModel\n",
"from termcolor import cprint\n",
"\n",
-"from llama_stack.distribution.datatypes import RemoteProviderConfig\n",
+"from llama_stack.core.datatypes import RemoteProviderConfig\n",
"from llama_stack.apis.safety import Safety\n",
"from llama_stack_client import LlamaStackClient\n",
"\n",
@ -47,20 +47,20 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next

## Install Dependencies and Set Up Environment

-1. **Create a Conda Environment**:
-   Create a new Conda environment with Python 3.12:
-   ```bash
-   conda create -n ollama python=3.12
-   ```
-   Activate the environment:
-   ```bash
-   conda activate ollama
-   ```
+1. **Install uv**:
+   Install [uv](https://docs.astral.sh/uv/) for managing dependencies:
+   ```bash
+   # macOS and Linux
+   curl -LsSf https://astral.sh/uv/install.sh | sh
+
+   # Windows
+   powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+   ```

2. **Install ChromaDB**:
-   Install `chromadb` using `pip`:
+   Install `chromadb` using `uv`:
   ```bash
-   pip install chromadb
+   uv pip install chromadb
   ```

3. **Run ChromaDB**:
@ -69,28 +69,21 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
   chroma run --host localhost --port 8000 --path ./my_chroma_data
   ```

-4. **Install Llama Stack**:
-   Open a new terminal and install `llama-stack`:
-   ```bash
-   conda activate ollama
-   pip install -U llama-stack
-   ```
-
---

## Build, Configure, and Run Llama Stack

1. **Build the Llama Stack**:
-   Build the Llama Stack using the `ollama` template:
+   Build the Llama Stack using the `starter` template:
   ```bash
-   llama stack build --template starter --image-type conda
+   uv run --with llama-stack llama stack build --distro starter --image-type venv
   ```
   **Expected Output:**
   ```bash
   ...
   Build Successful!
-   You can find the newly-built template here: ~/.llama/distributions/ollama/ollama-run.yaml
-   You can run the new Llama Stack Distro via: llama stack run ~/.llama/distributions/ollama/ollama-run.yaml --image-type conda
+   You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
+   You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter --image-type venv
   ```

3. **Set the ENV variables by exporting them to the terminal**:
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Run the Llama Stack**:
|
3. **Run the Llama Stack**:
|
||||||
Run the stack with command shared by the API from earlier:
|
Run the stack using uv:
|
||||||
```bash
|
```bash
|
||||||
llama stack run ollama
|
uv run --with llama-stack llama stack run starter \
|
||||||
--port $LLAMA_STACK_PORT
|
--image-type venv \
|
||||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
--port $LLAMA_STACK_PORT \
|
||||||
--env SAFETY_MODEL=$SAFETY_MODEL
|
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||||
|
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||||
--env OLLAMA_URL=$OLLAMA_URL
|
--env OLLAMA_URL=$OLLAMA_URL
|
||||||
```
|
```
|
||||||
Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
|
Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
|
||||||
|
@ -120,7 +114,7 @@ After setting up the server, open a new terminal window and configure the llama-
|
||||||
|
|
||||||
1. Configure the CLI to point to the llama-stack server.
|
1. Configure the CLI to point to the llama-stack server.
|
||||||
```bash
|
```bash
|
||||||
llama-stack-client configure --endpoint http://localhost:8321
|
uv run --with llama-stack-client llama-stack-client configure --endpoint http://localhost:8321
|
||||||
```
|
```
|
||||||
**Expected Output:**
|
**Expected Output:**
|
||||||
```bash
|
```bash
|
||||||
|
@ -128,7 +122,7 @@ After setting up the server, open a new terminal window and configure the llama-
|
||||||
```
|
```
|
||||||
2. Test the CLI by running inference:
|
2. Test the CLI by running inference:
|
||||||
```bash
|
```bash
|
||||||
llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon"
|
uv run --with llama-stack-client llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon"
|
||||||
```
|
```
|
||||||
**Expected Output:**
|
**Expected Output:**
|
||||||
```bash
|
```bash
|
||||||
|
@ -170,7 +164,7 @@ curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
EOF
```

-You can check the available models with the command `llama-stack-client models list`.
+You can check the available models with the command `uv run --with llama-stack-client llama-stack-client models list`.

**Expected Output:**
```json
@ -191,18 +185,12 @@ You can check the available models with the command `llama-stack-client models l

You can also interact with the Llama Stack server using a simple Python script. Below is an example:

-### 1. Activate Conda Environment
-
-```bash
-conda activate ollama
-```
-
-### 2. Create Python Script (`test_llama_stack.py`)
+### 1. Create Python Script (`test_llama_stack.py`)

```bash
touch test_llama_stack.py
```

-### 3. Create a Chat Completion Request in Python
+### 2. Create a Chat Completion Request in Python

In `test_llama_stack.py`, write the following code:
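For orientation, a minimal sketch of such a chat completion request (the model id and port below are assumptions -- substitute the `INFERENCE_MODEL` and `LLAMA_STACK_PORT` values you exported earlier):

```python
import os

from llama_stack_client import LlamaStackClient

# Assumptions: server on localhost:8321 and a Llama 3.2 3B Instruct model.
client = LlamaStackClient(
    base_url=f"http://localhost:{os.environ.get('LLAMA_STACK_PORT', 8321)}"
)

response = client.inference.chat_completion(
    model_id=os.environ.get("INFERENCE_MODEL", "meta-llama/Llama-3.2-3B-Instruct"),
    messages=[
        {"role": "system", "content": "You are a friendly assistant."},
        {"role": "user", "content": "Write a two-sentence poem about llamas."},
    ],
)
print(response.completion_message.content)
```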
@ -233,10 +221,10 @@ response = client.inference.chat_completion(
print(response.completion_message.content)
```

-### 4. Run the Python Script
+### 3. Run the Python Script

```bash
-python test_llama_stack.py
+uv run --with llama-stack-client python test_llama_stack.py
```

**Expected Output:**
@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from llama_stack.distribution.library_client import (  # noqa: F401
+from llama_stack.core.library_client import (  # noqa: F401
    AsyncLlamaStackAsLibraryClient,
    LlamaStackAsLibraryClient,
)
@ -152,7 +152,17 @@ Step = Annotated[

@json_schema_type
class Turn(BaseModel):
-    """A single turn in an interaction with an Agentic System."""
+    """A single turn in an interaction with an Agentic System.
+
+    :param turn_id: Unique identifier for the turn within a session
+    :param session_id: Unique identifier for the conversation session
+    :param input_messages: List of messages that initiated this turn
+    :param steps: Ordered list of processing steps executed during this turn
+    :param output_message: The model's generated response containing content and metadata
+    :param output_attachments: (Optional) Files or media attached to the agent's response
+    :param started_at: Timestamp when the turn began
+    :param completed_at: (Optional) Timestamp when the turn finished, if completed
+    """

    turn_id: str
    session_id: str

@ -167,7 +177,13 @@ class Turn(BaseModel):

@json_schema_type
class Session(BaseModel):
-    """A single session of an interaction with an Agentic System."""
+    """A single session of an interaction with an Agentic System.
+
+    :param session_id: Unique identifier for the conversation session
+    :param session_name: Human-readable name for the session
+    :param turns: List of all turns that have occurred in this session
+    :param started_at: Timestamp when the session was created
+    """

    session_id: str
    session_name: str
@ -232,6 +248,13 @@ class AgentConfig(AgentConfigCommon):

@json_schema_type
class Agent(BaseModel):
+    """An agent instance with configuration and metadata.
+
+    :param agent_id: Unique identifier for the agent
+    :param agent_config: Configuration settings for the agent
+    :param created_at: Timestamp when the agent was created
+    """
+
    agent_id: str
    agent_config: AgentConfig
    created_at: datetime

@ -253,6 +276,14 @@ class AgentTurnResponseEventType(StrEnum):

@json_schema_type
class AgentTurnResponseStepStartPayload(BaseModel):
+    """Payload for step start events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param metadata: (Optional) Additional metadata for the step
+    """
+
    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
    step_type: StepType
    step_id: str

@ -261,6 +292,14 @@ class AgentTurnResponseStepStartPayload(BaseModel):

@json_schema_type
class AgentTurnResponseStepCompletePayload(BaseModel):
+    """Payload for step completion events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param step_details: Complete details of the executed step
+    """
+
    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
    step_type: StepType
    step_id: str

@ -269,6 +308,14 @@ class AgentTurnResponseStepCompletePayload(BaseModel):

@json_schema_type
class AgentTurnResponseStepProgressPayload(BaseModel):
+    """Payload for step progress events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param delta: Incremental content changes during step execution
+    """
+
    model_config = ConfigDict(protected_namespaces=())

    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
@ -280,18 +327,36 @@ class AgentTurnResponseStepProgressPayload(BaseModel):

@json_schema_type
class AgentTurnResponseTurnStartPayload(BaseModel):
+    """Payload for turn start events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn_id: Unique identifier for the turn within a session
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
    turn_id: str


@json_schema_type
class AgentTurnResponseTurnCompletePayload(BaseModel):
+    """Payload for turn completion events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn: Complete turn data including all steps and results
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
    turn: Turn


@json_schema_type
class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
+    """Payload for turn awaiting input events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn: Turn data when waiting for external tool responses
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
    turn: Turn
@ -310,21 +375,47 @@ register_schema(AgentTurnResponseEventPayload, name="AgentTurnResponseEventPaylo
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentTurnResponseEvent(BaseModel):
|
class AgentTurnResponseEvent(BaseModel):
|
||||||
|
"""An event in an agent turn response stream.
|
||||||
|
|
||||||
|
:param payload: Event-specific payload containing event data
|
||||||
|
"""
|
||||||
|
|
||||||
payload: AgentTurnResponseEventPayload
|
payload: AgentTurnResponseEventPayload
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentCreateResponse(BaseModel):
|
class AgentCreateResponse(BaseModel):
|
||||||
|
"""Response returned when creating a new agent.
|
||||||
|
|
||||||
|
:param agent_id: Unique identifier for the created agent
|
||||||
|
"""
|
||||||
|
|
||||||
agent_id: str
|
agent_id: str
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentSessionCreateResponse(BaseModel):
|
class AgentSessionCreateResponse(BaseModel):
|
||||||
|
"""Response returned when creating a new agent session.
|
||||||
|
|
||||||
|
:param session_id: Unique identifier for the created session
|
||||||
|
"""
|
||||||
|
|
||||||
session_id: str
|
session_id: str
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
|
class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
|
||||||
|
"""Request to create a new turn for an agent.
|
||||||
|
|
||||||
|
:param agent_id: Unique identifier for the agent
|
||||||
|
:param session_id: Unique identifier for the conversation session
|
||||||
|
:param messages: List of messages to start the turn with
|
||||||
|
:param documents: (Optional) List of documents to provide to the agent
|
||||||
|
:param toolgroups: (Optional) List of tool groups to make available for this turn
|
||||||
|
:param stream: (Optional) Whether to stream the response
|
||||||
|
:param tool_config: (Optional) Tool configuration to override agent defaults
|
||||||
|
"""
|
||||||
|
|
||||||
agent_id: str
|
agent_id: str
|
||||||
session_id: str
|
session_id: str
|
||||||
|
|
||||||
|
@ -342,6 +433,15 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentTurnResumeRequest(BaseModel):
|
class AgentTurnResumeRequest(BaseModel):
|
||||||
|
"""Request to resume an agent turn with tool responses.
|
||||||
|
|
||||||
|
:param agent_id: Unique identifier for the agent
|
||||||
|
:param session_id: Unique identifier for the conversation session
|
||||||
|
:param turn_id: Unique identifier for the turn within a session
|
||||||
|
:param tool_responses: List of tool responses to submit to continue the turn
|
||||||
|
:param stream: (Optional) Whether to stream the response
|
||||||
|
"""
|
||||||
|
|
||||||
agent_id: str
|
agent_id: str
|
||||||
session_id: str
|
session_id: str
|
||||||
turn_id: str
|
turn_id: str
|
||||||
|
@ -351,13 +451,21 @@ class AgentTurnResumeRequest(BaseModel):
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentTurnResponseStreamChunk(BaseModel):
|
class AgentTurnResponseStreamChunk(BaseModel):
|
||||||
"""streamed agent turn completion response."""
|
"""Streamed agent turn completion response.
|
||||||
|
|
||||||
|
:param event: Individual event in the agent turn response stream
|
||||||
|
"""
|
||||||
|
|
||||||
event: AgentTurnResponseEvent
|
event: AgentTurnResponseEvent
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AgentStepResponse(BaseModel):
|
class AgentStepResponse(BaseModel):
|
||||||
|
"""Response containing details of a specific agent step.
|
||||||
|
|
||||||
|
:param step: The complete step data and execution details
|
||||||
|
"""
|
||||||
|
|
||||||
step: Step
|
step: Step
|
||||||
|
|
||||||
|
|
||||||
|
|
|
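A minimal consumer sketch for the payload classes above. The import path (llama_stack.apis.agents) is an assumption, and the loop that actually produces stream chunks is omitted; this only illustrates dispatching on the payload types defined in this diff.

# Illustrative sketch only; not part of this diff.
from llama_stack.apis.agents import (
    AgentTurnResponseStepProgressPayload,
    AgentTurnResponseStreamChunk,
    AgentTurnResponseTurnCompletePayload,
)


def handle_chunk(chunk: AgentTurnResponseStreamChunk) -> None:
    # Each streamed chunk wraps one AgentTurnResponseEvent; dispatch on its payload type.
    payload = chunk.event.payload
    if isinstance(payload, AgentTurnResponseStepProgressPayload):
        # Incremental delta for the step identified by step_id.
        print(payload.step_id, payload.delta)
    elif isinstance(payload, AgentTurnResponseTurnCompletePayload):
        # The completed turn, including all executed steps.
        print("turn complete:", payload.turn)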
@@ -18,18 +18,37 @@ from llama_stack.schema_utils import json_schema_type, register_schema

@json_schema_type
class OpenAIResponseError(BaseModel):
    """Error details for failed OpenAI response requests.

    :param code: Error code identifying the type of failure
    :param message: Human-readable error message describing the failure
    """

    code: str
    message: str


@json_schema_type
class OpenAIResponseInputMessageContentText(BaseModel):
    """Text content for input messages in OpenAI response format.

    :param text: The text content of the input message
    :param type: Content type identifier, always "input_text"
    """

    text: str
    type: Literal["input_text"] = "input_text"


@json_schema_type
class OpenAIResponseInputMessageContentImage(BaseModel):
    """Image content for input messages in OpenAI response format.

    :param detail: Level of detail for image processing, can be "low", "high", or "auto"
    :param type: Content type identifier, always "input_image"
    :param image_url: (Optional) URL of the image content
    """

    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id

@@ -46,6 +65,14 @@ register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")

@json_schema_type
class OpenAIResponseAnnotationFileCitation(BaseModel):
    """File citation annotation for referencing specific files in response content.

    :param type: Annotation type identifier, always "file_citation"
    :param file_id: Unique identifier of the referenced file
    :param filename: Name of the referenced file
    :param index: Position index of the citation within the content
    """

    type: Literal["file_citation"] = "file_citation"
    file_id: str
    filename: str

@@ -54,6 +81,15 @@ class OpenAIResponseAnnotationFileCitation(BaseModel):

@json_schema_type
class OpenAIResponseAnnotationCitation(BaseModel):
    """URL citation annotation for referencing external web resources.

    :param type: Annotation type identifier, always "url_citation"
    :param end_index: End position of the citation span in the content
    :param start_index: Start position of the citation span in the content
    :param title: Title of the referenced web resource
    :param url: URL of the referenced web resource
    """

    type: Literal["url_citation"] = "url_citation"
    end_index: int
    start_index: int

@@ -122,6 +158,13 @@ class OpenAIResponseMessage(BaseModel):

@json_schema_type
class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    """Web search tool call output message for OpenAI responses.

    :param id: Unique identifier for this tool call
    :param status: Current status of the web search operation
    :param type: Tool call type identifier, always "web_search_call"
    """

    id: str
    status: str
    type: Literal["web_search_call"] = "web_search_call"

@@ -129,6 +172,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):

@json_schema_type
class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    """File search tool call output message for OpenAI responses.

    :param id: Unique identifier for this tool call
    :param queries: List of search queries executed
    :param status: Current status of the file search operation
    :param type: Tool call type identifier, always "file_search_call"
    :param results: (Optional) Search results returned by the file search operation
    """

    id: str
    queries: list[str]
    status: str

@@ -138,6 +190,16 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):

@json_schema_type
class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
    """Function tool call output message for OpenAI responses.

    :param call_id: Unique identifier for the function call
    :param name: Name of the function being called
    :param arguments: JSON string containing the function arguments
    :param type: Tool call type identifier, always "function_call"
    :param id: (Optional) Additional identifier for the tool call
    :param status: (Optional) Current status of the function call execution
    """

    call_id: str
    name: str
    arguments: str

@@ -148,6 +210,17 @@ class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):

@json_schema_type
class OpenAIResponseOutputMessageMCPCall(BaseModel):
    """Model Context Protocol (MCP) call output message for OpenAI responses.

    :param id: Unique identifier for this MCP call
    :param type: Tool call type identifier, always "mcp_call"
    :param arguments: JSON string containing the MCP call arguments
    :param name: Name of the MCP method being called
    :param server_label: Label identifying the MCP server handling the call
    :param error: (Optional) Error message if the MCP call failed
    :param output: (Optional) Output result from the successful MCP call
    """

    id: str
    type: Literal["mcp_call"] = "mcp_call"
    arguments: str

@@ -158,6 +231,13 @@ class OpenAIResponseOutputMessageMCPCall(BaseModel):

class MCPListToolsTool(BaseModel):
    """Tool definition returned by MCP list tools operation.

    :param input_schema: JSON schema defining the tool's input parameters
    :param name: Name of the tool
    :param description: (Optional) Description of what the tool does
    """

    input_schema: dict[str, Any]
    name: str
    description: str | None = None

@@ -165,6 +245,14 @@ class MCPListToolsTool(BaseModel):

@json_schema_type
class OpenAIResponseOutputMessageMCPListTools(BaseModel):
    """MCP list tools output message containing available tools from an MCP server.

    :param id: Unique identifier for this MCP list tools operation
    :param type: Tool call type identifier, always "mcp_list_tools"
    :param server_label: Label identifying the MCP server providing the tools
    :param tools: List of available tools provided by the MCP server
    """

    id: str
    type: Literal["mcp_list_tools"] = "mcp_list_tools"
    server_label: str
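A small construction sketch for the content and tool-call models above. The import path is an assumption based on this file's module; the field values are placeholders.

# Illustrative sketch only; not part of this diff.
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputMessageContentText,
    OpenAIResponseOutputMessageFunctionToolCall,
)

text_part = OpenAIResponseInputMessageContentText(text="What is the weather in Paris?")

tool_call = OpenAIResponseOutputMessageFunctionToolCall(
    call_id="call_123",
    name="get_weather",
    arguments='{"city": "Paris"}',
)

# The Literal defaults ("input_text", "function_call") act as discriminators when
# these models are serialized into the response schema.
print(text_part.model_dump()["type"], tool_call.model_dump()["type"])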
@@ -206,11 +294,34 @@ class OpenAIResponseTextFormat(TypedDict, total=False):

@json_schema_type
class OpenAIResponseText(BaseModel):
    """Text response configuration for OpenAI responses.

    :param format: (Optional) Text format configuration specifying output format requirements
    """

    format: OpenAIResponseTextFormat | None = None


@json_schema_type
class OpenAIResponseObject(BaseModel):
    """Complete OpenAI response object containing generation results and metadata.

    :param created_at: Unix timestamp when the response was created
    :param error: (Optional) Error details if the response generation failed
    :param id: Unique identifier for this response
    :param model: Model identifier used for generation
    :param object: Object type identifier, always "response"
    :param output: List of generated output items (messages, tool calls, etc.)
    :param parallel_tool_calls: Whether tool calls can be executed in parallel
    :param previous_response_id: (Optional) ID of the previous response in a conversation
    :param status: Current status of the response generation
    :param temperature: (Optional) Sampling temperature used for generation
    :param text: Text formatting configuration for the response
    :param top_p: (Optional) Nucleus sampling parameter used for generation
    :param truncation: (Optional) Truncation strategy applied to the response
    :param user: (Optional) User identifier associated with the request
    """

    created_at: int
    error: OpenAIResponseError | None = None
    id: str

@@ -231,6 +342,13 @@ class OpenAIResponseObject(BaseModel):

@json_schema_type
class OpenAIDeleteResponseObject(BaseModel):
    """Response object confirming deletion of an OpenAI response.

    :param id: Unique identifier of the deleted response
    :param object: Object type identifier, always "response"
    :param deleted: Deletion confirmation flag, always True
    """

    id: str
    object: Literal["response"] = "response"
    deleted: bool = True
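A brief consumer sketch for OpenAIResponseObject; `resp` is assumed to be an instance produced elsewhere, and only fields documented in the docstring above are read.

# Illustrative sketch only; not part of this diff.
def summarize(resp) -> str:
    # resp is an OpenAIResponseObject as defined above.
    if resp.error is not None:
        return f"response {resp.id} failed: {resp.error.code}: {resp.error.message}"
    return f"response {resp.id} ({resp.status}) has {len(resp.output)} output item(s)"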
@@ -238,18 +356,39 @@ class OpenAIDeleteResponseObject(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseCreated(BaseModel):
    """Streaming event indicating a new response has been created.

    :param response: The newly created response object
    :param type: Event type identifier, always "response.created"
    """

    response: OpenAIResponseObject
    type: Literal["response.created"] = "response.created"


@json_schema_type
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
    """Streaming event indicating a response has been completed.

    :param response: The completed response object
    :param type: Event type identifier, always "response.completed"
    """

    response: OpenAIResponseObject
    type: Literal["response.completed"] = "response.completed"


@json_schema_type
class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):
    """Streaming event for when a new output item is added to the response.

    :param response_id: Unique identifier of the response containing this output
    :param item: The output item that was added (message, tool call, etc.)
    :param output_index: Index position of this item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.output_item.added"
    """

    response_id: str
    item: OpenAIResponseOutput
    output_index: int

@@ -259,6 +398,15 @@ class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):
    """Streaming event for when an output item is completed.

    :param response_id: Unique identifier of the response containing this output
    :param item: The completed output item (message, tool call, etc.)
    :param output_index: Index position of this item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.output_item.done"
    """

    response_id: str
    item: OpenAIResponseOutput
    output_index: int

@@ -268,6 +416,16 @@ class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
    """Streaming event for incremental text content updates.

    :param content_index: Index position within the text content
    :param delta: Incremental text content being added
    :param item_id: Unique identifier of the output item being updated
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.output_text.delta"
    """

    content_index: int
    delta: str
    item_id: str

@@ -278,6 +436,16 @@ class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
    """Streaming event for when text output is completed.

    :param content_index: Index position within the text content
    :param text: Final complete text content of the output item
    :param item_id: Unique identifier of the completed output item
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.output_text.done"
    """

    content_index: int
    text: str  # final text of the output item
    item_id: str

@@ -288,6 +456,15 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):
    """Streaming event for incremental function call argument updates.

    :param delta: Incremental function call arguments being added
    :param item_id: Unique identifier of the function call being updated
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.function_call_arguments.delta"
    """

    delta: str
    item_id: str
    output_index: int

@@ -297,6 +474,15 @@ class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):
    """Streaming event for when function call arguments are completed.

    :param arguments: Final complete arguments JSON string for the function call
    :param item_id: Unique identifier of the completed function call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.function_call_arguments.done"
    """

    arguments: str  # final arguments of the function call
    item_id: str
    output_index: int

@@ -306,6 +492,14 @@ class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseWebSearchCallInProgress(BaseModel):
    """Streaming event for web search calls in progress.

    :param item_id: Unique identifier of the web search call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.web_search_call.in_progress"
    """

    item_id: str
    output_index: int
    sequence_number: int

@@ -322,6 +516,14 @@ class OpenAIResponseObjectStreamResponseWebSearchCallSearching(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseWebSearchCallCompleted(BaseModel):
    """Streaming event for completed web search calls.

    :param item_id: Unique identifier of the completed web search call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.web_search_call.completed"
    """

    item_id: str
    output_index: int
    sequence_number: int

@@ -366,6 +568,14 @@ class OpenAIResponseObjectStreamResponseMcpCallArgumentsDone(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):
    """Streaming event for MCP calls in progress.

    :param item_id: Unique identifier of the MCP call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.mcp_call.in_progress"
    """

    item_id: str
    output_index: int
    sequence_number: int

@@ -374,12 +584,24 @@ class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):

@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallFailed(BaseModel):
    """Streaming event for failed MCP calls.

    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.mcp_call.failed"
    """

    sequence_number: int
    type: Literal["response.mcp_call.failed"] = "response.mcp_call.failed"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
    """Streaming event for completed MCP calls.

    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.mcp_call.completed"
    """

    sequence_number: int
    type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"
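A minimal sketch of how a consumer might fold these streaming events into final text. The source of the stream is assumed; only the event `type` discriminators defined above are used.

# Illustrative sketch only; not part of this diff.
from collections.abc import AsyncIterator


async def collect_text(stream: AsyncIterator) -> str:
    # Accumulate output text deltas until the terminal "response.completed" event.
    chunks: list[str] = []
    async for event in stream:
        if event.type == "response.output_text.delta":
            chunks.append(event.delta)
        elif event.type == "response.completed":
            break
    return "".join(chunks)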
@@ -442,6 +664,12 @@ WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]

@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
    """Web search tool configuration for OpenAI response inputs.

    :param type: Web search tool type variant to use
    :param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
    """

    # Must match values of WebSearchToolTypes above
    type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
        "web_search"

@@ -453,6 +681,15 @@ class OpenAIResponseInputToolWebSearch(BaseModel):

@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
    """Function tool configuration for OpenAI response inputs.

    :param type: Tool type identifier, always "function"
    :param name: Name of the function that can be called
    :param description: (Optional) Description of what the function does
    :param parameters: (Optional) JSON schema defining the function's parameters
    :param strict: (Optional) Whether to enforce strict parameter validation
    """

    type: Literal["function"] = "function"
    name: str
    description: str | None = None

@@ -462,6 +699,15 @@ class OpenAIResponseInputToolFunction(BaseModel):

@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
    """File search tool configuration for OpenAI response inputs.

    :param type: Tool type identifier, always "file_search"
    :param vector_store_ids: List of vector store identifiers to search within
    :param filters: (Optional) Additional filters to apply to the search
    :param max_num_results: (Optional) Maximum number of search results to return (1-50)
    :param ranking_options: (Optional) Options for ranking and scoring search results
    """

    type: Literal["file_search"] = "file_search"
    vector_store_ids: list[str]
    filters: dict[str, Any] | None = None

@@ -470,16 +716,37 @@ class OpenAIResponseInputToolFileSearch(BaseModel):

class ApprovalFilter(BaseModel):
    """Filter configuration for MCP tool approval requirements.

    :param always: (Optional) List of tool names that always require approval
    :param never: (Optional) List of tool names that never require approval
    """

    always: list[str] | None = None
    never: list[str] | None = None


class AllowedToolsFilter(BaseModel):
    """Filter configuration for restricting which MCP tools can be used.

    :param tool_names: (Optional) List of specific tool names that are allowed
    """

    tool_names: list[str] | None = None


@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
    """Model Context Protocol (MCP) tool configuration for OpenAI response inputs.

    :param type: Tool type identifier, always "mcp"
    :param server_label: Label to identify this MCP server
    :param server_url: URL endpoint of the MCP server
    :param headers: (Optional) HTTP headers to include when connecting to the server
    :param require_approval: Approval requirement for tool calls ("always", "never", or filter)
    :param allowed_tools: (Optional) Restriction on which tools can be used from this server
    """

    type: Literal["mcp"] = "mcp"
    server_label: str
    server_url: str
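A construction sketch for the input-tool models above. The import path, the server URL, and passing an ApprovalFilter for require_approval (a field whose definition is not shown in this hunk, only its docstring) are assumptions.

# Illustrative sketch only; not part of this diff.
from llama_stack.apis.agents.openai_responses import (
    ApprovalFilter,
    OpenAIResponseInputToolFileSearch,
    OpenAIResponseInputToolMCP,
)

tools = [
    # Search a named vector store; type defaults to "file_search".
    OpenAIResponseInputToolFileSearch(vector_store_ids=["vs_docs"]),
    # Attach an MCP server and require approval only for a sensitive tool.
    OpenAIResponseInputToolMCP(
        server_label="github",
        server_url="https://mcp.example.com/sse",
        require_approval=ApprovalFilter(always=["delete_repo"]),
    ),
]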
@@ -500,17 +767,37 @@ register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")

class ListOpenAIResponseInputItem(BaseModel):
    """List container for OpenAI response input items.

    :param data: List of input items
    :param object: Object type identifier, always "list"
    """

    data: list[OpenAIResponseInput]
    object: Literal["list"] = "list"


@json_schema_type
class OpenAIResponseObjectWithInput(OpenAIResponseObject):
    """OpenAI response object extended with input context information.

    :param input: List of input items that led to this response
    """

    input: list[OpenAIResponseInput]


@json_schema_type
class ListOpenAIResponseObject(BaseModel):
    """Paginated list of OpenAI response objects with navigation metadata.

    :param data: List of response objects with their input context
    :param has_more: Whether there are more results available beyond this page
    :param first_id: Identifier of the first item in this page
    :param last_id: Identifier of the last item in this page
    :param object: Object type identifier, always "list"
    """

    data: list[OpenAIResponseObjectWithInput]
    has_more: bool
    first_id: str
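A pagination sketch using the ListOpenAIResponseObject fields above. `list_responses` is a hypothetical callable returning ListOpenAIResponseObject pages, not an API defined in this diff.

# Illustrative sketch only; not part of this diff.
def iter_all_responses(list_responses):
    # Walk pages using has_more / last_id as cursor metadata.
    after = None
    while True:
        page = list_responses(after=after)
        yield from page.data
        if not page.has_more:
            break
        after = page.last_id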
Some files were not shown because too many files have changed in this diff.