Merge branch 'main' into add-mcp-streamable-http-support

2025-12-22 22:39:41 +00:00 · 2025-07-18 14:38:54 -04:00 · 2025-07-18 14:38:54 -04:00 · c715f30e65
commit c715f30e65
parent 211800146e d994305f0a
247 changed files with 9685 additions and 5249 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -4,3 +4,9 @@ omit =
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*
+    */llama_stack/cli/scripts/*
+    */llama_stack/ui/*
+    */llama_stack/distribution/ui/*
+    */llama_stack/strong_typing/*
+    */llama_stack/env.py
+    */__init__.py
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
--- a/.github/ISSUE_TEMPLATE/tech-debt.yml
+++ b/.github/ISSUE_TEMPLATE/tech-debt.yml
@ -0,0 +1,30 @@
+name: 🔧 Tech Debt
+description: Something that is functional but should be improved or optimized
+labels: ["tech-debt"]
+body:
+- type: textarea
+  id: tech-debt-explanation
+  attributes:
+    label: 🤔 What is the technical debt you think should be addressed?
+    description: >
+      A clear and concise description of _what_ needs to be addressed - ensure that what you are
+      describing constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is not a bug
+      or feature request.
+  validations:
+    required: true
+
+- type: textarea
+  id: tech-debt-motivation
+  attributes:
+    label: 💡 What is the benefit of addressing this technical debt?
+    description: >
+      A clear and concise description of _why_ this work is needed.
+  validations:
+    required: true
+
+- type: textarea
+  id: other-thoughts
+  attributes:
+    label: Other thoughts
+    description: >
+      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -7,3 +7,5 @@ runs:
      shell: bash
      run: |
        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
+        echo "Verifying Ollama status..."
+        timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -5,6 +5,10 @@ inputs:
    description: The Python version to use
    required: false
    default: "3.12"
+  client-version:
+    description: The llama-stack-client-python version to test against (latest or published)
+    required: false
+    default: "latest"
 runs:
  using: "composite"
  steps:
@ -20,8 +24,17 @@ runs:
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
-        # always test against the latest version of the client
-        # TODO: this is not necessarily a good idea. we need to test against both published and latest
-        # to find out backwards compatibility issues.
-        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+
+        # Install llama-stack-client-python based on the client-version input
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
+          echo "Installing latest llama-stack-client-python from main branch"
+          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+        elif [ "${{ inputs.client-version }}" = "published" ]; then
+          echo "Installing published llama-stack-client-python from PyPI"
+          uv pip install llama-stack-client
+        else
+          echo "Invalid client-version: ${{ inputs.client-version }}"
+          exit 1
+        fi
+
        uv pip install -e .
--- a/.github/workflows/coverage-badge.yml
+++ b/.github/workflows/coverage-badge.yml
@ -0,0 +1,57 @@
+name: Coverage Badge
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/unit/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/unit-tests.yml'
+      - '.github/workflows/coverage-badge.yml' # This workflow
+  workflow_dispatch:
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Run unit tests
+        run: |
+          ./scripts/unit-tests.sh
+
+      - name: Coverage Badge
+        uses: tj-actions/coverage-badge-py@1788babcb24544eb5bbb6e0d374df5d1e54e670f # v2.0.4
+
+      - name: Verify Changed files
+        uses: tj-actions/verify-changed-files@a1c6acee9df209257a246f2cc6ae8cb6581c1edf # v20.0.4
+        id: verify-changed-files
+        with:
+          files: coverage.svg
+
+      - name: Commit files
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add coverage.svg
+          git commit -m "Updated coverage.svg"
+
+      - name: Create Pull Request
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          title: "ci: [Automatic] Coverage Badge Update"
+          body: |
+            This PR updates the coverage badge based on the latest coverage report.
+
+            Automatically generated by the [workflow coverage-badge.yml](.github/workflows/coverage-badge.yml)
+          delete-branch: true
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@ -1,355 +0,0 @@
-name: "Run Llama-stack Tests"
-
-on:
-  #### Temporarily disable PR runs until tests run as intended within mainline.
-  #TODO Add this back.
-  #pull_request_target:
-  #  types: ["opened"]
-  #  branches:
-  #    - 'main'
-  #  paths:
-  #    - 'llama_stack/**/*.py'
-  #    - 'tests/**/*.py'
-
-  workflow_dispatch:
-    inputs:
-      runner:
-        description: 'GHA Runner Scale Set label to run workflow on.'
-        required: true
-        default: "llama-stack-gha-runner-gpu"
-
-      checkout_reference:
-        description: "The branch, tag, or SHA to checkout"
-        required: true
-        default: "main"
-
-      debug:
-        description: 'Run debugging steps?'
-        required: false
-        default: "true"
-
-      sleep_time:
-        description: '[DEBUG] sleep time for debugging'
-        required: true
-        default: "0"
-
-      provider_id:
-        description: 'ID of your provider'
-        required: true
-        default: "meta_reference"
-
-      model_id:
-        description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
-        required: true
-        default: "llama_3b"
-
-      model_override_3b:
-        description: 'Specify shorthand model for <llama_3b> '
-        required: false
-        default: "Llama3.2-3B-Instruct"
-
-      model_override_8b:
-        description: 'Specify shorthand model for <llama_8b> '
-        required: false
-        default: "Llama3.1-8B-Instruct"
-
-env:
-  # ID used for each test's provider config
-  PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
-
-  # Path to model checkpoints within EFS volume
-  MODEL_CHECKPOINT_DIR: "/data/llama"
-
-  # Path to directory to run tests from
-  TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
-
-  # Keep track of a list of model IDs that are valid to use within pytest fixture marks
-  AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
-
-  # Shorthand name for model ID, used in pytest fixture marks
-  MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
-
-  # Override the `llama_3b` / `llama_8b' models, else use the default.
-  LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
-  LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
-
-  # Defines which directories in TESTS_PATH to exclude from the test loop
-  EXCLUDED_DIRS: "__pycache__"
-
-  # Defines the output xml reports generated after a test is run
-  REPORTS_GEN: ""
-
-jobs:
-  execute_workflow:
-    name: Execute workload on Self-Hosted GPU k8s runner
-    permissions:
-      pull-requests: write
-    defaults:
-      run:
-        shell: bash
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
-    if: always()
-    steps:
-
-      ##############################
-      #### INITIAL DEBUG CHECKS ####
-      ##############################
-      - name: "[DEBUG] Check content of the EFS mount"
-        id: debug_efs_volume
-        continue-on-error: true
-        if: inputs.debug == 'true'
-        run: |
-            echo "========= Content of the EFS mount ============="
-            ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
-
-      - name: "[DEBUG] Get runner container OS information"
-        id: debug_os_info
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            cat /etc/os-release
-
-      - name: "[DEBUG] Print environment variables"
-        id: debug_env_vars
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            echo "PROVIDER_ID = ${PROVIDER_ID}"
-            echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
-            echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
-            echo "MODEL_ID = ${MODEL_ID}"
-            echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
-            echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
-            echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
-            echo "REPORTS_GEN = ${REPORTS_GEN}"
-
-      ############################
-      #### MODEL INPUT CHECKS ####
-      ############################
-
-      - name: "Check if env.model_id is valid"
-        id: check_model_id
-        run: |
-          if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
-            echo "Model ID '${MODEL_ID}' is valid."
-          else
-            echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
-            exit 1
-          fi
-
-      #######################
-      #### CODE CHECKOUT ####
-      #######################
-      - name: "Checkout 'meta-llama/llama-stack' repository"
-        id: checkout_repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          ref: ${{ inputs.branch }}
-
-      - name: "[DEBUG] Content of the repository after checkout"
-        id: debug_content_after_checkout
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            ls -la ${GITHUB_WORKSPACE}
-
-      ##########################################################
-      ####              OPTIONAL SLEEP DEBUG                ####
-      #                                                        #
-      # Use to "exec" into the test k8s POD and run tests      #
-      # manually to identify what dependencies are being used. #
-      #                                                        #
-      ##########################################################
-      - name: "[DEBUG] sleep"
-        id: debug_sleep
-        if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
-        run: |
-            sleep ${{ inputs.sleep_time }}
-
-      ############################
-      #### UPDATE SYSTEM PATH ####
-      ############################
-      - name: "Update path: execute"
-        id: path_update_exec
-        run: |
-          # .local/bin is needed for certain libraries installed below to be recognized
-          # when calling their executable to install sub-dependencies
-          mkdir -p ${HOME}/.local/bin
-          echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
-
-      #####################################
-      #### UPDATE CHECKPOINT DIRECTORY ####
-      #####################################
-      - name: "Update checkpoint directory"
-        id: checkpoint_update
-        run: |
-          echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
-          if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
-            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
-          elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
-            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
-          else
-            echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
-            exit 1
-          fi
-
-      - name: "[DEBUG] Checkpoint update check"
-        id: debug_checkpoint_update
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-          echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
-
-      ##################################
-      #### DEPENDENCY INSTALLATIONS ####
-      ##################################
-      - name: "Installing 'apt' required packages"
-        id: install_apt
-        run: |
-          echo "[STEP] Installing 'apt' required packages"
-          sudo apt update -y
-          sudo apt install -y python3 python3-pip npm wget
-
-      - name: "Installing packages with 'curl'"
-        id: install_curl
-        run: |
-          curl -fsSL https://ollama.com/install.sh | sh
-
-      - name: "Installing packages with 'wget'"
-        id: install_wget
-        run: |
-          wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-          chmod +x Miniconda3-latest-Linux-x86_64.sh
-          ./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0
-          # Add miniconda3 bin to system path
-          echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
-
-      - name: "Installing packages with 'npm'"
-        id: install_npm_generic
-        run: |
-          sudo npm install -g junit-merge
-
-      - name: "Installing pip dependencies"
-        id: install_pip_generic
-        run: |
-          echo "[STEP] Installing 'llama-stack' models"
-          pip install -U pip setuptools
-          pip install -r requirements.txt
-          pip install -e .
-          pip install -U \
-            torch torchvision \
-            pytest pytest_asyncio \
-            fairscale lm-format-enforcer \
-            zmq chardet pypdf \
-            pandas sentence_transformers together \
-            aiosqlite
-      - name: "Installing packages with conda"
-        id: install_conda_generic
-        run: |
-          conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
-
-      #############################################################
-      #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
-      #############################################################
-      - name: "Run Tests: Loop"
-        id: run_tests_loop
-        working-directory: "${{ github.workspace }}"
-        run: |
-          pattern=""
-          for dir in llama_stack/providers/tests/*; do
-            if [ -d "$dir" ]; then
-              dir_name=$(basename "$dir")
-              if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
-                for file in "$dir"/test_*.py; do
-                  test_name=$(basename "$file")
-                  new_file="result-${dir_name}-${test_name}.xml"
-                  if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
-                     --junitxml="${{ github.workspace }}/${new_file}"; then
-                    echo "Ran test: ${test_name}"
-                  else
-                    echo "Did NOT run test: ${test_name}"
-                  fi
-                  pattern+="${new_file} "
-                done
-              fi
-            fi
-          done
-          echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
-
-      - name: "Test Summary: Merge"
-        id: test_summary_merge
-        working-directory: "${{ github.workspace }}"
-        run: |
-          echo "Merging the following test result files: ${REPORTS_GEN}"
-          # Defaults to merging them into 'merged-test-results.xml'
-          junit-merge ${{ env.REPORTS_GEN }}
-
-      ############################################
-      #### AUTOMATIC TESTING ON PULL REQUESTS ####
-      ############################################
-
-      #### Run tests ####
-
-      - name: "PR - Run Tests"
-        id: pr_run_tests
-        working-directory: "${{ github.workspace }}"
-        if: github.event_name == 'pull_request_target'
-        run: |
-          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
-          # (Optional) Add more tests here.
-
-          # Merge test results with 'merged-test-results.xml' from above.
-          # junit-merge <new-test-results> merged-test-results.xml
-
-      #### Create test summary ####
-
-      - name: "PR - Test Summary"
-        id: pr_test_summary_create
-        if: github.event_name == 'pull_request_target'
-        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
-        with:
-          paths: "${{ github.workspace }}/merged-test-results.xml"
-          output: test-summary.md
-
-      - name: "PR - Upload Test Summary"
-        id: pr_test_summary_upload
-        if: github.event_name == 'pull_request_target'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: test-summary
-          path: test-summary.md
-
-      #### Update PR request ####
-
-      - name: "PR - Update comment"
-        id: pr_update_comment
-        if: github.event_name == 'pull_request_target'
-        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
-        with:
-          filePath: test-summary.md
-
-      ########################
-      #### MANUAL TESTING ####
-      ########################
-
-      #### Run tests ####
-
-      - name: "Manual - Run Tests: Prep"
-        id: manual_run_tests
-        working-directory: "${{ github.workspace }}"
-        if: github.event_name == 'workflow_dispatch'
-        run: |
-          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
-
-          #TODO Use this when collection errors are resolved
-          # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
-
-          # (Optional) Add more tests here.
-
-          # Merge test results with 'merged-test-results.xml' from above.
-          # junit-merge <new-test-results> merged-test-results.xml
-
-      #### Create test summary ####
-
-      - name: "Manual - Test Summary"
-        id: manual_test_summary
-        if: always() && github.event_name == 'workflow_dispatch'
-        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
-        with:
-          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -3,10 +3,10 @@ name: Installer CI
 on:
  pull_request:
    paths:
-      - 'install.sh'
+      - 'scripts/install.sh'
  push:
    paths:
-      - 'install.sh'
+      - 'scripts/install.sh'
  schedule:
    - cron: '0 2 * * *'  # every day at 02:00 UTC

@ -16,11 +16,11 @@ jobs:
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
-        run: shellcheck install.sh
+        run: shellcheck scripts/install.sh
  smoke-test:
    needs: lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run installer end-to-end
-        run: ./install.sh
+        run: ./scripts/install.sh
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -35,7 +35,7 @@ jobs:

      - name: Install minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
-        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
+        uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20

      - name: Start minikube
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
@ -73,9 +73,12 @@ jobs:
          server:
            port: 8321
          EOF
-          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
-          yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
-          yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}", "token": "${{ env.TOKEN }}"}' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.type = "${{ matrix.auth-provider }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.tls_cafile = "${{ env.KUBERNETES_CA_CERT_PATH }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.issuer = "${{ env.KUBERNETES_ISSUER }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.audience = "${{ env.KUBERNETES_AUDIENCE }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.jwks.uri = "${{ env.KUBERNETES_API_SERVER_URL }}"' -i $run_dir/run.yaml
+          yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
          cat $run_dir/run.yaml

          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
--- a/.github/workflows/integration-sql-store-tests.yml
+++ b/.github/workflows/integration-sql-store-tests.yml
@ -0,0 +1,70 @@
+name: SqlStore Integration Tests
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/providers/utils/sqlstore/**'
+      - 'tests/integration/sqlstore/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/integration-sql-store-tests.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-postgres:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12", "3.13"]
+      fail-fast: false
+
+    services:
+      postgres:
+        image: postgres:15
+        env:
+          POSTGRES_USER: llamastack
+          POSTGRES_PASSWORD: llamastack
+          POSTGRES_DB: llamastack
+        ports:
+          - 5432:5432
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Run SqlStore Integration Tests
+        env:
+          ENABLE_POSTGRES_TESTS: "true"
+          POSTGRES_HOST: localhost
+          POSTGRES_PORT: 5432
+          POSTGRES_DB: llamastack
+          POSTGRES_USER: llamastack
+          POSTGRES_PASSWORD: llamastack
+        run: |
+          uv run pytest -sv tests/integration/providers/utils/sqlstore/
+
+      - name: Upload test logs
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
+          path: |
+            *.log
+          retention-days: 1
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -7,27 +7,54 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - 'tests/integration/**'
+      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow
+      - '.github/actions/setup-ollama/action.yml'
+  schedule:
+    - cron: '0 0 * * *'  # Daily at 12 AM UTC
+  workflow_dispatch:
+    inputs:
+      test-all-client-versions:
+        description: 'Test against both the latest and published versions'
+        type: boolean
+        default: false

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

 jobs:
-  test-matrix:
+  discover-tests:
    runs-on: ubuntu-latest
+    outputs:
+      test-type: ${{ steps.generate-matrix.outputs.test-type }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test matrix
+        id: generate-matrix
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+            grep -Ev "^(__pycache__|fixtures|test_cases)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT
+
+  test-matrix:
+    needs: discover-tests
+    runs-on: ubuntu-latest
+
    strategy:
+      fail-fast: false
      matrix:
-        # Listing tests manually since some of them currently fail
-        # TODO: generate matrix list from tests/integration when fixed
-        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
+        test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
        client-type: [library, server]
        python-version: ["3.12", "3.13"]
-      fail-fast: false # we want to run all tests regardless of failure
+        client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}

    steps:
      - name: Checkout repository
@ -37,6 +64,7 @@ jobs:
        uses: ./.github/actions/setup-runner
        with:
          python-version: ${{ matrix.python-version }}
+          client-version: ${{ matrix.client-version }}

      - name: Setup ollama
        uses: ./.github/actions/setup-ollama
@ -53,10 +81,15 @@ jobs:

      - name: Run Integration Tests
        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-          OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" # for library tests
-          ENABLE_OLLAMA: "ollama" # for library tests
+          OLLAMA_INFERENCE_MODEL: "llama3.2:3b-instruct-fp16" # for server tests
+          ENABLE_OLLAMA: "ollama" # for server tests
          OLLAMA_URL: "http://0.0.0.0:11434"
+          SAFETY_MODEL: "llama-guard3:1b"
+          LLAMA_STACK_CLIENT_TIMEOUT: "300" # Increased timeout for eval operations
+        # Use 'shell' to get pipefail behavior
+        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
+        # TODO: write a precommit hook to detect if a test contains a pipe but does not use 'shell: bash'
+        shell: bash
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="starter"
@ -65,8 +98,9 @@ jobs:
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
-            --text-model="ollama/meta-llama/Llama-3.2-3B-Instruct" \
+            --text-model="ollama/llama3.2:3b-instruct-fp16" \
            --embedding-model=all-MiniLM-L6-v2 \
+            --safety-shield=$SAFETY_MODEL \
            --color=yes \
            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log

@ -85,7 +119,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
          path: |
            *.log
          retention-days: 1
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -1,69 +0,0 @@
-name: auto-tests
-
-on:
-  # pull_request:
-  workflow_dispatch:
-    inputs:
-      commit_sha:
-        description: 'Specific Commit SHA to trigger on'
-        required: false
-        default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
-
-jobs:
-  test-llama-stack-as-library:
-    runs-on: ubuntu-latest
-    env:
-      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
-      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
-      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
-    strategy:
-      matrix:
-        provider: [fireworks, together]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          ref: ${{ github.event.inputs.commit_sha }}
-
-      - name: Echo commit SHA
-        run: |
-          echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
-          git rev-parse HEAD
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt pytest
-          pip install -e .
-
-      - name: Build providers
-        run: |
-          llama stack build --template ${{ matrix.provider }} --image-type venv
-
-      - name: Install the latest llama-stack-client & llama-models packages
-        run: |
-          pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
-          pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
-
-      - name: Run client-sdk test
-        working-directory: "${{ github.workspace }}"
-        env:
-          REPORT_OUTPUT: md_report.md
-        shell: bash
-        run: |
-          pip install --upgrade pytest-md-report
-          echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
-
-          export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-          LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
-
-      - name: Output reports to the job summary
-        if: always()
-        shell: bash
-        run: |
-          if [ -f "$REPORT_FILE" ]; then
-            echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            echo "</details>" >> $GITHUB_STEP_SUMMARY
-          fi
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -36,7 +36,7 @@ jobs:

      - name: Run unit tests
        run: |
-          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
+          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml

      - name: Upload test results
        if: always()
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -29,7 +29,7 @@ repos:
    -   id: check-toml

 -   repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
    hooks:
    -   id: insert-license
        files: \.py$|\.sh$
@ -38,7 +38,7 @@ repos:
          - docs/license_header.txt

 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.4
+    rev: v0.12.2
    hooks:
    -   id: ruff
        args: [ --fix ]
@ -46,14 +46,14 @@ repos:
    -   id: ruff-format

 -   repo: https://github.com/adamchainz/blacken-docs
-    rev: 1.19.0
+    rev: 1.19.1
    hooks:
    -   id: blacken-docs
        additional_dependencies:
        - black==24.3.0

 -   repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.7.8
+    rev: 0.7.20
    hooks:
    -   id: uv-lock
    -   id: uv-export
@ -66,7 +66,7 @@ repos:
        ]

 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.1
    hooks:
    -   id: mypy
        additional_dependencies:
@ -129,7 +129,28 @@ repos:
        require_serial: true
        always_run: true
        files: ^llama_stack/.*$
+      - id: forbid-pytest-asyncio
+        name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
+        entry: bash
+        language: system
+        types: [python]
+        pass_filenames: true
+        args:
+          - -c
+          - |
+            grep -EnH '^[^#]*@pytest\.mark\.asyncio|@pytest_asyncio\.fixture' "$@" && {
+              echo;
+              echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
+              echo "   pytest is already configured with async-mode=auto."
+              echo;
+              exit 1;
+            } || true

 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
    autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
+    autofix_prs: true
+    autoupdate_branch: ''
+    autoupdate_schedule: weekly
+    skip: []
+    submodules: false
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -66,7 +66,7 @@ You can install the dependencies by running:

 ```bash
 cd llama-stack
-uv sync --extra dev
+uv sync --group dev
 uv pip install -e .
 source .venv/bin/activate
 ```
@ -112,7 +112,7 @@ uv run pre-commit run --all-files

 ## Running tests

-You can find the Llama Stack testing documentation here [here](tests/README.md).
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).

 ## Adding a new dependency to the project

@ -168,7 +168,7 @@ manually as they are auto-generated.

 ### Updating the provider documentation

-If you have made changes to a provider's configuration, you should run `./scripts/distro_codegen.py`
+If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
 to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
 as they are auto-generated.
 Note that the provider "description" field will be used to generate the provider documentation.
--- a/README.md
+++ b/README.md
@ -6,6 +6,7 @@
 [![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
 [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
+![coverage badge](./coverage.svg)

 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

@ -77,7 +78,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
 To try Llama Stack locally, run:

 ```bash
-curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | bash
+curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
 ```

 ### Overview
--- a/coverage.svg
+++ b/coverage.svg
@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
+    <linearGradient id="b" x2="0" y2="100%">
+        <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
+        <stop offset="1" stop-opacity=".1"/>
+    </linearGradient>
+    <mask id="a">
+        <rect width="99" height="20" rx="3" fill="#fff"/>
+    </mask>
+    <g mask="url(#a)">
+        <path fill="#555" d="M0 0h63v20H0z"/>
+        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
+        <path fill="url(#b)" d="M0 0h99v20H0z"/>
+    </g>
+    <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
+        <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
+        <text x="31.5" y="14">coverage</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
+        <text x="80" y="14">44%</text>
+    </g>
+</svg>
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -11132,8 +11132,38 @@
                "title": "Trace"
            },
            "Checkpoint": {
-                "description": "Checkpoint created during training runs",
-                "title": "Checkpoint"
+                "type": "object",
+                "properties": {
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "created_at": {
+                        "type": "string",
+                        "format": "date-time"
+                    },
+                    "epoch": {
+                        "type": "integer"
+                    },
+                    "post_training_job_id": {
+                        "type": "string"
+                    },
+                    "path": {
+                        "type": "string"
+                    },
+                    "training_metrics": {
+                        "$ref": "#/components/schemas/PostTrainingMetric"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "identifier",
+                    "created_at",
+                    "epoch",
+                    "post_training_job_id",
+                    "path"
+                ],
+                "title": "Checkpoint",
+                "description": "Checkpoint created during training runs"
            },
            "PostTrainingJobArtifactsResponse": {
                "type": "object",
@ -11156,6 +11186,31 @@
                "title": "PostTrainingJobArtifactsResponse",
                "description": "Artifacts of a finetuning job."
            },
+            "PostTrainingMetric": {
+                "type": "object",
+                "properties": {
+                    "epoch": {
+                        "type": "integer"
+                    },
+                    "train_loss": {
+                        "type": "number"
+                    },
+                    "validation_loss": {
+                        "type": "number"
+                    },
+                    "perplexity": {
+                        "type": "number"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "epoch",
+                    "train_loss",
+                    "validation_loss",
+                    "perplexity"
+                ],
+                "title": "PostTrainingMetric"
+            },
            "PostTrainingJobStatusResponse": {
                "type": "object",
                "properties": {
@ -11285,6 +11340,9 @@
                    },
                    "embedding_dimension": {
                        "type": "integer"
+                    },
+                    "vector_db_name": {
+                        "type": "string"
                    }
                },
                "additionalProperties": false,
@ -13535,10 +13593,6 @@
                    "provider_id": {
                        "type": "string",
                        "description": "The ID of the provider to use for this vector store."
-                    },
-                    "provider_vector_db_id": {
-                        "type": "string",
-                        "description": "The provider-specific vector database ID."
                    }
                },
                "additionalProperties": false,
@ -14741,7 +14795,8 @@
                        "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
                    },
                    "mode": {
-                        "type": "string",
+                        "$ref": "#/components/schemas/RAGSearchMode",
+                        "default": "vector",
                        "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
                    },
                    "ranker": {
@ -14776,6 +14831,16 @@
                    }
                }
            },
+            "RAGSearchMode": {
+                "type": "string",
+                "enum": [
+                    "vector",
+                    "keyword",
+                    "hybrid"
+                ],
+                "title": "RAGSearchMode",
+                "description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
+            },
            "RRFRanker": {
                "type": "object",
                "properties": {
@ -15568,6 +15633,10 @@
                        "type": "string",
                        "description": "The identifier of the provider."
                    },
+                    "vector_db_name": {
+                        "type": "string",
+                        "description": "The name of the vector database."
+                    },
                    "provider_vector_db_id": {
                        "type": "string",
                        "description": "The identifier of the vector database in the provider."
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -7838,8 +7838,30 @@ components:
        - start_time
      title: Trace
    Checkpoint:
-      description: Checkpoint created during training runs
+      type: object
+      properties:
+        identifier:
+          type: string
+        created_at:
+          type: string
+          format: date-time
+        epoch:
+          type: integer
+        post_training_job_id:
+          type: string
+        path:
+          type: string
+        training_metrics:
+          $ref: '#/components/schemas/PostTrainingMetric'
+      additionalProperties: false
+      required:
+        - identifier
+        - created_at
+        - epoch
+        - post_training_job_id
+        - path
      title: Checkpoint
+      description: Checkpoint created during training runs
    PostTrainingJobArtifactsResponse:
      type: object
      properties:
@ -7855,6 +7877,24 @@ components:
        - checkpoints
      title: PostTrainingJobArtifactsResponse
      description: Artifacts of a finetuning job.
+    PostTrainingMetric:
+      type: object
+      properties:
+        epoch:
+          type: integer
+        train_loss:
+          type: number
+        validation_loss:
+          type: number
+        perplexity:
+          type: number
+      additionalProperties: false
+      required:
+        - epoch
+        - train_loss
+        - validation_loss
+        - perplexity
+      title: PostTrainingMetric
    PostTrainingJobStatusResponse:
      type: object
      properties:
@ -7944,6 +7984,8 @@ components:
          type: string
        embedding_dimension:
          type: integer
+        vector_db_name:
+          type: string
      additionalProperties: false
      required:
        - identifier
@ -9454,10 +9496,6 @@ components:
          type: string
          description: >-
            The ID of the provider to use for this vector store.
-        provider_vector_db_id:
-          type: string
-          description: >-
-            The provider-specific vector database ID.
      additionalProperties: false
      required:
        - name
@ -10306,7 +10344,8 @@ components:
            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
            {chunk.content}\nMetadata: {metadata}\n"
        mode:
-          type: string
+          $ref: '#/components/schemas/RAGSearchMode'
+          default: vector
          description: >-
            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
            "vector".
@ -10333,6 +10372,17 @@ components:
        mapping:
          default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
          llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
+    RAGSearchMode:
+      type: string
+      enum:
+        - vector
+        - keyword
+        - hybrid
+      title: RAGSearchMode
+      description: >-
+        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
+        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
+        - HYBRID: Combines both vector and keyword search for better results
    RRFRanker:
      type: object
      properties:
@ -10893,6 +10943,9 @@ components:
        provider_id:
          type: string
          description: The identifier of the provider.
+        vector_db_name:
+          type: string
+          description: The name of the vector database.
        provider_vector_db_id:
          type: string
          description: >-
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@ -55,7 +55,7 @@
        "\n",
        "MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n",
        "# get meta url from llama.com\n",
-        "!uv run --with llama-stackllama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
+        "!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
        "\n",
        "model_id = f\"meta-llama/{MODEL}\""
      ]
--- a/rfcs/RFC-0001-llama-stack.md
+++ b/rfcs/RFC-0001-llama-stack.md
@ -1,5 +1,7 @@
 # The Llama Stack API

+*Originally authored Jul 23, 2024*
+
 **Authors:**

 * Meta: @raghotham, @ashwinb, @hjshah, @jspisak
@ -24,7 +26,7 @@ Meta releases weights of both the pretrained and instruction fine-tuned Llama mo

 ### Model Lifecycle

-![Figure 1: Model Life Cycle](../docs/resources/model-lifecycle.png)
+![Figure 1: Model Life Cycle](resources/model-lifecycle.png)

 For each of the operations that need to be performed (e.g. fine tuning, inference, evals etc) during the model life cycle, we identified the capabilities as toolchain APIs that are needed. Some of these capabilities are primitive operations like inference while other capabilities like synthetic data generation are composed of other capabilities. The list of APIs we have identified to support the lifecycle of Llama models is below:

@ -37,7 +39,7 @@ For each of the operations that need to be performed (e.g. fine tuning, inferenc

 ### Agentic System

-![Figure 2: Agentic System](../docs/resources/agentic-system.png)
+![Figure 2: Agentic System](resources/agentic-system.png)

 In addition to the model lifecycle, we considered the different components involved in an agentic system. Specifically around tool calling and shields. Since the model may decide to call tools, a single model inference call is not enough. What’s needed is an agentic loop consisting of tool calls and inference. The model provides separate tokens representing end-of-message and end-of-turn. A message represents a possible stopping point for execution where the model can inform the execution environment that a tool call needs to be made. The execution environment, upon execution, adds back the result to the context window and makes another inference call. This process can get repeated until an end-of-turn token is generated.
 Note that as of today, in the OSS world, such a “loop” is often coded explicitly via elaborate prompt engineering using a ReAct pattern (typically) or preconstructed execution graph. Llama 3.1 (and future Llamas) attempts to absorb this multi-step reasoning loop inside the main model itself.
@ -63,9 +65,9 @@ The sequence diagram that details the steps is [here](https://github.com/meta-ll

 We define the Llama Stack as a layer cake shown below.

-![Figure 3: Llama Stack](../docs/resources/llama-stack.png)
+![Figure 3: Llama Stack](resources/llama-stack.png)

-The API is defined in the [YAML](../docs/_static/llama-stack-spec.yaml) and [HTML](../docs/_static/llama-stack-spec.html) files.
+The API is defined in the [YAML](_static/llama-stack-spec.yaml) and [HTML](_static/llama-stack-spec.html) files.

 ## Sample implementations

--- a/docs/quick_start.ipynb
+++ b/docs/quick_start.ipynb
@ -145,12 +145,12 @@
        "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
        "\n",
        "# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
-        "!uv run --with llama-stack llama stack build --template ollama --image-type venv --image-name myvenv\n",
+        "!uv run --with llama-stack llama stack build --template starter --image-type venv\n",
        "\n",
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
        "    process = subprocess.Popen(\n",
-        "        f\"uv run --with llama-stack llama stack run ollama --image-type venv --image-name myvenv --env INFERENCE_MODEL=llama3.2:3b\",\n",
+        "        f\"uv run --with llama-stack llama stack run starter --image-type venv --env INFERENCE_MODEL=llama3.2:3b\",\n",
        "        shell=True,\n",
        "        stdout=log_file,\n",
        "        stderr=log_file,\n",
@ -249,18 +249,23 @@
      ],
      "source": [
        "from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
+        "import os\n",
+        "\n",
+        "os.environ[\"ENABLE_OLLAMA\"] = \"ollama\"\n",
+        "os.environ[\"OLLAMA_INFERENCE_MODEL\"] = \"llama3.2:3b\"\n",
+        "os.environ[\"OLLAMA_EMBEDDING_MODEL\"] = \"all-minilm:l6-v2\"\n",
+        "os.environ[\"OLLAMA_EMBEDDING_DIMENSION\"] = \"384\"\n",
        "\n",
        "vector_db_id = \"my_demo_vector_db\"\n",
        "client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
        "\n",
        "models = client.models.list()\n",
        "\n",
-        "# Select the first LLM and first embedding models\n",
-        "model_id = next(m for m in models if m.model_type == \"llm\").identifier\n",
-        "embedding_model_id = (\n",
-        "    em := next(m for m in models if m.model_type == \"embedding\")\n",
-        ").identifier\n",
-        "embedding_dimension = em.metadata[\"embedding_dimension\"]\n",
+        "# Select the first Ollama LLM and the first Ollama embedding model\n",
+        "model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
+        "embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
+        "embedding_model_id = embedding_model.identifier\n",
+        "embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
        "\n",
        "_ = client.vector_dbs.register(\n",
        "    vector_db_id=vector_db_id,\n",
--- a/docs/source/advanced_apis/eval/index.md
+++ b/docs/source/advanced_apis/eval/index.md
@ -0,0 +1,6 @@
+# Eval Providers
+
+This section contains documentation for all available providers for the **eval** API.
+
+- [inline::meta-reference](inline_meta-reference.md)
+- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/eval/inline_meta-reference.md
+++ b/docs/source/advanced_apis/eval/inline_meta-reference.md
@ -0,0 +1,21 @@
+# inline::meta-reference
+
+## Description
+
+Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |
+
+## Sample Configuration
+
+```yaml
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
+
+```
+
--- a/docs/source/advanced_apis/eval/remote_nvidia.md
+++ b/docs/source/advanced_apis/eval/remote_nvidia.md
@ -0,0 +1,19 @@
+# remote::nvidia
+
+## Description
+
+NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
+
+## Sample Configuration
+
+```yaml
+evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
+
+```
+
--- a/docs/source/advanced_apis/evaluation_concepts.md
+++ b/docs/source/advanced_apis/evaluation_concepts.md
--- a/docs/source/advanced_apis/index.md
+++ b/docs/source/advanced_apis/index.md
@ -0,0 +1,33 @@
+# Advanced APIs
+
+## Post-training
+Fine-tunes a model.
+
+```{toctree}
+:maxdepth: 1
+
+post_training/index
+```
+
+## Eval
+Generates outputs (via Inference or Agents) and performs scoring.
+
+```{toctree}
+:maxdepth: 1
+
+eval/index
+```
+
+```{include} evaluation_concepts.md
+:start-after: ## Evaluation Concepts
+```
+
+## Scoring
+Evaluates the outputs of the system.
+
+```{toctree}
+:maxdepth: 1
+
+scoring/index
+```
+
--- a/docs/source/advanced_apis/post_training/huggingface.md
+++ b/docs/source/advanced_apis/post_training/huggingface.md
--- a/docs/source/advanced_apis/post_training/index.md
+++ b/docs/source/advanced_apis/post_training/index.md
@ -0,0 +1,7 @@
+# Post_Training Providers
+
+This section contains documentation for all available providers for the **post_training** API.
+
+- [inline::huggingface](inline_huggingface.md)
+- [inline::torchtune](inline_torchtune.md)
+- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/post_training/inline_huggingface.md
+++ b/docs/source/advanced_apis/post_training/inline_huggingface.md
@ -0,0 +1,33 @@
+# inline::huggingface
+
+## Description
+
+HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `device` | `<class 'str'>` | No | cuda |  |
+| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No |  |  |
+| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface |  |
+| `chat_template` | `<class 'str'>` | No | |
+| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} |  |
+| `max_seq_length` | `<class 'int'>` | No | 2048 |  |
+| `gradient_checkpointing` | `<class 'bool'>` | No | False |  |
+| `save_total_limit` | `<class 'int'>` | No | 3 |  |
+| `logging_steps` | `<class 'int'>` | No | 10 |  |
+| `warmup_ratio` | `<class 'float'>` | No | 0.1 |  |
+| `weight_decay` | `<class 'float'>` | No | 0.01 |  |
+| `dataloader_num_workers` | `<class 'int'>` | No | 4 |  |
+| `dataloader_pin_memory` | `<class 'bool'>` | No | True |  |
+
+## Sample Configuration
+
+```yaml
+checkpoint_format: huggingface
+distributed_backend: null
+device: cpu
+
+```
+
--- a/docs/source/advanced_apis/post_training/inline_torchtune.md
+++ b/docs/source/advanced_apis/post_training/inline_torchtune.md
@ -0,0 +1,20 @@
+# inline::torchtune
+
+## Description
+
+TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `torch_seed` | `int \| None` | No |  |  |
+| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta |  |
+
+## Sample Configuration
+
+```yaml
+checkpoint_format: meta
+
+```
+
--- a/docs/source/advanced_apis/post_training/nvidia_nemo.md
+++ b/docs/source/advanced_apis/post_training/nvidia_nemo.md
--- a/docs/source/advanced_apis/post_training/remote_nvidia.md
+++ b/docs/source/advanced_apis/post_training/remote_nvidia.md
@ -0,0 +1,28 @@
+# remote::nvidia
+
+## Description
+
+NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `api_key` | `str \| None` | No |  | The NVIDIA API key. |
+| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
+| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
+| `customizer_url` | `str \| None` | No |  | Base URL for the NeMo Customizer API |
+| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
+| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
+| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
+
+## Sample Configuration
+
+```yaml
+api_key: ${env.NVIDIA_API_KEY:=}
+dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
+project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
+customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
+
+```
+
--- a/docs/source/advanced_apis/post_training/torchtune.md
+++ b/docs/source/advanced_apis/post_training/torchtune.md
--- a/docs/source/advanced_apis/scoring/index.md
+++ b/docs/source/advanced_apis/scoring/index.md
@ -0,0 +1,7 @@
+# Scoring Providers
+
+This section contains documentation for all available providers for the **scoring** API.
+
+- [inline::basic](inline_basic.md)
+- [inline::braintrust](inline_braintrust.md)
+- [inline::llm-as-judge](inline_llm-as-judge.md)
--- a/docs/source/advanced_apis/scoring/inline_basic.md
+++ b/docs/source/advanced_apis/scoring/inline_basic.md
@ -0,0 +1,13 @@
+# inline::basic
+
+## Description
+
+Basic scoring provider for simple evaluation metrics and scoring functions.
+
+## Sample Configuration
+
+```yaml
+{}
+
+```
+
--- a/docs/source/advanced_apis/scoring/inline_braintrust.md
+++ b/docs/source/advanced_apis/scoring/inline_braintrust.md
@ -0,0 +1,19 @@
+# inline::braintrust
+
+## Description
+
+Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `openai_api_key` | `str \| None` | No |  | The OpenAI API Key |
+
+## Sample Configuration
+
+```yaml
+openai_api_key: ${env.OPENAI_API_KEY:=}
+
+```
+
--- a/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
+++ b/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
@ -0,0 +1,13 @@
+# inline::llm-as-judge
+
+## Description
+
+LLM-as-judge scoring provider that uses language models to evaluate and score responses.
+
+## Sample Configuration
+
+```yaml
+{}
+
+```
+
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# Building AI Applications (Examples)
+# AI Application Examples

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@ -27,4 +27,5 @@ tools
 evals
 telemetry
 safety
+playground/index
 ```
--- a/docs/source/building_applications/playground/index.md
+++ b/docs/source/building_applications/playground/index.md
@ -1,4 +1,4 @@
-# Llama Stack Playground
+## Llama Stack Playground

 ```{note}
 The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
 - Demo **end-to-end** application code to help users get started to build their own applications
 - Provide an **UI** to help users inspect and understand Llama Stack API providers and resources

-## Key Features
+### Key Features

 #### Playground
 Interactive pages for users to play with and explore Llama Stack API capabilities.
@ -90,7 +90,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
  - Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
  - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.

-## Starting the Llama Stack Playground
+### Starting the Llama Stack Playground

 To start the Llama Stack Playground, run the following commands:

--- a/docs/source/concepts/architecture.md
+++ b/docs/source/concepts/architecture.md
@ -1,31 +1,39 @@
-# Why Llama Stack?
+## Llama Stack architecture

-Building production AI applications today requires solving multiple challenges:
-
-**Infrastructure Complexity**
- Running large language models efficiently requires specialized infrastructure.
- Different deployment scenarios (local development, cloud, edge) need different solutions.
- Moving from development to production often requires significant rework.
-
-**Essential Capabilities**
- Safety guardrails and content filtering are necessary in an enterprise setting.
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
- Nearly any application needs composable multi-step workflows.
- Finally, without monitoring, observability and evaluation, you end up operating in the dark.
-
-**Lack of Flexibility and Choice**
- Directly integrating with multiple providers creates tight coupling.
- Different providers have different APIs and abstractions.
- Changing providers requires significant code changes.
-
-
-### Our Solution: A Universal Stack
+Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.

 ```{image} ../../_static/llama-stack.png
 :alt: Llama Stack
 :width: 400px
 ```

+### Benefits of Llama Stack
+
+#### Current challenges in custom AI applications
+
+Building production AI applications today requires solving multiple challenges:
+
+**Infrastructure Complexity**
+
+- Running large language models efficiently requires specialized infrastructure.
+- Different deployment scenarios (local development, cloud, edge) need different solutions.
+- Moving from development to production often requires significant rework.
+
+**Essential Capabilities**
+
+- Safety guardrails and content filtering are necessary in an enterprise setting.
+- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
+- Nearly any application needs composable multi-step workflows.
+- Without monitoring, observability and evaluation, you end up operating in the dark.
+
+**Lack of Flexibility and Choice**
+
+- Directly integrating with multiple providers creates tight coupling.
+- Different providers have different APIs and abstractions.
+- Changing providers requires significant code changes.
+
+#### Our Solution: A Universal Stack
+
 Llama Stack addresses these challenges through a service-oriented, API-first approach:

 **Develop Anywhere, Deploy Everywhere**
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -2,6 +2,10 @@

 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.

+```{include} architecture.md
+:start-after: ## Llama Stack architecture
+```
+
 ```{include} apis.md
 :start-after: ## APIs
 ```
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
 :start-after: ## API Providers
 ```

-```{include} resources.md
-:start-after: ## Resources
-```
-
 ```{include} distributions.md
 :start-after: ## Distributions
 ```

-```{include} evaluation_concepts.md
-:start-after: ## Evaluation Concepts
+```{include} resources.md
+:start-after: ## Resources
 ```
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -52,7 +52,18 @@ extensions = [
    "sphinxcontrib.redoc",
    "sphinxcontrib.mermaid",
    "sphinxcontrib.video",
+    "sphinx_reredirects"
 ]
+
+redirects = {
+    "providers/post_training/index": "../../advanced_apis/post_training/index.html",
+    "providers/eval/index": "../../advanced_apis/eval/index.html",
+    "providers/scoring/index": "../../advanced_apis/scoring/index.html",
+    "playground/index": "../../building_applications/playground/index.html",
+    "openai/index": "../../providers/index.html#openai-api-compatibility",
+    "introduction/index": "../concepts/index.html#llama-stack-architecture"
+}
+
 myst_enable_extensions = ["colon_fence"]

 html_theme = "sphinx_rtd_theme"
--- a/docs/source/deploying/index.md
+++ b/docs/source/deploying/index.md
@ -0,0 +1,4 @@
+# Deployment Examples
+
+```{include} kubernetes_deployment.md
+```
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@ -1,10 +1,12 @@
-# Kubernetes Deployment Guide
+## Kubernetes Deployment Guide

 Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.

 ### Prerequisites
 In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.

+Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
+
 First, create a local Kubernetes cluster via Kind:

 ```
@ -217,3 +219,29 @@ Finally, we forward the Kubernetes service to a local port and test some inferen
 kubectl port-forward service/llama-stack-service 5000:5000
 llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
 ```
+
+## Deploying Llama Stack Server in AWS EKS
+
+We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster.
+
+Prerequisites:
+- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html).
+- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) and get the client ID and client secret.
+  - Set the `Authorization callback URL` to `http://<your-llama-stack-ui-url>/api/auth/callback/`
+
+
+Run the following script to deploy the Llama Stack server:
+```
+export HF_TOKEN=<your-huggingface-token>
+export GITHUB_CLIENT_ID=<your-github-client-id>
+export GITHUB_CLIENT_SECRET=<your-github-client-secret>
+export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
+
+cd docs/source/distributions/eks
+./apply.sh
+```
+
+This script will:
+
+- Set up a default storage class for AWS EKS
+- Deploy the Llama Stack server in a Kubernetes Pod and Service
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -145,6 +145,10 @@ $ llama stack build --template starter
 ...
 You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
 ```
+
+```{tip}
+The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
+```
 :::
 :::{tab-item} Building from Scratch

@ -393,17 +397,17 @@ llama stack list
 ```

 ```
------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| Stack Name                  | Path                                                                        | Build Config | Run Config |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| together                    | /home/wenzhou/.llama/distributions/together                                 | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| bedrock                     | /home/wenzhou/.llama/distributions/bedrock                                  | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| starter                     | /home/wenzhou/.llama/distributions/starter                                  | No           | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| remote-vllm                 | /home/wenzhou/.llama/distributions/remote-vllm                              | Yes          | Yes        |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
+------------------------------+-----------------------------------------------------------------+--------------+------------+
+| Stack Name                  | Path                                                            | Build Config | Run Config |
+------------------------------+-----------------------------------------------------------------------------+--------------+
+| together                    | ~/.llama/distributions/together                                 | Yes          | No         |
+------------------------------+-----------------------------------------------------------------------------+--------------+
+| bedrock                     | ~/.llama/distributions/bedrock                                  | Yes          | No         |
+------------------------------+-----------------------------------------------------------------------------+--------------+
+| starter                     | ~/.llama/distributions/starter                                  | Yes          | Yes        |
+------------------------------+-----------------------------------------------------------------------------+--------------+
+| remote-vllm                 | ~/.llama/distributions/remote-vllm                              | Yes          | Yes        |
+------------------------------+-----------------------------------------------------------------------------+--------------+
 ```

 ### Removing a Distribution
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -2,6 +2,10 @@

 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:

+```{note}
+The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
+```
+
 ```{dropdown} 👋 Click here for a Sample Configuration File

 ```yaml
@ -56,8 +60,8 @@ shields: []
 server:
  port: 8321
  auth:
-    provider_type: "oauth2_token"
-    config:
+    provider_config:
+      type: "oauth2_token"
      jwks:
        uri: "https://my-token-issuing-svc.com/jwks"
 ```
@ -226,6 +230,8 @@ server:

 ### Authentication Configuration

+> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.
+
 The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:

 ```
@ -240,8 +246,8 @@ The server can be configured to use service account tokens for authorization, va
 ```yaml
 server:
  auth:
-    provider_type: "oauth2_token"
-    config:
+    provider_config:
+      type: "oauth2_token"
      jwks:
        uri: "https://kubernetes.default.svc:8443/openid/v1/jwks"
        token: "${env.TOKEN:+}"
@ -325,13 +331,25 @@ You can easily validate a request by running:
 curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
 ```

+#### GitHub Token Provider
+Validates GitHub personal access tokens or OAuth tokens directly:
+```yaml
+server:
+  auth:
+    provider_config:
+      type: "github_token"
+      github_api_base_url: "https://api.github.com"  # Or GitHub Enterprise URL
+```
+
+The provider fetches user information from GitHub and maps it to access attributes based on the `claims_mapping` configuration.
+
 #### Custom Provider
 Validates tokens against a custom authentication endpoint:
 ```yaml
 server:
  auth:
-    provider_type: "custom"
-    config:
+    provider_config:
+      type: "custom"
      endpoint: "https://auth.example.com/validate"  # URL of the auth endpoint
 ```

@ -416,8 +434,8 @@ clients.
 server:
  port: 8321
  auth:
-    provider_type: custom
-    config:
+    provider_config:
+      type: custom
      endpoint: https://auth.example.com/validate
  quota:
    kvstore:
--- a/docs/source/distributions/customizing_run_yaml.md
+++ b/docs/source/distributions/customizing_run_yaml.md
@ -0,0 +1,40 @@
+# Customizing run.yaml Files
+
+The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
+
+## Key Points
+
+- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
+- **Customization expected**: Update URLs, credentials, models, and settings for your environment
+- **Version control separately**: Keep customized configs in your own repository
+- **Environment-specific**: Create different configurations for dev, staging, production
+
+## What You Can Customize
+
+You can customize:
+- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
+- **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
+- **Storage paths**: Move from `/tmp/` to production directories
+- **Authentication**: Add API keys, SSL, timeouts
+- **Models**: Different model sizes for dev vs prod
+- **Database settings**: Switch from SQLite to PostgreSQL
+- **Tool configurations**: Add custom tools and integrations
+
+## Best Practices
+
+- Use environment variables for secrets and environment-specific values
+- Create separate `run.yaml` files for different environments (dev, staging, prod)
+- Document your changes with comments
+- Test configurations before deployment
+- Keep your customized configs in version control
+
+Example structure:
+```
+your-project/
+├── configs/
+│   ├── dev-run.yaml
+│   └── prod-run.yaml
+└── README.md
+```
+
+The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
--- a/docs/source/distributions/eks/apply.sh
+++ b/docs/source/distributions/eks/apply.sh
@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+set -euo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+K8S_DIR="${SCRIPT_DIR}/../k8s"
+
+echo "Setting up AWS EKS-specific storage class..."
+kubectl apply -f gp3-topology-aware.yaml
+
+echo "Running main Kubernetes deployment..."
+cd "${K8S_DIR}"
+./apply.sh "$@"
--- a/docs/source/distributions/eks/gp3-topology-aware.yaml
+++ b/docs/source/distributions/eks/gp3-topology-aware.yaml
@ -0,0 +1,15 @@
+# Set up default storage class on AWS EKS
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: gp3-topology-aware
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "true"
+parameters:
+  type: gp3
+  iops: "3000"
+  throughput: "125"
+provisioner: ebs.csi.aws.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -6,13 +6,9 @@ This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
 :maxdepth: 3
-
+list_of_distributions
+building_distro
+customizing_run_yaml
 importing_as_library
 configuration
-list_of_distributions
-kubernetes_deployment
-building_distro
-on_device_distro
-remote_hosted_distro
-self_hosted_distro
 ```
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -6,16 +6,47 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-export POSTGRES_USER=${POSTGRES_USER:-llamastack}
-export POSTGRES_DB=${POSTGRES_DB:-llamastack}
-export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
+export POSTGRES_USER=llamastack
+export POSTGRES_DB=llamastack
+export POSTGRES_PASSWORD=llamastack
+
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+# HF_TOKEN should be set by the user; base64 encode it for the secret
+if [ -n "${HF_TOKEN:-}" ]; then
+  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
+else
+  echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
+  exit 1
+fi
+
+if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
+  echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+

-export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

 set -euo pipefail
 set -x

+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
+fi
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
--- a/docs/source/distributions/k8s/hf-token-secret.yaml.template
+++ b/docs/source/distributions/k8s/hf-token-secret.yaml.template
@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@ -22,10 +22,10 @@ data:
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
-          url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
-          api_token: ${env.VLLM_API_TOKEN:fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
@ -33,7 +33,7 @@ data:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
-          url: ${env.CHROMADB_URL:+}
+          url: ${env.CHROMADB_URL:=}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
@ -48,7 +48,7 @@ data:
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
@ -61,8 +61,8 @@ data:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
-          service_name: ${env.OTEL_SERVICE_NAME:+}
-          sinks: ${env.TELEMETRY_SINKS:console}
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
@ -122,6 +122,9 @@ data:
      provider_id: rag-runtime
    server:
      port: 8321
+      auth:
+        provider_config:
+          type: github_token
 kind: ConfigMap
 metadata:
  creationTimestamp: null
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -27,7 +27,7 @@ spec:
    spec:
      containers:
      - name: llama-stack
-        image: llamastack/distribution-remote-vllm:latest
+        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@ -30,7 +30,7 @@ providers:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
-      url: ${env.CHROMADB_URL:+}
+      url: ${env.CHROMADB_URL:=}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
@ -58,8 +58,8 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
-      service_name: ${env.OTEL_SERVICE_NAME:+console}
-      sinks: ${env.TELEMETRY_SINKS:+console}
+      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:=console}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
@ -119,3 +119,6 @@ tool_groups:
  provider_id: rag-runtime
 server:
  port: 8321
+  auth:
+    provider_config:
+      type: github_token
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@ -26,6 +26,12 @@ spec:
          value: "http://llama-stack-service:8321"
        - name: LLAMA_STACK_UI_PORT
          value: "8322"
+        - name: GITHUB_CLIENT_ID
+          value: "${GITHUB_CLIENT_ID}"
+        - name: GITHUB_CLIENT_SECRET
+          value: "${GITHUB_CLIENT_SECRET}"
+        - name: NEXTAUTH_URL
+          value: "${LLAMA_STACK_UI_URL}:8322"
        args:
          - -c
          - |
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@ -25,23 +25,17 @@ spec:
        app.kubernetes.io/name: vllm
        workload-type: inference
    spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args:
-        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
+        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
        env:
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
@ -49,6 +43,11 @@ spec:
              key: token
        ports:
          - containerPort: 8000
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.cache/huggingface
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@ -6,7 +6,6 @@ spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
-  storageClassName: gp2
  resources:
    requests:
      storage: 30Gi
@ -26,16 +25,8 @@ spec:
        app.kubernetes.io/name: vllm-safety
        workload-type: inference
    spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
      containers:
      - name: vllm-safety
        image: vllm/vllm-openai:latest
@ -44,6 +35,8 @@ spec:
          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
        ]
        env:
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
@ -51,6 +44,11 @@ spec:
              key: token
        ports:
          - containerPort: 8001
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.cache/huggingface
--- a/docs/source/distributions/list_of_distributions.md
+++ b/docs/source/distributions/list_of_distributions.md
@ -39,6 +39,13 @@ docker pull llama-stack/distribution-meta-reference-gpu

 **Guides:** [Meta Reference GPU Guide](self_hosted_distro/meta-reference-gpu)

+### 🖥️ Self-Hosted with NVIDIA NeMo Microservices
+
+**Use `nvidia` if you:**
+- Want to use Llama Stack with NVIDIA NeMo Microservices
+
+**Guides:** [NVIDIA Distribution Guide](self_hosted_distro/nvidia)
+
 ### ☁️ Managed Hosting

 **Use remote-hosted endpoints if you:**
--- a/docs/source/distributions/ondevice_distro/android_sdk.md
+++ b/docs/source/distributions/ondevice_distro/android_sdk.md
@ -13,7 +13,7 @@ Latest Release Notes: [link](https://github.com/meta-llama/llama-stack-client-ko
 *Tagged releases are stable versions of the project. While we strive to maintain a stable main branch, it's not guaranteed to be free of bugs or issues.*

 ## Android Demo App
-Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-client-kotlin/tree/examples/android_app)
+Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app)

 The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlamaStackRemoteInference.kts`, and `MainActivity.java`. With encompassed business logic, the app shows how to use Llama Stack for both the environments.

@ -68,7 +68,7 @@ Ensure the Llama Stack server version is the same as the Kotlin SDK Library for

 Other inference providers: [Table](https://llama-stack.readthedocs.io/en/latest/index.html#supported-llama-stack-implementations)

-How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#settings)
+How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#settings)

 ### Initialize the Client
 A client serves as the primary interface for interacting with a specific inference type and its associated parameters. Only after client is initialized then you can configure and start inferences.
@ -135,7 +135,7 @@ val result = client!!.inference().chatCompletionStreaming(

 ### Setup Custom Tool Calling

-Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling)
+Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#tool-calling)

 ## Advanced Users

--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -0,0 +1,177 @@
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
+# NVIDIA Distribution
+
+The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `inline::localfs`, `remote::nvidia` |
+| eval | `remote::nvidia` |
+| inference | `remote::nvidia` |
+| post_training | `remote::nvidia` |
+| safety | `remote::nvidia` |
+| scoring | `inline::basic` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `inline::rag-runtime` |
+| vector_io | `inline::faiss` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
+- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
+- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
+- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
+- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
+- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
+- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
+- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
+- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
+- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
+
+### Models
+
+The following models are available by default:
+
+- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
+- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
+- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
+- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
+- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
+- `nvidia/nv-embedqa-e5-v5 `
+- `nvidia/nv-embedqa-mistral-7b-v2 `
+- `snowflake/arctic-embed-l `
+
+
+## Prerequisites
+### NVIDIA API Keys
+
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+  1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
+  2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
+
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
+
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
+
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+   -H 'accept: application/json' \
+   -H 'Content-Type: application/json' \
+   -d '{
+      "name": "llama-3.2-1b-instruct",
+      "namespace": "meta",
+      "config": {
+         "model": "meta/llama-3.2-1b-instruct",
+         "nim_deployment": {
+            "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+            "image_tag": "1.8.3",
+            "pvc_size": "25Gi",
+            "gpu": 1,
+            "additional_envs": {
+               "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+            }
+         }
+      }
+   }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
+
+## Running Llama Stack with NVIDIA
+
+You can run Llama Stack with NVIDIA via Conda or venv (both build the distribution code locally), or via Docker, which provides a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-nvidia \
+  --config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+
+### Via venv
+
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+
+## Example Notebooks
+For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
--- a/docs/source/distributions/self_hosted_distro/starter.md
+++ b/docs/source/distributions/self_hosted_distro/starter.md
@ -17,18 +17,18 @@ The `llamastack/distribution-starter` distribution is a comprehensive, multi-pro

 The starter distribution consists of the following provider configurations:

-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| files | `inline::localfs` |
+| API | Provider(s)                                                                                                                                                                                                                                                                                                                                    |
+|-----|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| agents | `inline::meta-reference`                                                                                                                                                                                                                                                                                                                       |
+| datasetio | `remote::huggingface`, `inline::localfs`                                                                                                                                                                                                                                                                                                       |
+| eval | `inline::meta-reference`                                                                                                                                                                                                                                                                                                                       |
+| files | `inline::localfs`                                                                                                                                                                                                                                                                                                                              |
 | inference | `remote::openai`, `remote::fireworks`, `remote::together`, `remote::ollama`, `remote::anthropic`, `remote::gemini`, `remote::groq`, `remote::sambanova`, `remote::vllm`, `remote::tgi`, `remote::cerebras`, `remote::llama-openai-compat`, `remote::nvidia`, `remote::hf::serverless`, `remote::hf::endpoint`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
-| vector_io | `inline::faiss`, `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
+| safety | `inline::llama-guard`                                                                                                                                                                                                                                                                                                                          |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust`                                                                                                                                                                                                                                                                                  |
+| telemetry | `inline::meta-reference`                                                                                                                                                                                                                                                                                                                       |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`                                                                                                                                                                                                                                       |
+| vector_io | `inline::faiss`, `inline::sqlite-vec`, `inline::milvus`, `remote::chromadb`, `remote::pgvector`                                                                                                                                                                                                                                                 |

 ## Inference Providers

@ -167,7 +167,7 @@ When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`),

 ## Running the Distribution

-You can run the starter distribution via Docker or Conda.
+You can run the starter distribution via Docker, Conda, or venv.

 ### Via Docker

@ -186,17 +186,12 @@ docker run \
  --port $LLAMA_STACK_PORT
 ```

-### Via Conda
+### Via Conda or venv

-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
+Ensure you have configured the starter distribution using the environment variables explained above.

 ```bash
-llama stack build --template starter --image-type conda
-llama stack run distributions/starter/run.yaml \
-  --port 8321 \
-  --env OPENAI_API_KEY=your_openai_key \
-  --env FIREWORKS_API_KEY=your_fireworks_key \
-  --env TOGETHER_API_KEY=your_together_key
+uv run --with llama-stack llama stack build --template starter --image-type <conda|venv> --run
 ```

 ## Example Usage
--- a/docs/source/distributions/starting_llama_stack_server.md
+++ b/docs/source/distributions/starting_llama_stack_server.md
@ -28,5 +28,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste

 importing_as_library
 configuration
-kubernetes_deployment
 ```
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@ -1,4 +1,4 @@
-# Detailed Tutorial
+## Detailed Tutorial

 In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
@ -10,7 +10,7 @@ Llama Stack is a stateful service with REST APIs to support seamless transition
 In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
 as the inference [provider](../providers/index.md#inference) for a Llama Model.

-## Step 1: Installation and Setup
+### Step 1: Installation and Setup

 Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
 download Llama 3.2 3B model, and then start the Ollama service.
@ -42,10 +42,10 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 Setup your virtual environment.

 ```bash
-uv sync --python 3.10
+uv sync --python 3.12
 source .venv/bin/activate
 ```
-## Step 2:  Run Llama Stack
+### Step 2:  Run Llama Stack
 Llama Stack is a server that exposes multiple APIs, you connect with it using the Llama Stack client SDK.

 ::::{tab-set}
@ -54,11 +54,12 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
 You can use Python to build and run the Llama Stack server, which is useful for testing and development.

 Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
-which defines the providers and their settings.
+which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
 Now let's build and run the Llama Stack config for Ollama.
+We use `starter` as the template. By default, all providers are disabled, so you need to enable Ollama by passing environment variables.

 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type venv --run
+ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type venv --run
 ```
 :::
 :::{tab-item} Using `conda`
@ -69,17 +70,18 @@ which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.

 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type conda  --image-name llama3-3b-conda --run
+ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type conda --run
 ```
 :::
 :::{tab-item} Using a Container
 You can use a container image to run the Llama Stack server. We provide several container images for the server
 component that works with different inference providers out of the box. For this guide, we will use
-`llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the
-configurations, please check out [this guide](../references/index.md).
+`llamastack/distribution-starter` as the container image. If you'd like to build your own image or customize the
+configurations, please check out [this guide](../distributions/building_distro.md).
 First lets setup some environment variables and create a local directory to mount into the container’s file system.
 ```bash
 export INFERENCE_MODEL="llama3.2:3b"
+export ENABLE_OLLAMA=ollama
 export LLAMA_STACK_PORT=8321
 mkdir -p ~/.llama
 ```
@ -90,7 +92,7 @@ docker run -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
-  llamastack/distribution-ollama \
+  llamastack/distribution-starter \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
@ -112,7 +114,7 @@ docker run -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  --network=host \
-  llamastack/distribution-ollama \
+  llamastack/distribution-starter \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://localhost:11434
@ -130,7 +132,7 @@ Now you can use the Llama Stack client to run inference and build agents!
 You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
 Note that the client package is already included in the `llama-stack` package.

-## Step 3: Run Client CLI
+### Step 3: Run Client CLI

 Open a new terminal and navigate to the same directory you started the server from. Then set up a new or activate your
 existing server virtual environment.
@ -146,7 +148,7 @@ source .venv/bin/activate

 :::{tab-item} Install with `venv`
 ```bash
-uv venv client --python 3.10
+uv venv client --python 3.12
 source client/bin/activate
 pip install llama-stack-client
 ```
@ -154,7 +156,7 @@ pip install llama-stack-client

 :::{tab-item} Install with `conda`
 ```bash
-yes | conda create -n stack-client python=3.10
+yes | conda create -n stack-client python=3.12
 conda activate stack-client
 pip install llama-stack-client
 ```
@ -177,41 +179,60 @@ List the models
 llama-stack-client models list
 Available Models

-┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
-┃ model_type      ┃ identifier                          ┃ provider_resource_id                ┃ metadata                                  ┃ provider_id     ┃
-┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
-│ embedding       │ all-MiniLM-L6-v2                    │ all-minilm:latest                   │ {'embedding_dimension': 384.0}            │ ollama          │
-├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼─────────────────┤
-│ llm             │ llama3.2:3b                         │ llama3.2:3b                         │                                           │ ollama          │
-└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴─────────────────┘
-
-Total models: 2
+┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ model_type      ┃ identifier                          ┃ provider_resource_id                ┃ metadata                                  ┃ provider_id           ┃
+┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ embedding       │ ollama/all-minilm:l6-v2             │ all-minilm:l6-v2                    │ {'embedding_dimension': 384.0}            │ ollama                │
+├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
+│ ...             │ ...                                 │ ...                                 │                                           │ ...                   │
+├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
+│ llm             │ ollama/llama3.2:3b                  │ llama3.2:3b                         │                                           │ ollama                │
+└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────┘

 ```
 You can test basic Llama inference completion using the CLI.

 ```bash
-llama-stack-client inference chat-completion --message "tell me a joke"
+llama-stack-client inference chat-completion --model-id "ollama/llama3.2:3b" --message "tell me a joke"
+
 ```
 Sample output:
 ```python
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content="Here's one:\n\nWhat do you call a fake noodle?\n\nAn impasta!",
-        role="assistant",
-        stop_reason="end_of_turn",
-        tool_calls=[],
-    ),
-    logprobs=None,
-    metrics=[
-        Metric(metric="prompt_tokens", value=14.0, unit=None),
-        Metric(metric="completion_tokens", value=27.0, unit=None),
-        Metric(metric="total_tokens", value=41.0, unit=None),
+OpenAIChatCompletion(
+    id="chatcmpl-08d7b2be-40f3-47ed-8f16-a6f29f2436af",
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason="stop",
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role="assistant",
+                content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired.",
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None,
+            ),
+            logprobs=None,
+        )
    ],
+    created=1751725254,
+    model="llama3.2:3b",
+    object="chat.completion",
+    service_tier=None,
+    system_fingerprint="fp_ollama",
+    usage={
+        "completion_tokens": 18,
+        "prompt_tokens": 29,
+        "total_tokens": 47,
+        "completion_tokens_details": None,
+        "prompt_tokens_details": None,
+    },
 )
 ```

-## Step 4: Run the Demos
+### Step 4: Run the Demos

 Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
 Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
@ -221,7 +242,7 @@ Other SDKs are also available, please refer to the [Client SDK](../index.md#clie
 :::{tab-item} Basic Inference
 Now you can run inference using the Llama Stack client SDK.

-### i. Create the Script
+#### i. Create the Script

 Create a file `inference.py` and add the following code:
 ```python
@ -233,40 +254,36 @@ client = LlamaStackClient(base_url="http://localhost:8321")
 models = client.models.list()

 # Select the first LLM
-llm = next(m for m in models if m.model_type == "llm")
+llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
 model_id = llm.identifier

 print("Model:", model_id)

-response = client.inference.chat_completion(
-    model_id=model_id,
+response = client.chat.completions.create(
+    model=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about coding"},
    ],
 )
-print(response.completion_message.content)
+print(response)
 ```

-### ii. Run the Script
+#### ii. Run the Script
 Let's run the script using `uv`
 ```bash
 uv run python inference.py
 ```
 Which will output:
 ```
-Model: llama3.2:3b
-Here is a haiku about coding:
-
-Lines of code unfold
-Logic flows through digital night
-Beauty in the bits
+Model: ollama/llama3.2:3b
+OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices=[OpenAIChatCompletionChoice(finish_reason='stop', index=0, message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(role='assistant', content="Lines of code unfold\nAlgorithms dance with ease\nLogic's gentle kiss", name=None, tool_calls=None, refusal=None, annotations=None, audio=None, function_call=None), logprobs=None)], created=1751732480, model='llama3.2:3b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage={'completion_tokens': 16, 'prompt_tokens': 37, 'total_tokens': 53, 'completion_tokens_details': None, 'prompt_tokens_details': None})
 ```
 :::

 :::{tab-item} Build a Simple Agent
 Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
-### i. Create the Script
+#### i. Create the Script
 Create a file `agent.py` and add the following code:

 ```python
@ -278,7 +295,7 @@ import uuid
 client = LlamaStackClient(base_url=f"http://localhost:8321")

 models = client.models.list()
-llm = next(m for m in models if m.model_type == "llm")
+llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
 model_id = llm.identifier

 agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
@ -315,19 +332,20 @@ uv run python agent.py

 ```{dropdown} 👋 Click here to see the sample output
    Non-streaming ...
-    agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I'm here to provide information, answer questions, and help with tasks to the best of my abilities.
+    agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I can provide information, answer questions, and help with tasks to the best of my abilities.

-    I can be used for a wide range of purposes, such as:
+    I'm a large language model, which means I've been trained on a massive dataset of text from various sources, allowing me to understand and respond to a wide range of topics and questions. My purpose is to provide helpful and accurate information, and I'm constantly learning and improving my responses based on the interactions I have with users like you.

+    I can help with:
+
+    * Answering questions on various subjects
    * Providing definitions and explanations
    * Offering suggestions and ideas
-    * Helping with language translation
-    * Assisting with writing and proofreading
-    * Generating text or responses to questions
-    * Playing simple games or chatting about topics of interest
-
-    I'm constantly learning and improving my abilities, so feel free to ask me anything, and I'll do my best to help!
+    * Assisting with language-related tasks, such as proofreading and editing
+    * Generating text and content
+    * And more!

+    Feel free to ask me anything, and I'll do my best to help!
    Streaming ...
    AgentTurnResponseStreamChunk(
    │   event=TurnResponseEvent(
@ -421,15 +439,15 @@ uv run python agent.py


    Streaming with print helper...
-    inference> Déjà vu!
+    inference> Déjà vu! You're asking me again!

-    As I mentioned earlier, I'm an artificial intelligence language model. I don't have a personal identity or consciousness like humans do. I exist solely to process and respond to text-based inputs, providing information and assistance on a wide range of topics.
+    As I mentioned earlier, I'm a computer program designed to simulate conversation and answer questions. I don't have a personal identity or consciousness like a human would. I exist solely as a digital entity, running on computer servers and responding to inputs from users like you.

-    I'm a computer program designed to simulate human-like conversations, using natural language processing (NLP) and machine learning algorithms to understand and generate responses. My purpose is to help users like you with their questions, provide information, and engage in conversation.
+    I'm a type of artificial intelligence (AI) called a large language model, which means I've been trained on a massive dataset of text from various sources. This training allows me to understand and respond to a wide range of questions and topics.

-    Think of me as a virtual companion, a helpful tool designed to make your interactions more efficient and enjoyable. I don't have personal opinions, emotions, or biases, but I'm here to provide accurate and informative responses to the best of my abilities.
+    My purpose is to provide helpful and accurate information, answer questions, and assist users like you with tasks and conversations. I don't have personal preferences, emotions, or opinions like humans do. My goal is to be informative, neutral, and respectful in my responses.

-    So, who am I? I'm just a computer program designed to help you!
+    So, that's me in a nutshell!
 ```
 :::

@ -437,7 +455,7 @@ uv run python agent.py

 For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
 in a vector database.
-### i. Create the Script
+#### i. Create the Script
 Create a file `rag_agent.py` and add the following code:

 ```python
@ -483,7 +501,11 @@ client.tool_runtime.rag_tool.insert(
 )

 # Get the model being served
-llm = next(m for m in client.models.list() if m.model_type == "llm")
+llm = next(
+    m
+    for m in client.models.list()
+    if m.model_type == "llm" and m.provider_id == "ollama"
+)
 model = llm.identifier

 # Create the RAG agent
@ -511,7 +533,7 @@ for t in turns:
    for event in AgentEventLogger().log(stream):
        event.print()
 ```
-### ii. Run the Script
+#### ii. Run the Script
 Let's run the script using `uv`
 ```bash
 uv run python rag_agent.py
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -1,123 +1,13 @@
-# Quickstart
+# Getting Started

-Get started with Llama Stack in minutes!
-
-Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
-environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
-
-In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
-as the inference [provider](../providers/inference/index) for a Llama Model.
-
-**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
-
-#### Step 1: Install and setup
-1. Install [uv](https://docs.astral.sh/uv/)
-2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
-```bash
-ollama run llama3.2:3b --keepalive 60m
+```{include} quickstart.md
+:start-after: ## Quickstart
 ```
-#### Step 2: Run the Llama Stack server
-We will use `uv` to run the Llama Stack server.
-```bash
-INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
+
+```{include} libraries.md
+:start-after: ## Libraries (SDKs)
 ```
-#### Step 3: Run the demo
-Now open up a new terminal and copy the following script into a file named `demo_script.py`.

-```python
-from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
-
-vector_db_id = "my_demo_vector_db"
-client = LlamaStackClient(base_url="http://localhost:8321")
-
-models = client.models.list()
-
-# Select the first LLM and first embedding models
-model_id = next(m for m in models if m.model_type == "llm").identifier
-embedding_model_id = (
-    em := next(m for m in models if m.model_type == "embedding")
-).identifier
-embedding_dimension = em.metadata["embedding_dimension"]
-
-_ = client.vector_dbs.register(
-    vector_db_id=vector_db_id,
-    embedding_model=embedding_model_id,
-    embedding_dimension=embedding_dimension,
-    provider_id="faiss",
-)
-source = "https://www.paulgraham.com/greatwork.html"
-print("rag_tool> Ingesting document:", source)
-document = RAGDocument(
-    document_id="document_1",
-    content=source,
-    mime_type="text/html",
-    metadata={},
-)
-client.tool_runtime.rag_tool.insert(
-    documents=[document],
-    vector_db_id=vector_db_id,
-    chunk_size_in_tokens=50,
-)
-agent = Agent(
-    client,
-    model=model_id,
-    instructions="You are a helpful assistant",
-    tools=[
-        {
-            "name": "builtin::rag/knowledge_search",
-            "args": {"vector_db_ids": [vector_db_id]},
-        }
-    ],
-)
-
-prompt = "How do you do great work?"
-print("prompt>", prompt)
-
-response = agent.create_turn(
-    messages=[{"role": "user", "content": prompt}],
-    session_id=agent.create_session("rag_session"),
-    stream=True,
-)
-
-for log in AgentEventLogger().log(response):
-    log.print()
+```{include} detailed_tutorial.md
+:start-after: ## Detailed Tutorial
 ```
-We will use `uv` to run the script
-```
-uv run --with llama-stack-client,fire,requests demo_script.py
-```
-And you should see output like below.
-```
-rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
-
-prompt> How do you do great work?
-
-inference> [knowledge_search(query="What is the key to doing great work")]
-
-tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
-
-tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
-
-inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
-
-To further clarify, I would suggest that doing great work involves:
-
-* Completing tasks with high quality and attention to detail
-* Expanding on existing knowledge or ideas
-* Making a positive impact on others through your work
-* Striving for excellence and continuous improvement
-
-Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
-```
-Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
-
-## Next Steps
-
-Now you're ready to dive deeper into Llama Stack!
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
- Learn about Llama Stack [Concepts](../concepts/index.md).
- Discover how to [Build Llama Stacks](../distributions/index.md).
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
--- a/docs/source/getting_started/libraries.md
+++ b/docs/source/getting_started/libraries.md
@ -0,0 +1,10 @@
+## Libraries (SDKs)
+
+We have a number of client-side SDKs available for different languages.
+
+|  **Language** |  **Client SDK** | **Package** |
+| :----: | :----: | :----: |
+| Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
+| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
+| Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
+| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@ -0,0 +1,129 @@
+## Quickstart
+
+Get started with Llama Stack in minutes!
+
+Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
+environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
+
+In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
+as the inference [provider](../providers/inference/index) for a Llama Model.
+
+**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
+
+#### Step 1: Install and setup
+1. Install [uv](https://docs.astral.sh/uv/)
+2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
+```bash
+ollama run llama3.2:3b --keepalive 60m
+```
+#### Step 2: Run the Llama Stack server
+We will use `uv` to run the Llama Stack server.
+```bash
+ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
+```
+#### Step 3: Run the demo
+Now open up a new terminal and copy the following script into a file named `demo_script.py`.
+
+```python
+from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
+
+vector_db_id = "my_demo_vector_db"
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+models = client.models.list()
+
+# Select the first LLM and first embedding models
+model_id = next(m for m in models if m.model_type == "llm").identifier
+embedding_model_id = (
+    em := next(m for m in models if m.model_type == "embedding")
+).identifier
+embedding_dimension = em.metadata["embedding_dimension"]
+
+_ = client.vector_dbs.register(
+    vector_db_id=vector_db_id,
+    embedding_model=embedding_model_id,
+    embedding_dimension=embedding_dimension,
+    provider_id="faiss",
+)
+source = "https://www.paulgraham.com/greatwork.html"
+print("rag_tool> Ingesting document:", source)
+document = RAGDocument(
+    document_id="document_1",
+    content=source,
+    mime_type="text/html",
+    metadata={},
+)
+client.tool_runtime.rag_tool.insert(
+    documents=[document],
+    vector_db_id=vector_db_id,
+    chunk_size_in_tokens=50,
+)
+agent = Agent(
+    client,
+    model=model_id,
+    instructions="You are a helpful assistant",
+    tools=[
+        {
+            "name": "builtin::rag/knowledge_search",
+            "args": {"vector_db_ids": [vector_db_id]},
+        }
+    ],
+)
+
+prompt = "How do you do great work?"
+print("prompt>", prompt)
+
+response = agent.create_turn(
+    messages=[{"role": "user", "content": prompt}],
+    session_id=agent.create_session("rag_session"),
+    stream=True,
+)
+
+for log in AgentEventLogger().log(response):
+    log.print()
+```
+We will use `uv` to run the script:
+```bash
+uv run --with llama-stack-client,fire,requests demo_script.py
+```
+And you should see output like below.
+```
+rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
+
+prompt> How do you do great work?
+
+inference> [knowledge_search(query="What is the key to doing great work")]
+
+tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
+
+tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
+
+inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
+
+To further clarify, I would suggest that doing great work involves:
+
+* Completing tasks with high quality and attention to detail
+* Expanding on existing knowledge or ideas
+* Making a positive impact on others through your work
+* Striving for excellence and continuous improvement
+
+Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
+```
+Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
+
+```{admonition} HuggingFace access
+:class: tip
+
+If you are getting a **401 Client Error** from HuggingFace for the **all-MiniLM-L6-v2** model, try setting **HF_TOKEN** to a valid HuggingFace token in your environment.
+```
+
+### Next Steps
+
+Now you're ready to dive deeper into Llama Stack!
+- Explore the [Detailed Tutorial](./detailed_tutorial.md).
+- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
+- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
+- Learn about Llama Stack [Concepts](../concepts/index.md).
+- Discover how to [Build Llama Stacks](../distributions/index.md).
+- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
+- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -40,17 +40,6 @@ Kotlin.
 - Ready to build? Check out the [Quick Start](getting_started/index) to get started.
 - Want to contribute? See the [Contributing](contributing/index) guide.

-## Client SDKs
-
-We have a number of client-side SDKs available for different languages.
-
-|  **Language** |  **Client SDK** | **Package** |
-| :----: | :----: | :----: |
-| Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
-| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
-| Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
-| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
-
 ## Supported Llama Stack Implementations

 A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
@ -133,14 +122,12 @@ A number of "adapters" are available for some popular Inference and Vector Store

 self
 getting_started/index
-getting_started/detailed_tutorial
-introduction/index
 concepts/index
-openai/index
 providers/index
 distributions/index
+advanced_apis/index
 building_applications/index
-playground/index
+deploying/index
 contributing/index
 references/index
 ```
--- a/docs/source/providers/index.md
+++ b/docs/source/providers/index.md
@ -1,4 +1,4 @@
-# Providers Overview
+# API Providers Overview

 The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
 - LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),
@ -13,13 +13,25 @@ Providers come in two flavors:
 Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.

 ## External Providers
-
 Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.

 ```{toctree}
 :maxdepth: 1

-external
+external.md
+```
+
+```{include} openai.md
+:start-after: ## OpenAI API Compatibility
+```
+
+## Inference
+Runs inference with an LLM.
+
+```{toctree}
+:maxdepth: 1
+
+inference/index
 ```

 ## Agents
@ -40,33 +52,6 @@ Interfaces with datasets and data loaders.
 datasetio/index
 ```

-## Eval
-Generates outputs (via Inference or Agents) and perform scoring.
-
-```{toctree}
-:maxdepth: 1
-
-eval/index
-```
-
-## Inference
-Runs inference with an LLM.
-
-```{toctree}
-:maxdepth: 1
-
-inference/index
-```
-
-## Post Training
-Fine-tunes a model.
-
-```{toctree}
-:maxdepth: 1
-
-post_training/index
-```
-
 ## Safety
 Applies safety policies to the output at a Systems (not only model) level.

@ -76,15 +61,6 @@ Applies safety policies to the output at a Systems (not only model) level.
 safety/index
 ```

-## Scoring
-Evaluates the outputs of the system.
-
-```{toctree}
-:maxdepth: 1
-
-scoring/index
-```
-
 ## Telemetry
 Collects telemetry data from the system.

@ -94,15 +70,6 @@ Collects telemetry data from the system.
 telemetry/index
 ```

-## Tool Runtime
-Is associated with the ToolGroup resouces.
-
-```{toctree}
-:maxdepth: 1
-
-tool_runtime/index
-```
-
 ## Vector IO

 Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
@ -114,3 +81,12 @@ io and database are used to store and retrieve documents for retrieval.

 vector_io/index
 ```
+
+## Tool Runtime
+Is associated with the ToolGroup resources.
+
+```{toctree}
+:maxdepth: 1
+
+tool_runtime/index
+```
--- a/docs/source/providers/openai.md
+++ b/docs/source/providers/openai.md
@ -1,14 +1,14 @@
-# OpenAI API Compatibility
+## OpenAI API Compatibility

-## Server path
+### Server path

 Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.

-## Clients
+### Clients

 You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.

-### Llama Stack Client
+#### Llama Stack Client

 When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.

@ -18,7 +18,7 @@ from llama_stack_client import LlamaStackClient
 client = LlamaStackClient(base_url="http://localhost:8321")
 ```

-### OpenAI Client
+#### OpenAI Client

 When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.

@ -30,9 +30,9 @@ client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

 Regardless of the client you choose, the following code examples should all work the same.

-## APIs implemented
+### APIs implemented

-### Models
+#### Models

 Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:

@ -40,13 +40,13 @@ Many of the APIs require you to pass in a model parameter. To see the list of mo
 models = client.models.list()
 ```

-### Responses
+#### Responses

 :::{note}
 The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
 :::

-#### Simple inference
+##### Simple inference

 Request:

@ -66,7 +66,7 @@ Syntax whispers secrets sweet
 Code's gentle silence
 ```

-#### Structured Output
+##### Structured Output

 Request:

@ -106,9 +106,9 @@ Example output:
 { "participants": ["Alice", "Bob"] }
 ```

-### Chat Completions
+#### Chat Completions

-#### Simple inference
+##### Simple inference

 Request:

@ -129,7 +129,7 @@ Logic flows like a river
 Code's gentle beauty
 ```

-#### Structured Output
+##### Structured Output

 Request:

@ -170,9 +170,9 @@ Example output:
 { "participants": ["Alice", "Bob"] }
 ```

-### Completions
+#### Completions

-#### Simple inference
+##### Simple inference

 Request:

--- a/docs/source/providers/vector_io/inline_milvus.md
+++ b/docs/source/providers/vector_io/inline_milvus.md
@ -11,7 +11,8 @@ Please refer to the remote provider documentation.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `db_path` | `<class 'str'>` | No | PydanticUndefined |  |
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
+| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |

 ## Sample Configuration

--- a/docs/source/providers/vector_io/inline_sqlite-vec.md
+++ b/docs/source/providers/vector_io/inline_sqlite-vec.md
@ -205,12 +205,16 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | PydanticUndefined |  |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration

 ```yaml
 db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db

 ```

--- a/docs/source/providers/vector_io/inline_sqlite_vec.md
+++ b/docs/source/providers/vector_io/inline_sqlite_vec.md
@ -10,12 +10,16 @@ Please refer to the sqlite-vec provider documentation.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | PydanticUndefined |  |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration

 ```yaml
 db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db

 ```

--- a/docs/source/providers/vector_io/remote_milvus.md
+++ b/docs/source/providers/vector_io/remote_milvus.md
@ -114,6 +114,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 | `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
 | `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
 | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
 | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

 > **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
@ -123,6 +124,9 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 ```yaml
 uri: ${env.MILVUS_ENDPOINT}
 token: ${env.MILVUS_TOKEN}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db

 ```

--- a/docs/source/providers/vector_io/remote_pgvector.md
+++ b/docs/source/providers/vector_io/remote_pgvector.md
@ -40,6 +40,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
 | `db` | `str \| None` | No | postgres |  |
 | `user` | `str \| None` | No | postgres |  |
 | `password` | `str \| None` | No | mysecretpassword |  |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration

@ -49,6 +50,9 @@ port: ${env.PGVECTOR_PORT:=5432}
 db: ${env.PGVECTOR_DB}
 user: ${env.PGVECTOR_USER}
 password: ${env.PGVECTOR_PASSWORD}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db

 ```

--- a/docs/source/providers/vector_io/remote_weaviate.md
+++ b/docs/source/providers/vector_io/remote_weaviate.md
@ -36,7 +36,9 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
 ## Sample Configuration

 ```yaml
-{}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db

 ```

--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@ -9,7 +9,8 @@ The `llama-stack-client` CLI allows you to query information about the distribut
 llama-stack-client
 Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...

-  Welcome to the LlamaStackClient CLI
+  Welcome to the llama-stack-client CLI - a command-line interface for
+  interacting with Llama Stack

 Options:
  --version        Show the version and exit.
@ -35,6 +36,7 @@ Commands:
 ```

 ### `llama-stack-client configure`
+Configure Llama Stack Client CLI.
 ```bash
 llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
@ -42,7 +44,24 @@ llama-stack-client configure
 Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```

+Optional arguments:
+- `--endpoint`: Llama Stack distribution endpoint
+- `--api-key`: Llama Stack distribution API key
+
+
+
+### `llama-stack-client inspect version`
+Inspect server configuration.
+```bash
+llama-stack-client inspect version
+```
+```bash
+VersionInfo(version='0.2.14')
+```
+
+
 ### `llama-stack-client providers list`
+Show available providers on distribution endpoint
 ```bash
 llama-stack-client providers list
 ```
@ -66,9 +85,74 @@ llama-stack-client providers list
 +-----------+----------------+-----------------+
 ```

+### `llama-stack-client providers inspect`
+Show specific provider configuration on distribution endpoint
+```bash
+llama-stack-client providers inspect <provider_id>
+```
+
+
+## Inference
+Inference (chat).
+
+
+### `llama-stack-client inference chat-completion`
+Show available inference chat completion endpoints on distribution endpoint
+```bash
+llama-stack-client inference chat-completion --message <message> [--stream] [--session] [--model-id]
+```
+```bash
+OpenAIChatCompletion(
+    id='chatcmpl-aacd11f3-8899-4ec5-ac5b-e655132f6891',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='The captain of the whaleship Pequod in Nathaniel Hawthorne\'s novel "Moby-Dick" is Captain
+Ahab. He\'s a vengeful and obsessive old sailor who\'s determined to hunt down and kill the white sperm whale
+Moby-Dick, whom he\'s lost his leg to in a previous encounter.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1752578797,
+    model='llama3.2:3b-instruct-fp16',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 67,
+        'prompt_tokens': 33,
+        'total_tokens': 100,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    }
+)
+```
+
+Required arguments:
+**Note:** At least one of these parameters is required for chat completion
+- `--message`: Message
+- `--session`: Start a Chat Session
+
+Optional arguments:
+- `--stream`: Stream
+- `--model-id`: Model ID
+
 ## Model Management
+Manage GenAI models.
+

 ### `llama-stack-client models list`
+Show available llama models at distribution endpoint
 ```bash
 llama-stack-client models list
 ```
@ -85,6 +169,7 @@ Total models: 1
 ```

 ### `llama-stack-client models get`
+Show details of a specific model at the distribution endpoint
 ```bash
 llama-stack-client models get Llama3.1-8B-Instruct
 ```
@ -105,69 +190,92 @@ Model RandomModel is not found at distribution endpoint host:port. Please ensure
 ```

 ### `llama-stack-client models register`
-
+Register a new model at distribution endpoint
 ```bash
-llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
+llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>] [--model-type <model_type>]
 ```

-### `llama-stack-client models update`
+Required arguments:
+- `MODEL_ID`: Model ID
+- `--provider-id`: Provider ID for the model

+Optional arguments:
+- `--provider-model-id`: Provider's model ID
+- `--metadata`: JSON metadata for the model
+- `--model-type`: Model type: `llm`, `embedding`
+
+
+### `llama-stack-client models unregister`
+Unregister a model from distribution endpoint
 ```bash
-llama-stack-client models update <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
-```
-
-### `llama-stack-client models delete`
-
-```bash
-llama-stack-client models delete <model_id>
+llama-stack-client models unregister <model_id>
 ```

 ## Vector DB Management
+Manage vector databases.
+

 ### `llama-stack-client vector_dbs list`
+Show available vector dbs on distribution endpoint
 ```bash
 llama-stack-client vector_dbs list
 ```
 ```
-+--------------+----------------+---------------------+---------------+------------------------+
-| identifier   | provider_id    | provider_resource_id| vector_db_type| params                |
-+==============+================+=====================+===============+========================+
-| test_bank    | meta-reference | test_bank          | vector        | embedding_model: all-MiniLM-L6-v2
-                                                                      embedding_dimension: 384|
-+--------------+----------------+---------------------+---------------+------------------------+
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ identifier               ┃ provider_id ┃ provider_resource_id     ┃ vector_db_type ┃ params                            ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ my_demo_vector_db        │ faiss       │ my_demo_vector_db        │                │ embedding_dimension: 384          │
+│                          │             │                          │                │ embedding_model: all-MiniLM-L6-v2 │
+│                          │             │                          │                │ type: vector_db                   │
+│                          │             │                          │                │                                   │
+└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
 ```

 ### `llama-stack-client vector_dbs register`
+Create a new vector db
 ```bash
 llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
 ```

+
+Required arguments:
+- `VECTOR_DB_ID`: Vector DB ID
+
 Optional arguments:
 - `--provider-id`: Provider ID for the vector db
 - `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
+- `--embedding-model`: Embedding model to use. Default: `all-MiniLM-L6-v2`
 - `--embedding-dimension`: Dimension of embeddings. Default: 384

 ### `llama-stack-client vector_dbs unregister`
+Delete a vector db
 ```bash
 llama-stack-client vector_dbs unregister <vector-db-id>
 ```

+
+Required arguments:
+- `VECTOR_DB_ID`: Vector DB ID
+
+
 ## Shield Management
+Manage safety shield services.
 ### `llama-stack-client shields list`
+Show available safety shields on distribution endpoint
 ```bash
 llama-stack-client shields list
 ```

 ```
-+--------------+----------+----------------+-------------+
-| identifier   | params   | provider_id    | type        |
-+==============+==========+================+=============+
-| llama_guard  | {}       | meta-reference | llama_guard |
-+--------------+----------+----------------+-------------+
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ identifier                       ┃ provider_alias                                                        ┃ params                ┃ provider_id                        ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ ollama                           │ ollama/llama-guard3:1b                                                │                       │ llama-guard                        │
+└──────────────────────────────────┴───────────────────────────────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────┘
 ```

 ### `llama-stack-client shields register`
+Register a new safety shield
 ```bash
 llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
 ```
@ -180,41 +288,29 @@ Optional arguments:
 - `--provider-shield-id`: Provider's shield ID
 - `--params`: JSON configuration parameters for the shield

-## Eval Task Management
-
-### `llama-stack-client benchmarks list`
-```bash
-llama-stack-client benchmarks list
-```
-
-### `llama-stack-client benchmarks register`
-```bash
-llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
-```
-
-Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: One or more scoring functions to use for evaluation
-
-Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
- `--metadata`: Metadata for the eval task in JSON format

 ## Eval execution
+Run evaluation tasks.
+
+
 ### `llama-stack-client eval run-benchmark`
+Run an evaluation benchmark task
 ```bash
-llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> --model-id <model-id> [--num-examples <num>] [--visualize] [--repeat-penalty <repeat-penalty>] [--top-p <top-p>] [--max-tokens <max-tokens>] [--temperature <temperature>]
 ```

 Required arguments:
 - `--eval-task-config`: Path to the eval task config file in JSON format
 - `--output-dir`: Path to the directory where evaluation results will be saved
+- `--model-id`: model id to run the benchmark eval on

 Optional arguments:
 - `--num-examples`: Number of examples to evaluate (useful for debugging)
 - `--visualize`: If set, visualizes evaluation results after completion
+- `--repeat-penalty`: repeat-penalty in the sampling params to run generation
+- `--top-p`: top-p in the sampling params to run generation
+- `--max-tokens`: max-tokens in the sampling params to run generation
+- `--temperature`: temperature in the sampling params to run generation

 Example benchmark_config.json:
 ```json
@ -231,21 +327,55 @@ Example benchmark_config.json:
 ```

 ### `llama-stack-client eval run-scoring`
+Run scoring from application datasets
 ```bash
-llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
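+llama-stack-client eval run-scoring <eval-task-id> --output-dir <output-dir> [--num-examples <num>] [--visualize] [--scoring-params-config <scoring-params-config>] [--dataset-id <dataset-id>] [--dataset-path <dataset-path>]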
 ```

 Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
 - `--output-dir`: Path to the directory where scoring results will be saved

 Optional arguments:
 - `--num-examples`: Number of examples to evaluate (useful for debugging)
 - `--visualize`: If set, visualizes scoring results after completion
+- `--scoring-params-config`: Path to the scoring params config file in JSON format
+- `--dataset-id`: Pre-registered dataset_id to score (from llama-stack-client datasets list)
+- `--dataset-path`: Path to the dataset file to score
+
+
+## Eval Tasks
+Manage evaluation tasks.
+
+### `llama-stack-client eval_tasks list`
+Show available eval tasks on distribution endpoint
+```bash
+llama-stack-client eval_tasks list
+```
+
+
+### `llama-stack-client eval_tasks register`
+Register a new eval task
+```bash
+llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-functions> [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
+```
+
+
+Required arguments:
+- `--eval-task-id`: ID of the eval task
+- `--dataset-id`: ID of the dataset to evaluate
+- `--scoring-functions`: Scoring functions to use for evaluation
+
+Optional arguments:
+- `--provider-id`: Provider ID for the eval task
+- `--provider-eval-task-id`: Provider's eval task ID
+

 ## Tool Group Management
+Manage available tool groups.
+

 ### `llama-stack-client toolgroups list`
+Show available llama toolgroups at distribution endpoint
 ```bash
 llama-stack-client toolgroups list
 ```
@ -260,17 +390,28 @@ llama-stack-client toolgroups list
 ```

 ### `llama-stack-client toolgroups get`
+Get available llama toolgroups by id
 ```bash
 llama-stack-client toolgroups get <toolgroup_id>
 ```

 Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.

+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
+
 ### `llama-stack-client toolgroups register`
+Register a new toolgroup at distribution endpoint
 ```bash
 llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
 ```

+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
 Optional arguments:
 - `--provider-id`: Provider ID for the toolgroup
 - `--provider-toolgroup-id`: Provider's toolgroup ID
@ -278,6 +419,172 @@ Optional arguments:
 - `--args`: JSON arguments for the toolgroup

 ### `llama-stack-client toolgroups unregister`
+Unregister a toolgroup from distribution endpoint
 ```bash
 llama-stack-client toolgroups unregister <toolgroup_id>
 ```
+
+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
+
+## Datasets Management
+Manage datasets.
+
+
+### `llama-stack-client datasets list`
+Show available datasets on distribution endpoint
+```bash
+llama-stack-client datasets list
+```
+
+
+### `llama-stack-client datasets register`
+```bash
+llama-stack-client datasets register --dataset_id <dataset_id> --purpose <purpose> [--url <url>] [--dataset-path <dataset-path>] [--dataset-id <dataset-id>] [--metadata <metadata>]
+```
+
+Required arguments:
+- `--dataset_id`: Id of the dataset
+- `--purpose`: Purpose of the dataset
+
+Optional arguments:
+- `--metadata`: Metadata of the dataset
+- `--url`: URL of the dataset
+- `--dataset-path`: Local file path to the dataset. If specified, upload dataset via URL
+
+
+### `llama-stack-client datasets unregister`
+Remove a dataset
+```bash
+llama-stack-client datasets unregister <dataset-id>
+```
+
+
+Required arguments:
+- `DATASET_ID`: Id of the dataset
+
+
+## Scoring Functions Management
+Manage scoring functions.
+
+### `llama-stack-client scoring_functions list`
+Show available scoring functions on distribution endpoint
+```bash
+llama-stack-client scoring_functions list
+```
+```
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
+┃ identifier                                 ┃ provider_id  ┃ description                                                   ┃ type             ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
+│ basic::bfcl                                │ basic        │ BFCL complex scoring                                          │ scoring_function │
+│ basic::docvqa                              │ basic        │ DocVQA Visual Question & Answer scoring function              │ scoring_function │
+│ basic::equality                            │ basic        │ Returns 1.0 if the input is equal to the target, 0.0          │ scoring_function │
+│                                            │              │ otherwise.                                                    │                  │
+└────────────────────────────────────────────┴──────────────┴───────────────────────────────────────────────────────────────┴──────────────────┘
+```
+
+
+### `llama-stack-client scoring_functions register`
+Register a new scoring function
+```bash
+llama-stack-client scoring_functions register --scoring-fn-id <scoring-fn-id> --description <description> --return-type <return-type> [--provider-id <provider-id>] [--provider-scoring-fn-id <provider-scoring-fn-id>] [--params <params>]
+```
+
+
+Required arguments:
+- `--scoring-fn-id`: Id of the scoring function
+- `--description`: Description of the scoring function
+- `--return-type`: Return type of the scoring function
+
+Optional arguments:
+- `--provider-id`: Provider ID for the scoring function
+- `--provider-scoring-fn-id`: Provider's scoring function ID
+- `--params`: Parameters for the scoring function in JSON format
+
+
+## Post Training Management
+Post-training.
+
+### `llama-stack-client post_training list`
+Show the list of available post training jobs
+```bash
+llama-stack-client post_training list
+```
+```bash
+["job-1", "job-2", "job-3"]
+```
+
+
+### `llama-stack-client post_training artifacts`
+Get the training artifacts of a specific post training job
+```bash
+llama-stack-client post_training artifacts --job-uuid <job-uuid>
+```
+```bash
+JobArtifactsResponse(checkpoints=[], job_uuid='job-1')
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+
+
+### `llama-stack-client post_training supervised_fine_tune`
+Kick off a supervised fine tune job
+```bash
+llama-stack-client post_training supervised_fine_tune --job-uuid <job-uuid> --model <model> --algorithm-config <algorithm-config> --training-config <training-config> [--checkpoint-dir <checkpoint-dir>]
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+- `--model`: Model ID
+- `--algorithm-config`: Algorithm Config
+- `--training-config`: Training Config
+
+Optional arguments:
+- `--checkpoint-dir`: Checkpoint Config
+
+
+### `llama-stack-client post_training status`
+Show the status of a specific post training job
+```bash
+llama-stack-client post_training status --job-uuid <job-uuid>
+```
+```bash
+JobStatusResponse(
+    checkpoints=[],
+    job_uuid='job-1',
+    status='completed',
+    completed_at="",
+    resources_allocated="",
+    scheduled_at="",
+    started_at=""
+)
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+
+
+### `llama-stack-client post_training cancel`
+Cancel the training job
+```bash
+llama-stack-client post_training cancel --job-uuid <job-uuid>
+```
+```bash
+# This functionality is not yet implemented for llama-stack-client
+╭────────────────────────────────────────────────────────────╮
+│ Failed to post_training cancel_training_job                │
+│                                                            │
+│ Error Type: InternalServerError                            │
+│ Details: Error code: 501 - {'detail': 'Not implemented: '} │
+╰────────────────────────────────────────────────────────────╯
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
--- a/llama_stack/apis/common/training_types.py
+++ b/llama_stack/apis/common/training_types.py
@ -19,8 +19,10 @@ class PostTrainingMetric(BaseModel):
    perplexity: float


-@json_schema_type(schema={"description": "Checkpoint created during training runs"})
+@json_schema_type
 class Checkpoint(BaseModel):
+    """Checkpoint created during training runs"""
+
    identifier: str
    created_at: datetime
    epoch: int
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@ -7,7 +7,7 @@
 from enum import StrEnum
 from typing import Any, Literal, Protocol, runtime_checkable

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator

 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -36,13 +36,21 @@ class Model(CommonModelFields, Resource):
        return self.identifier

    @property
-    def provider_model_id(self) -> str | None:
+    def provider_model_id(self) -> str:
+        assert self.provider_resource_id is not None, "Provider resource ID must be set"
        return self.provider_resource_id

    model_config = ConfigDict(protected_namespaces=())

    model_type: ModelType = Field(default=ModelType.llm)

+    @field_validator("provider_resource_id")
+    @classmethod
+    def validate_provider_resource_id(cls, v):
+        if v is None:
+            raise ValueError("provider_resource_id cannot be None")
+        return v
+

 class ModelInput(CommonModelFields):
    model_id: str
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@ -87,6 +87,20 @@ class RAGQueryGenerator(Enum):
    custom = "custom"


+@json_schema_type
+class RAGSearchMode(Enum):
+    """
+    Search modes for RAG query retrieval:
+    - VECTOR: Uses vector similarity search for semantic matching
+    - KEYWORD: Uses keyword-based search for exact matching
+    - HYBRID: Combines both vector and keyword search for better results
+    """
+
+    VECTOR = "vector"
+    KEYWORD = "keyword"
+    HYBRID = "hybrid"
+
+
@json_schema_type
 class DefaultRAGQueryGeneratorConfig(BaseModel):
    type: Literal["default"] = "default"
@ -128,7 +142,7 @@ class RAGQueryConfig(BaseModel):
    max_tokens_in_context: int = 4096
    max_chunks: int = 5
    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: str | None = None
+    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode

    @field_validator("chunk_template")
--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@ -19,6 +19,7 @@ class VectorDB(Resource):

    embedding_model: str
    embedding_dimension: int
+    vector_db_name: str | None = None

    @property
    def vector_db_id(self) -> str:
@ -70,6 +71,7 @@ class VectorDBs(Protocol):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> VectorDB:
        """Register a vector database.
@ -78,6 +80,7 @@ class VectorDBs(Protocol):
        :param embedding_model: The embedding model to use.
        :param embedding_dimension: The dimension of the embedding model.
        :param provider_id: The identifier of the provider.
+        :param vector_db_name: The name of the vector database.
        :param provider_vector_db_id: The identifier of the vector database in the provider.
        :returns: A VectorDB.
        """
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@ -346,7 +346,6 @@ class VectorIO(Protocol):
        embedding_model: str | None = None,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        """Creates a vector store.

@ -358,7 +357,6 @@ class VectorIO(Protocol):
        :param embedding_model: The embedding model to use for this vector store.
        :param embedding_dimension: The dimension of the embedding vectors (default: 384).
        :param provider_id: The ID of the provider to use for this vector store.
-        :param provider_vector_db_id: The provider-specific vector database ID.
        :returns: A VectorStoreObject representing the created vector store.
        """
        ...
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@ -93,7 +93,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
            )
            sys.exit(1)
    elif args.providers:
-        providers = dict()
+        providers_list: dict[str, str | list[str]] = dict()
        for api_provider in args.providers.split(","):
            if "=" not in api_provider:
                cprint(
@ -112,7 +112,15 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                )
                sys.exit(1)
            if provider in providers_for_api:
-                providers.setdefault(api, []).append(provider)
+                if api not in providers_list:
+                    providers_list[api] = []
+                # Use type guarding to ensure we have a list
+                provider_value = providers_list[api]
+                if isinstance(provider_value, list):
+                    provider_value.append(provider)
+                else:
+                    # Convert string to list and append
+                    providers_list[api] = [provider_value, provider]
            else:
                cprint(
                    f"{provider} is not a valid provider for the {api} API.",
@ -121,7 +129,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                )
                sys.exit(1)
        distribution_spec = DistributionSpec(
-            providers=providers,
+            providers=providers_list,
            description=",".join(args.providers),
        )
        if not args.image_type:
@ -182,7 +190,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:

        cprint("Tip: use <TAB> to see options for the providers.\n", color="green", file=sys.stderr)

-        providers = dict()
+        providers: dict[str, str | list[str]] = dict()
        for api, providers_for_api in get_provider_registry().items():
            available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
            if not available_providers:
@ -371,10 +379,16 @@ def _run_stack_build_command_from_build_config(
        if not image_name:
            raise ValueError("Please specify an image name when building a venv image")

+    # At this point, image_name should be guaranteed to be a string
+    if image_name is None:
+        raise ValueError("image_name should not be None after validation")
+
    if template_name:
        build_dir = DISTRIBS_BASE_DIR / template_name
        build_file_path = build_dir / f"{template_name}-build.yaml"
    else:
+        if image_name is None:
+            raise ValueError("image_name cannot be None")
        build_dir = DISTRIBS_BASE_DIR / image_name
        build_file_path = build_dir / f"{image_name}-build.yaml"

@ -395,7 +409,7 @@ def _run_stack_build_command_from_build_config(
        build_file_path,
        image_name,
        template_or_config=template_name or config_path or str(build_file_path),
-        run_config=run_config_file,
+        run_config=run_config_file.as_posix() if run_config_file else None,
    )
    if return_code != 0:
        raise RuntimeError(f"Failed to build image {image_name}")
@ -403,15 +417,16 @@ def _run_stack_build_command_from_build_config(
    if template_name:
        # copy run.yaml from template to build_dir instead of generating it again
        template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
+        run_config_file = build_dir / f"{template_name}-run.yaml"
+
        with importlib.resources.as_file(template_path) as path:
-            run_config_file = build_dir / f"{template_name}-run.yaml"
            shutil.copy(path, run_config_file)

        cprint("Build Successful!", color="green", file=sys.stderr)
-        cprint(f"You can find the newly-built template here: {template_path}", color="blue", file=sys.stderr)
+        cprint(f"You can find the newly-built template here: {run_config_file}", color="blue", file=sys.stderr)
        cprint(
            "You can run the new Llama Stack distro via: "
-            + colored(f"llama stack run {template_path} --image-type {build_config.image_type}", "blue"),
+            + colored(f"llama stack run {run_config_file} --image-type {build_config.image_type}", "blue"),
            color="green",
            file=sys.stderr,
        )
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@ -47,8 +47,7 @@ class StackRun(Subcommand):
        self.parser.add_argument(
            "--image-name",
            type=str,
-            default=os.environ.get("CONDA_DEFAULT_ENV"),
-            help="Name of the image to run. Defaults to the current environment",
+            help="Name of the image to run.",
        )
        self.parser.add_argument(
            "--env",
@ -83,46 +82,57 @@ class StackRun(Subcommand):
            return ImageType.CONDA.value, args.image_name
        return args.image_type, args.image_name

+    def _resolve_config_and_template(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
+        """Resolve config file path and template name from args.config"""
+        from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+
+        if not args.config:
+            return None, None
+
+        config_file = Path(args.config)
+        has_yaml_suffix = args.config.endswith(".yaml")
+        template_name = None
+
+        if not config_file.exists() and not has_yaml_suffix:
+            # check if this is a template
+            config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
+            if config_file.exists():
+                template_name = args.config
+
+        if not config_file.exists() and not has_yaml_suffix:
+            # check if it's a build config saved to ~/.llama dir
+            config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
+
+        if not config_file.exists():
+            self.parser.error(
+                f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
+            )
+
+        if not config_file.is_file():
+            self.parser.error(
+                f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
+            )
+
+        return config_file, template_name
+
    def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
        import yaml

        from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-        from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
        from llama_stack.distribution.utils.exec import formulate_run_args, run_command

        if args.enable_ui:
            self._start_ui_development_server(args.port)
        image_type, image_name = self._get_image_type_and_name(args)

+        # Resolve config file and template name first
+        config_file, template_name = self._resolve_config_and_template(args)
+
        # Check if config is required based on image type
-        if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not args.config:
+        if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not config_file:
            self.parser.error("Config file is required for venv and conda environments")

-        if args.config:
-            config_file = Path(args.config)
-            has_yaml_suffix = args.config.endswith(".yaml")
-            template_name = None
-
-            if not config_file.exists() and not has_yaml_suffix:
-                # check if this is a template
-                config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
-                if config_file.exists():
-                    template_name = args.config
-
-            if not config_file.exists() and not has_yaml_suffix:
-                # check if it's a build config saved to ~/.llama dir
-                config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
-
-            if not config_file.exists():
-                self.parser.error(
-                    f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
-                )
-
-            if not config_file.is_file():
-                self.parser.error(
-                    f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
-                )
-
+        if config_file:
            logger.info(f"Using run configuration: {config_file}")

            try:
@ -138,8 +148,6 @@ class StackRun(Subcommand):
                self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
        else:
            config = None
-            config_file = None
-            template_name = None

        # If neither image type nor image name is provided, assume the server should be run directly
        # using the current environment packages.
@ -155,8 +163,12 @@ class StackRun(Subcommand):
                # func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
                if callable(getattr(args, arg)):
                    continue
-                if arg == "config" and template_name:
-                    server_args.config = str(config_file)
+                if arg == "config":
+                    if template_name:
+                        server_args.template = str(template_name)
+                    else:
+                        # Set the config file path
+                        server_args.config = str(config_file)
                else:
                    setattr(server_args, arg, getattr(args, arg))

--- a/llama_stack/distribution/access_control/access_control.py
+++ b/llama_stack/distribution/access_control/access_control.py
@ -81,7 +81,7 @@ def is_action_allowed(
    if not len(policy):
        policy = default_policy()

-    qualified_resource_id = resource.type + "::" + resource.identifier
+    qualified_resource_id = f"{resource.type}::{resource.identifier}"
    for rule in policy:
        if rule.forbid and matches_scope(rule.forbid, action, qualified_resource_id, user.principal):
            if rule.when:
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -96,7 +96,7 @@ FROM $container_base
 WORKDIR /app

 # We install the Python 3.12 dev headers and build tools so that any
-# C‑extension wheels (e.g. polyleven, faiss‑cpu) can compile successfully.
+# C-extension wheels (e.g. polyleven, faiss-cpu) can compile successfully.

 RUN dnf -y update && dnf install -y iputils git net-tools wget \
    vim-minimal python3.12 python3.12-pip python3.12-wheel \
@ -169,7 +169,7 @@ if [ -n "$run_config" ]; then
    echo "Copying external providers directory: $external_providers_dir"
    cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
    add_to_container << EOF
-COPY --chmod=g+w providers.d /.llama/providers.d
+COPY providers.d /.llama/providers.d
 EOF
    fi

--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@ -17,7 +17,7 @@ from llama_stack.distribution.distribution import (
    builtin_automatically_routed_apis,
    get_provider_registry,
 )
-from llama_stack.distribution.stack import replace_env_vars
+from llama_stack.distribution.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
@ -164,7 +164,8 @@ def upgrade_from_routing_table(
 def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
    version = config_dict.get("version", None)
    if version == LLAMA_STACK_RUN_CONFIG_VERSION:
-        return StackRunConfig(**replace_env_vars(config_dict))
+        processed_config_dict = replace_env_vars(config_dict)
+        return StackRunConfig(**cast_image_name_to_string(processed_config_dict))

    if "routing_table" in config_dict:
        logger.info("Upgrading config...")
@ -175,4 +176,5 @@ def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfi
    if not config_dict.get("external_providers_dir", None):
        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR

-    return StackRunConfig(**replace_env_vars(config_dict))
+    processed_config_dict = replace_env_vars(config_dict)
+    return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@ -6,9 +6,9 @@

 from enum import StrEnum
 from pathlib import Path
-from typing import Annotated, Any
+from typing import Annotated, Any, Literal, Self

-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field, field_validator, model_validator

 from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
 from llama_stack.apis.datasetio import DatasetIO
@ -161,23 +161,113 @@ class LoggingConfig(BaseModel):
    )


+class OAuth2JWKSConfig(BaseModel):
+    # The JWKS URI for collecting public keys
+    uri: str
+    token: str | None = Field(default=None, description="token to authorise access to jwks")
+    key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
+
+
+class OAuth2IntrospectionConfig(BaseModel):
+    url: str
+    client_id: str
+    client_secret: str
+    send_secret_in_body: bool = False
+
+
 class AuthProviderType(StrEnum):
    """Supported authentication provider types."""

    OAUTH2_TOKEN = "oauth2_token"
+    GITHUB_TOKEN = "github_token"
    CUSTOM = "custom"


+class OAuth2TokenAuthConfig(BaseModel):
+    """Configuration for OAuth2 token authentication."""
+
+    type: Literal[AuthProviderType.OAUTH2_TOKEN] = AuthProviderType.OAUTH2_TOKEN
+    audience: str = Field(default="llama-stack")
+    verify_tls: bool = Field(default=True)
+    tls_cafile: Path | None = Field(default=None)
+    issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
+    claims_mapping: dict[str, str] = Field(
+        default_factory=lambda: {
+            "sub": "roles",
+            "username": "roles",
+            "groups": "teams",
+            "team": "teams",
+            "project": "projects",
+            "tenant": "namespaces",
+            "namespace": "namespaces",
+        },
+    )
+    jwks: OAuth2JWKSConfig | None = Field(default=None, description="JWKS configuration")
+    introspection: OAuth2IntrospectionConfig | None = Field(
+        default=None, description="OAuth2 introspection configuration"
+    )
+
+    @classmethod
+    @field_validator("claims_mapping")
+    def validate_claims_mapping(cls, v):
+        for key, value in v.items():
+            if not value:
+                raise ValueError(f"claims_mapping value cannot be empty: {key}")
+        return v
+
+    @model_validator(mode="after")
+    def validate_mode(self) -> Self:
+        if not self.jwks and not self.introspection:
+            raise ValueError("One of jwks or introspection must be configured")
+        if self.jwks and self.introspection:
+            raise ValueError("At present only one of jwks or introspection should be configured")
+        return self
+
+
+class CustomAuthConfig(BaseModel):
+    """Configuration for custom authentication."""
+
+    type: Literal[AuthProviderType.CUSTOM] = AuthProviderType.CUSTOM
+    endpoint: str = Field(
+        ...,
+        description="Custom authentication endpoint URL",
+    )
+
+
+class GitHubTokenAuthConfig(BaseModel):
+    """Configuration for GitHub token authentication."""
+
+    type: Literal[AuthProviderType.GITHUB_TOKEN] = AuthProviderType.GITHUB_TOKEN
+    github_api_base_url: str = Field(
+        default="https://api.github.com",
+        description="Base URL for GitHub API (use https://api.github.com for public GitHub)",
+    )
+    claims_mapping: dict[str, str] = Field(
+        default_factory=lambda: {
+            "login": "roles",
+            "organizations": "teams",
+        },
+        description="Mapping from GitHub user fields to access attributes",
+    )
+
+
+AuthProviderConfig = Annotated[
+    OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig,
+    Field(discriminator="type"),
+]
+
+
 class AuthenticationConfig(BaseModel):
-    provider_type: AuthProviderType = Field(
+    """Top-level authentication configuration."""
+
+    provider_config: AuthProviderConfig = Field(
        ...,
-        description="Type of authentication provider",
+        description="Authentication provider configuration",
    )
-    config: dict[str, Any] = Field(
-        ...,
-        description="Provider-specific configuration",
+    access_policy: list[AccessRule] = Field(
+        default=[],
+        description="Rules for determining access to resources",
    )
-    access_policy: list[AccessRule] = Field(default=[], description="Rules for determining access to resources")


 class AuthenticationRequiredError(Exception):
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -200,7 +200,7 @@ def validate_and_prepare_providers(
        specs = {}
        for provider in providers:
            if not provider.provider_id or provider.provider_id == "__disabled__":
-                logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
+                logger.debug(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
                continue

            validate_provider(provider, api, provider_registry)
--- a/llama_stack/distribution/routers/vector_io.py
+++ b/llama_stack/distribution/routers/vector_io.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import asyncio
+import uuid
 from typing import Any

 from llama_stack.apis.common.content_types import (
@ -81,6 +82,7 @@ class VectorIORouter(VectorIO):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> None:
        logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
@ -89,6 +91,7 @@ class VectorIORouter(VectorIO):
            embedding_model,
            embedding_dimension,
            provider_id,
+            vector_db_name,
            provider_vector_db_id,
        )

@ -123,7 +126,6 @@ class VectorIORouter(VectorIO):
        embedding_model: str | None = None,
        embedding_dimension: int | None = None,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_create_vector_store: name={name}, provider_id={provider_id}")

@ -135,17 +137,17 @@ class VectorIORouter(VectorIO):
            embedding_model, embedding_dimension = embedding_model_info
            logger.info(f"No embedding model specified, using first available: {embedding_model}")

-        vector_db_id = name
+        vector_db_id = f"vs_{uuid.uuid4()}"
        registered_vector_db = await self.routing_table.register_vector_db(
-            vector_db_id,
-            embedding_model,
-            embedding_dimension,
-            provider_id,
-            provider_vector_db_id,
+            vector_db_id=vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=embedding_dimension,
+            provider_id=provider_id,
+            provider_vector_db_id=vector_db_id,
+            vector_db_name=name,
        )
-
        return await self.routing_table.get_provider_impl(registered_vector_db.identifier).openai_create_vector_store(
-            vector_db_id,
+            name=name,
            file_ids=file_ids,
            expires_after=expires_after,
            chunking_strategy=chunking_strategy,
--- a/llama_stack/distribution/routing_tables/vector_dbs.py
+++ b/llama_stack/distribution/routing_tables/vector_dbs.py
@ -36,6 +36,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
        provider_vector_db_id: str | None = None,
+        vector_db_name: str | None = None,
    ) -> VectorDB:
        if provider_vector_db_id is None:
            provider_vector_db_id = vector_db_id
@ -62,6 +63,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
            "provider_resource_id": provider_vector_db_id,
            "embedding_model": embedding_model,
            "embedding_dimension": model.metadata["embedding_dimension"],
+            "vector_db_name": vector_db_name,
        }
        vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
        await self.register_object(vector_db)
--- a/llama_stack/distribution/server/auth.py
+++ b/llama_stack/distribution/server/auth.py
@ -87,8 +87,12 @@ class AuthenticationMiddleware:
            headers = dict(scope.get("headers", []))
            auth_header = headers.get(b"authorization", b"").decode()

-            if not auth_header or not auth_header.startswith("Bearer "):
-                return await self._send_auth_error(send, "Missing or invalid Authorization header")
+            if not auth_header:
+                error_msg = self.auth_provider.get_auth_error_message(scope)
+                return await self._send_auth_error(send, error_msg)
+
+            if not auth_header.startswith("Bearer "):
+                return await self._send_auth_error(send, "Invalid Authorization header format")

            token = auth_header.split("Bearer ", 1)[1]

--- a/llama_stack/distribution/server/auth_providers.py
+++ b/llama_stack/distribution/server/auth_providers.py
@ -8,15 +8,19 @@ import ssl
 import time
 from abc import ABC, abstractmethod
 from asyncio import Lock
-from pathlib import Path
-from typing import Self
-from urllib.parse import parse_qs
+from urllib.parse import parse_qs, urlparse

 import httpx
 from jose import jwt
-from pydantic import BaseModel, Field, field_validator, model_validator
+from pydantic import BaseModel, Field

-from llama_stack.distribution.datatypes import AuthenticationConfig, AuthProviderType, User
+from llama_stack.distribution.datatypes import (
+    AuthenticationConfig,
+    CustomAuthConfig,
+    GitHubTokenAuthConfig,
+    OAuth2TokenAuthConfig,
+    User,
+)
 from llama_stack.log import get_logger

 logger = get_logger(name=__name__, category="auth")
@ -38,9 +42,7 @@ class AuthRequestContext(BaseModel):

    headers: dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")

-    params: dict[str, list[str]] = Field(
-        description="Query parameters from the original request, parsed as dictionary of lists"
-    )
+    params: dict[str, list[str]] = Field(default_factory=dict, description="Query parameters from the original request")


 class AuthRequest(BaseModel):
@ -62,6 +64,10 @@ class AuthProvider(ABC):
        """Clean up any resources."""
        pass

+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return provider-specific authentication error message.
+
+        This base implementation does not look at ``scope`` and always returns
+        the same generic text.
+
+        :param scope: optional request scope; unused here
+        :return: human-readable authentication error message
+        """
+        return "Authentication required"
+

 def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> dict[str, list[str]]:
    attributes: dict[str, list[str]] = {}
@ -81,56 +87,6 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str])
    return attributes


-class OAuth2JWKSConfig(BaseModel):
-    # The JWKS URI for collecting public keys
-    uri: str
-    token: str | None = Field(default=None, description="token to authorise access to jwks")
-    key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
-
-
-class OAuth2IntrospectionConfig(BaseModel):
-    url: str
-    client_id: str
-    client_secret: str
-    send_secret_in_body: bool = False
-
-
-class OAuth2TokenAuthProviderConfig(BaseModel):
-    audience: str = "llama-stack"
-    verify_tls: bool = True
-    tls_cafile: Path | None = None
-    issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
-    claims_mapping: dict[str, str] = Field(
-        default_factory=lambda: {
-            "sub": "roles",
-            "username": "roles",
-            "groups": "teams",
-            "team": "teams",
-            "project": "projects",
-            "tenant": "namespaces",
-            "namespace": "namespaces",
-        },
-    )
-    jwks: OAuth2JWKSConfig | None
-    introspection: OAuth2IntrospectionConfig | None = None
-
-    @classmethod
-    @field_validator("claims_mapping")
-    def validate_claims_mapping(cls, v):
-        for key, value in v.items():
-            if not value:
-                raise ValueError(f"claims_mapping value cannot be empty: {key}")
-        return v
-
-    @model_validator(mode="after")
-    def validate_mode(self) -> Self:
-        if not self.jwks and not self.introspection:
-            raise ValueError("One of jwks or introspection must be configured")
-        if self.jwks and self.introspection:
-            raise ValueError("At present only one of jwks or introspection should be configured")
-        return self
-
-
 class OAuth2TokenAuthProvider(AuthProvider):
    """
    JWT token authentication provider that validates a JWT token and extracts access attributes.
@ -138,7 +94,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
    This should be the standard authentication provider for most use cases.
    """

-    def __init__(self, config: OAuth2TokenAuthProviderConfig):
+    def __init__(self, config: OAuth2TokenAuthConfig):
        self.config = config
        self._jwks_at: float = 0.0
        self._jwks: dict[str, str] = {}
@ -170,7 +126,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
                issuer=self.config.issuer,
            )
        except Exception as exc:
-            raise ValueError(f"Invalid JWT token: {token}") from exc
+            raise ValueError("Invalid JWT token") from exc

        # There are other standard claims, the most relevant of which is `scope`.
        # We should incorporate these into the access attributes.
@ -232,6 +188,17 @@ class OAuth2TokenAuthProvider(AuthProvider):
    async def close(self):
        pass

+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return OAuth2-specific authentication error message.
+
+        The message names the configured issuer when one is set; otherwise it
+        names the network location of the introspection URL; otherwise it falls
+        back to a generic Bearer-token hint. ``scope`` is not used.
+
+        :param scope: optional request scope; unused here
+        :return: human-readable authentication error message
+        """
+        if self.config.issuer:
+            return f"Authentication required. Please provide a valid OAuth2 Bearer token from {self.config.issuer}"
+        elif self.config.introspection:
+            # Extract domain from introspection URL for a cleaner message
+            domain = urlparse(self.config.introspection.url).netloc
+            return f"Authentication required. Please provide a valid OAuth2 Bearer token validated by {domain}"
+        else:
+            return "Authentication required. Please provide a valid OAuth2 Bearer token in the Authorization header"
+
    async def _refresh_jwks(self) -> None:
        """
        Refresh the JWKS cache.
@ -264,14 +231,10 @@ class OAuth2TokenAuthProvider(AuthProvider):
                    self._jwks_at = time.time()


-class CustomAuthProviderConfig(BaseModel):
-    endpoint: str
-
-
 class CustomAuthProvider(AuthProvider):
    """Custom authentication provider that uses an external endpoint."""

-    def __init__(self, config: CustomAuthProviderConfig):
+    def __init__(self, config: CustomAuthConfig):
        self.config = config
        self._client = None

@ -317,7 +280,7 @@ class CustomAuthProvider(AuthProvider):
                try:
                    response_data = response.json()
                    auth_response = AuthResponse(**response_data)
-                    return User(auth_response.principal, auth_response.attributes)
+                    return User(principal=auth_response.principal, attributes=auth_response.attributes)
                except Exception as e:
                    logger.exception("Error parsing authentication response")
                    raise ValueError("Invalid authentication response format") from e
@ -338,15 +301,88 @@ class CustomAuthProvider(AuthProvider):
            await self._client.aclose()
            self._client = None

+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return custom auth provider-specific authentication error message.
+
+        When the configured endpoint URL has a network location, the message
+        mentions it; otherwise a generic Bearer-token hint is returned.
+        ``scope`` is not used.
+
+        :param scope: optional request scope; unused here
+        :return: human-readable authentication error message
+        """
+        # Only the host part of the endpoint is shown, not the full URL
+        domain = urlparse(self.config.endpoint).netloc
+        if domain:
+            return f"Authentication required. Please provide your API key as a Bearer token (validated by {domain})"
+        else:
+            return "Authentication required. Please provide your API key as a Bearer token in the Authorization header"
+
+
+class GitHubTokenAuthProvider(AuthProvider):
+    """
+    GitHub token authentication provider that validates GitHub access tokens directly.
+
+    This provider accepts GitHub personal access tokens or OAuth tokens and verifies
+    them against the GitHub API to get user information.
+    """
+
+    def __init__(self, config: GitHubTokenAuthConfig):
+        # The config supplies github_api_base_url and claims_mapping, both read in validate_token
+        self.config = config
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a GitHub token by calling the GitHub API.
+
+        This validates tokens issued by GitHub (personal access tokens or OAuth tokens).
+
+        :param token: the GitHub token to check
+        :param scope: optional request scope; unused here
+        :return: a User whose principal is the GitHub login and whose attributes
+            are derived from the login, id and organizations via the configured
+            claims mapping
+        :raises ValueError: if the GitHub API answers with an error status
+        """
+        try:
+            user_info = await _get_github_user_info(token, self.config.github_api_base_url)
+        except httpx.HTTPStatusError as e:
+            # Only HTTP status errors are translated; any other exception propagates unchanged
+            logger.warning(f"GitHub token validation failed: {e}")
+            raise ValueError("GitHub token validation failed. Please check your token and try again.") from e
+
+        principal = user_info["user"]["login"]
+
+        # NOTE(review): _get_github_user_info, as defined in this file, returns only a
+        # "user" key, so the "organizations" lookup below falls back to [] — confirm
+        # whether organizations are meant to be fetched.
+        github_data = {
+            "login": user_info["user"]["login"],
+            "id": str(user_info["user"]["id"]),
+            "organizations": user_info.get("organizations", []),
+        }
+
+        access_attributes = get_attributes_from_claims(github_data, self.config.claims_mapping)
+
+        return User(
+            principal=principal,
+            attributes=access_attributes,
+        )
+
+    async def close(self):
+        """Clean up any resources."""
+        # Nothing to release: this class keeps no client or connection on the instance
+        pass
+
+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return GitHub-specific authentication error message."""
+        return "Authentication required. Please provide a valid GitHub access token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) in the Authorization header (Bearer <token>)"
+
+
+async def _get_github_user_info(access_token: str, github_api_base_url: str) -> dict:
+    """Fetch the authenticated user's profile from the GitHub API.
+
+    Issues a single GET to ``{github_api_base_url}/user`` with the token sent as
+    a Bearer credential. Organizations are not fetched: the returned dict has
+    only a ``"user"`` key.
+
+    :param access_token: token placed in the Authorization header
+    :param github_api_base_url: base URL to which ``/user`` is appended
+    :return: ``{"user": <decoded JSON body of the /user response>}``
+    :raises httpx.HTTPStatusError: if the response carries an error status
+    """
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Accept": "application/vnd.github.v3+json",
+        "User-Agent": "llama-stack",
+    }
+
+    # A fresh client is created per call and closed when the block exits
+    async with httpx.AsyncClient() as client:
+        user_response = await client.get(f"{github_api_base_url}/user", headers=headers, timeout=10.0)
+        user_response.raise_for_status()
+        user_data = user_response.json()
+
+        return {
+            "user": user_data,
+        }
+

 def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
    """Factory function to create the appropriate auth provider."""
-    provider_type = config.provider_type.lower()
+    provider_config = config.provider_config

-    if provider_type == "custom":
-        return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config))
-    elif provider_type == "oauth2_token":
-        return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config))
+    if isinstance(provider_config, CustomAuthConfig):
+        return CustomAuthProvider(provider_config)
+    elif isinstance(provider_config, OAuth2TokenAuthConfig):
+        return OAuth2TokenAuthProvider(provider_config)
+    elif isinstance(provider_config, GitHubTokenAuthConfig):
+        return GitHubTokenAuthProvider(provider_config)
    else:
-        supported_providers = ", ".join([t.value for t in AuthProviderType])
-        raise ValueError(f"Unsupported auth provider type: {provider_type}. Supported types are: {supported_providers}")
+        raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -33,7 +33,11 @@ from pydantic import BaseModel, ValidationError

 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.distribution.access_control.access_control import AccessDeniedError
-from llama_stack.distribution.datatypes import AuthenticationRequiredError, LoggingConfig, StackRunConfig
+from llama_stack.distribution.datatypes import (
+    AuthenticationRequiredError,
+    LoggingConfig,
+    StackRunConfig,
+)
 from llama_stack.distribution.distribution import builtin_automatically_routed_apis
 from llama_stack.distribution.request_headers import PROVIDER_DATA_VAR, User, request_provider_data_context
 from llama_stack.distribution.resolver import InvalidProviderError
@ -43,6 +47,7 @@ from llama_stack.distribution.server.routes import (
    initialize_route_impls,
 )
 from llama_stack.distribution.stack import (
+    cast_image_name_to_string,
    construct_stack,
    replace_env_vars,
    validate_env_pair,
@ -217,7 +222,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
        # Get auth attributes from the request scope
        user_attributes = request.scope.get("user_attributes", {})
        principal = request.scope.get("principal", "")
-        user = User(principal, user_attributes)
+        user = User(principal=principal, attributes=user_attributes)

        await log_request_pre_validation(request)

@ -405,13 +410,13 @@ def main(args: argparse.Namespace | None = None):
        args = parser.parse_args()

    log_line = ""
-    if args.config:
+    if hasattr(args, "config") and args.config:
        # if the user provided a config file, use it, even if template was specified
        config_file = Path(args.config)
        if not config_file.exists():
            raise ValueError(f"Config file {config_file} does not exist")
        log_line = f"Using config file: {config_file}"
-    elif args.template:
+    elif hasattr(args, "template") and args.template:
        config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml"
        if not config_file.exists():
            raise ValueError(f"Template {args.template} does not exist")
@ -435,14 +440,12 @@ def main(args: argparse.Namespace | None = None):
                    logger.error(f"Error: {str(e)}")
                    sys.exit(1)
        config = replace_env_vars(config_contents)
-        config = StackRunConfig(**config)
+        config = StackRunConfig(**cast_image_name_to_string(config))

    # now that the logger is initialized, print the line about which type of config we are using.
    logger.info(log_line)

-    logger.info("Run configuration:")
-    safe_config = redact_sensitive_fields(config.model_dump())
-    logger.info(yaml.dump(safe_config, indent=2))
+    _log_run_config(run_config=config)

    app = FastAPI(
        lifespan=lifespan,
@ -450,12 +453,13 @@ def main(args: argparse.Namespace | None = None):
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )
+
    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
        app.add_middleware(ClientVersionMiddleware)

    # Add authentication middleware if configured
    if config.server.auth:
-        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
+        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
    else:
        if config.server.quota:
@ -488,7 +492,13 @@ def main(args: argparse.Namespace | None = None):
        )

    try:
-        impls = asyncio.run(construct_stack(config))
+        # Create and set the event loop that will be used for both construction and server runtime
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Construct the stack in the persistent event loop
+        impls = loop.run_until_complete(construct_stack(config))
+
    except InvalidProviderError as e:
        logger.error(f"Error: {str(e)}")
        sys.exit(1)
@ -586,7 +596,16 @@ def main(args: argparse.Namespace | None = None):
    if ssl_config:
        uvicorn_config.update(ssl_config)

-    uvicorn.run(**uvicorn_config)
+    # Run uvicorn in the existing event loop to preserve background tasks
+    loop.run_until_complete(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
+
+
+def _log_run_config(run_config: StackRunConfig):
+    """Logs the run config with redacted fields and disabled providers removed.
+
+    The config is dumped to plain data, passed through redact_sensitive_fields
+    and remove_disabled_providers, then logged as YAML at info level.
+
+    :param run_config: the run configuration to log
+    """
+    logger.info("Run configuration:")
+    # Redaction runs before the disabled-provider filtering, on the dumped data
+    safe_config = redact_sensitive_fields(run_config.model_dump(mode="json"))
+    clean_config = remove_disabled_providers(safe_config)
+    logger.info(yaml.dump(clean_config, indent=2))


 def extract_path_params(route: str) -> list[str]:
@ -597,5 +616,20 @@ def extract_path_params(route: str) -> list[str]:
    return params


+def remove_disabled_providers(obj):
+    """Recursively drop entries marked "__disabled__" from nested dicts and lists.
+
+    A dict whose "provider_id", "shield_id" or "provider_model_id" equals
+    "__disabled__" is replaced by None. Dict values and list items that come
+    back as None are then removed from their container, which also removes
+    values that were None before filtering. Anything that is neither a dict nor
+    a list is returned unchanged.
+
+    :param obj: a dict, a list, or any other value
+    :return: a new filtered dict or list, None for a disabled dict, or ``obj``
+        itself for other values
+    """
+    if isinstance(obj, dict):
+        if (
+            obj.get("provider_id") == "__disabled__"
+            or obj.get("shield_id") == "__disabled__"
+            or obj.get("provider_model_id") == "__disabled__"
+        ):
+            return None
+        # Recurse into every value first, then keep only the keys whose result is not None
+        return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
+    elif isinstance(obj, list):
+        # Same idea for lists: recurse into each item and drop the ones that became None
+        return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
+    else:
+        return obj
+
+
 if __name__ == "__main__":
    main()
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -98,6 +98,7 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):

        method = getattr(impls[api], register_method)
        for obj in objects:
+            logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
            # Do not register models on disabled providers
            if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__":
                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
@ -112,6 +113,11 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
            ):
                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.")
                continue
+
+            if hasattr(obj, "shield_id") and obj.shield_id is not None and obj.shield_id == "__disabled__":
+                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled shield.")
+                continue
+
            # we want to maintain the type information in arguments to method.
            # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
            # we use model_dump() to find all the attrs and then getattr to get the still typed value.
@ -166,7 +172,6 @@ def replace_env_vars(config: Any, path: str = "") -> Any:
                            # Create a copy with resolved provider_id but original config
                            disabled_provider = v.copy()
                            disabled_provider["provider_id"] = resolved_provider_id
-                            result.append(disabled_provider)
                            continue
                    except EnvVarError:
                        # If we can't resolve the provider_id, continue with normal processing
@ -261,6 +266,13 @@ def _convert_string_to_proper_type(value: str) -> Any:
    return value


+def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Ensure that any value for a key 'image_name' in a config_dict is a string
+
+    Only the top-level 'image_name' key is examined. A missing key or a None
+    value is left alone. The dict is modified in place and the same object is
+    returned.
+
+    :param config_dict: configuration mapping to normalize
+    :return: the same ``config_dict`` object
+    """
+    if "image_name" in config_dict and config_dict["image_name"] is not None:
+        config_dict["image_name"] = str(config_dict["image_name"])
+    return config_dict
+
+
 def validate_env_pair(env_pair: str) -> tuple[str, str]:
    """Validate and split an environment variable key-value pair."""
    try:
--- a/llama_stack/distribution/utils/context.py
+++ b/llama_stack/distribution/utils/context.py
@ -6,12 +6,9 @@

 from collections.abc import AsyncGenerator
 from contextvars import ContextVar
-from typing import TypeVar
-
-T = TypeVar("T")


-def preserve_contexts_async_generator(
+def preserve_contexts_async_generator[T](
    gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
 ) -> AsyncGenerator[T, None]:
    """
--- a/llama_stack/models/llama/llama3/chat_format.py
+++ b/llama_stack/models/llama/llama3/chat_format.py
@ -8,6 +8,7 @@ import io
 import json
 import uuid
 from dataclasses import dataclass
+from typing import Any

 from PIL import Image as PIL_Image

@ -184,16 +185,26 @@ class ChatFormat:
            content = content[: -len("<|eom_id|>")]
            stop_reason = StopReason.end_of_message

-        tool_name = None
-        tool_arguments = {}
+        tool_name: str | BuiltinTool | None = None
+        tool_arguments: dict[str, Any] = {}

        custom_tool_info = ToolUtils.maybe_extract_custom_tool_call(content)
        if custom_tool_info is not None:
-            tool_name, tool_arguments = custom_tool_info
+            # Type guard: ensure custom_tool_info is a tuple of correct types
+            if isinstance(custom_tool_info, tuple) and len(custom_tool_info) == 2:
+                extracted_tool_name, extracted_tool_arguments = custom_tool_info
+                # Handle both dict and str return types from the function
+                if isinstance(extracted_tool_arguments, dict):
+                    tool_name, tool_arguments = extracted_tool_name, extracted_tool_arguments
+                else:
+                    # If it's a string, treat it as a query parameter
+                    tool_name, tool_arguments = extracted_tool_name, {"query": extracted_tool_arguments}
+            else:
+                tool_name, tool_arguments = None, {}
            # Sometimes when agent has custom tools alongside builtin tools
            # Agent responds for builtin tool calls in the format of the custom tools
            # This code tries to handle that case
-            if tool_name in BuiltinTool.__members__:
+            if tool_name is not None and tool_name in BuiltinTool.__members__:
                tool_name = BuiltinTool[tool_name]
                if isinstance(tool_arguments, dict):
                    tool_arguments = {
--- a/llama_stack/models/llama/llama3_3/prompts.py
+++ b/llama_stack/models/llama/llama3_3/prompts.py
@ -178,6 +178,7 @@ def usecases() -> list[UseCase | str]:
                    ),
                    RawMessage(role="user", content="What is the 100th decimal of pi?"),
                    RawMessage(
+                        role="assistant",
                        content="",
                        stop_reason=StopReason.end_of_message,
                        tool_calls=[
--- a/llama_stack/providers/inline/agents/meta_reference/safety.py
+++ b/llama_stack/providers/inline/agents/meta_reference/safety.py
@ -24,8 +24,8 @@ class ShieldRunnerMixin:
    def __init__(
        self,
        safety_api: Safety,
-        input_shields: list[str] = None,
-        output_shields: list[str] = None,
+        input_shields: list[str] | None = None,
+        output_shields: list[str] | None = None,
    ):
        self.safety_api = safety_api
        self.input_shields = input_shields
@ -37,6 +37,7 @@ class ShieldRunnerMixin:
                return await self.safety_api.run_shield(
                    shield_id=identifier,
                    messages=messages,
+                    params={},
                )

        responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers])
--- a/Show more
+++ b/Show more