Merge branch 'main' into allow-dynamic-models-ollama

2025-07-25 21:57:45 +00:00 · 2025-07-21 05:17:29 -04:00 · 2025-07-21 05:17:29 -04:00 · c67bae2d07
commit c67bae2d07
parent 89b1052806 28956f9447
145 changed files with 6481 additions and 5159 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -4,3 +4,9 @@ omit =
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*
+    */llama_stack/cli/scripts/*
+    */llama_stack/ui/*
+    */llama_stack/distribution/ui/*
+    */llama_stack/strong_typing/*
+    */llama_stack/env.py
+    */__init__.py
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
--- a/.github/ISSUE_TEMPLATE/tech-debt.yml
+++ b/.github/ISSUE_TEMPLATE/tech-debt.yml
@ -0,0 +1,30 @@
+name: 🔧 Tech Debt
+description: Something that is functional but should be improved or optimized
+labels: ["tech-debt"]
+body:
+- type: textarea
+  id: tech-debt-explanation
+  attributes:
+    label: 🤔 What is the technical debt you think should be addressed?
+    description: >
+      A clear and concise description of _what_ needs to be addressed - ensure that what you are
+      describing constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is
+      not a bug or feature request.
+  validations:
+    required: true
+
+- type: textarea
+  id: tech-debt-motivation
+  attributes:
+    label: 💡 What is the benefit of addressing this technical debt?
+    description: >
+      A clear and concise description of _why_ this work is needed.
+  validations:
+    required: true
+
+- type: textarea
+  id: other-thoughts
+  attributes:
+    label: Other thoughts
+    description: >
+      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -7,7 +7,5 @@ runs:
      shell: bash
      run: |
        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
-        # TODO: rebuild an ollama image with llama-guard3:1b
        echo "Verifying Ollama status..."
        timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'
-        docker exec ollama ollama pull llama-guard3:1b
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -5,6 +5,10 @@ inputs:
    description: The Python version to use
    required: false
    default: "3.12"
+  client-version:
+    description: The llama-stack-client-python version to test against (latest or published)
+    required: false
+    default: "latest"
 runs:
  using: "composite"
  steps:
@ -20,8 +24,17 @@ runs:
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
-        # always test against the latest version of the client
-        # TODO: this is not necessarily a good idea. we need to test against both published and latest
-        # to find out backwards compatibility issues.
-        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+
+        # Install llama-stack-client-python based on the client-version input
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
+          echo "Installing latest llama-stack-client-python from main branch"
+          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+        elif [ "${{ inputs.client-version }}" = "published" ]; then
+          echo "Installing published llama-stack-client-python from PyPI"
+          uv pip install llama-stack-client
+        else
+          echo "Invalid client-version: ${{ inputs.client-version }}"
+          exit 1
+        fi
+
        uv pip install -e .
--- a/.github/workflows/coverage-badge.yml
+++ b/.github/workflows/coverage-badge.yml
@ -0,0 +1,57 @@
+name: Coverage Badge
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/unit/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/unit-tests.yml'
+      - '.github/workflows/coverage-badge.yml' # This workflow
+  workflow_dispatch:
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Run unit tests
+        run: |
+          ./scripts/unit-tests.sh
+
+      - name: Coverage Badge
+        uses: tj-actions/coverage-badge-py@1788babcb24544eb5bbb6e0d374df5d1e54e670f # v2.0.4
+
+      - name: Verify Changed files
+        uses: tj-actions/verify-changed-files@a1c6acee9df209257a246f2cc6ae8cb6581c1edf # v20.0.4
+        id: verify-changed-files
+        with:
+          files: coverage.svg
+
+      - name: Commit files
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add coverage.svg
+          git commit -m "Updated coverage.svg"
+
+      - name: Create Pull Request
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          title: "ci: [Automatic] Coverage Badge Update"
+          body: |
+            This PR updates the coverage badge based on the latest coverage report.
+
+            Automatically generated by the [workflow coverage-badge.yml](.github/workflows/coverage-badge.yml)
+          delete-branch: true
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@ -1,355 +0,0 @@
-name: "Run Llama-stack Tests"
-
-on:
-  #### Temporarily disable PR runs until tests run as intended within mainline.
-  #TODO Add this back.
-  #pull_request_target:
-  #  types: ["opened"]
-  #  branches:
-  #    - 'main'
-  #  paths:
-  #    - 'llama_stack/**/*.py'
-  #    - 'tests/**/*.py'
-
-  workflow_dispatch:
-    inputs:
-      runner:
-        description: 'GHA Runner Scale Set label to run workflow on.'
-        required: true
-        default: "llama-stack-gha-runner-gpu"
-
-      checkout_reference:
-        description: "The branch, tag, or SHA to checkout"
-        required: true
-        default: "main"
-
-      debug:
-        description: 'Run debugging steps?'
-        required: false
-        default: "true"
-
-      sleep_time:
-        description: '[DEBUG] sleep time for debugging'
-        required: true
-        default: "0"
-
-      provider_id:
-        description: 'ID of your provider'
-        required: true
-        default: "meta_reference"
-
-      model_id:
-        description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
-        required: true
-        default: "llama_3b"
-
-      model_override_3b:
-        description: 'Specify shorthand model for <llama_3b> '
-        required: false
-        default: "Llama3.2-3B-Instruct"
-
-      model_override_8b:
-        description: 'Specify shorthand model for <llama_8b> '
-        required: false
-        default: "Llama3.1-8B-Instruct"
-
-env:
-  # ID used for each test's provider config
-  PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
-
-  # Path to model checkpoints within EFS volume
-  MODEL_CHECKPOINT_DIR: "/data/llama"
-
-  # Path to directory to run tests from
-  TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
-
-  # Keep track of a list of model IDs that are valid to use within pytest fixture marks
-  AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
-
-  # Shorthand name for model ID, used in pytest fixture marks
-  MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
-
-  # Override the `llama_3b` / `llama_8b' models, else use the default.
-  LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
-  LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
-
-  # Defines which directories in TESTS_PATH to exclude from the test loop
-  EXCLUDED_DIRS: "__pycache__"
-
-  # Defines the output xml reports generated after a test is run
-  REPORTS_GEN: ""
-
-jobs:
-  execute_workflow:
-    name: Execute workload on Self-Hosted GPU k8s runner
-    permissions:
-      pull-requests: write
-    defaults:
-      run:
-        shell: bash
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
-    if: always()
-    steps:
-
-      ##############################
-      #### INITIAL DEBUG CHECKS ####
-      ##############################
-      - name: "[DEBUG] Check content of the EFS mount"
-        id: debug_efs_volume
-        continue-on-error: true
-        if: inputs.debug == 'true'
-        run: |
-            echo "========= Content of the EFS mount ============="
-            ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
-
-      - name: "[DEBUG] Get runner container OS information"
-        id: debug_os_info
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            cat /etc/os-release
-
-      - name: "[DEBUG] Print environment variables"
-        id: debug_env_vars
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            echo "PROVIDER_ID = ${PROVIDER_ID}"
-            echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
-            echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
-            echo "MODEL_ID = ${MODEL_ID}"
-            echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
-            echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
-            echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
-            echo "REPORTS_GEN = ${REPORTS_GEN}"
-
-      ############################
-      #### MODEL INPUT CHECKS ####
-      ############################
-
-      - name: "Check if env.model_id is valid"
-        id: check_model_id
-        run: |
-          if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
-            echo "Model ID '${MODEL_ID}' is valid."
-          else
-            echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
-            exit 1
-          fi
-
-      #######################
-      #### CODE CHECKOUT ####
-      #######################
-      - name: "Checkout 'meta-llama/llama-stack' repository"
-        id: checkout_repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          ref: ${{ inputs.branch }}
-
-      - name: "[DEBUG] Content of the repository after checkout"
-        id: debug_content_after_checkout
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-            ls -la ${GITHUB_WORKSPACE}
-
-      ##########################################################
-      ####              OPTIONAL SLEEP DEBUG                ####
-      #                                                        #
-      # Use to "exec" into the test k8s POD and run tests      #
-      # manually to identify what dependencies are being used. #
-      #                                                        #
-      ##########################################################
-      - name: "[DEBUG] sleep"
-        id: debug_sleep
-        if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
-        run: |
-            sleep ${{ inputs.sleep_time }}
-
-      ############################
-      #### UPDATE SYSTEM PATH ####
-      ############################
-      - name: "Update path: execute"
-        id: path_update_exec
-        run: |
-          # .local/bin is needed for certain libraries installed below to be recognized
-          # when calling their executable to install sub-dependencies
-          mkdir -p ${HOME}/.local/bin
-          echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
-
-      #####################################
-      #### UPDATE CHECKPOINT DIRECTORY ####
-      #####################################
-      - name: "Update checkpoint directory"
-        id: checkpoint_update
-        run: |
-          echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
-          if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
-            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
-          elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
-            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
-          else
-            echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
-            exit 1
-          fi
-
-      - name: "[DEBUG] Checkpoint update check"
-        id: debug_checkpoint_update
-        if: ${{ inputs.debug == 'true' }}
-        run: |
-          echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
-
-      ##################################
-      #### DEPENDENCY INSTALLATIONS ####
-      ##################################
-      - name: "Installing 'apt' required packages"
-        id: install_apt
-        run: |
-          echo "[STEP] Installing 'apt' required packages"
-          sudo apt update -y
-          sudo apt install -y python3 python3-pip npm wget
-
-      - name: "Installing packages with 'curl'"
-        id: install_curl
-        run: |
-          curl -fsSL https://ollama.com/install.sh | sh
-
-      - name: "Installing packages with 'wget'"
-        id: install_wget
-        run: |
-          wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-          chmod +x Miniconda3-latest-Linux-x86_64.sh
-          ./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0
-          # Add miniconda3 bin to system path
-          echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
-
-      - name: "Installing packages with 'npm'"
-        id: install_npm_generic
-        run: |
-          sudo npm install -g junit-merge
-
-      - name: "Installing pip dependencies"
-        id: install_pip_generic
-        run: |
-          echo "[STEP] Installing 'llama-stack' models"
-          pip install -U pip setuptools
-          pip install -r requirements.txt
-          pip install -e .
-          pip install -U \
-            torch torchvision \
-            pytest pytest_asyncio \
-            fairscale lm-format-enforcer \
-            zmq chardet pypdf \
-            pandas sentence_transformers together \
-            aiosqlite
-      - name: "Installing packages with conda"
-        id: install_conda_generic
-        run: |
-          conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
-
-      #############################################################
-      #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
-      #############################################################
-      - name: "Run Tests: Loop"
-        id: run_tests_loop
-        working-directory: "${{ github.workspace }}"
-        run: |
-          pattern=""
-          for dir in llama_stack/providers/tests/*; do
-            if [ -d "$dir" ]; then
-              dir_name=$(basename "$dir")
-              if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
-                for file in "$dir"/test_*.py; do
-                  test_name=$(basename "$file")
-                  new_file="result-${dir_name}-${test_name}.xml"
-                  if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
-                     --junitxml="${{ github.workspace }}/${new_file}"; then
-                    echo "Ran test: ${test_name}"
-                  else
-                    echo "Did NOT run test: ${test_name}"
-                  fi
-                  pattern+="${new_file} "
-                done
-              fi
-            fi
-          done
-          echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
-
-      - name: "Test Summary: Merge"
-        id: test_summary_merge
-        working-directory: "${{ github.workspace }}"
-        run: |
-          echo "Merging the following test result files: ${REPORTS_GEN}"
-          # Defaults to merging them into 'merged-test-results.xml'
-          junit-merge ${{ env.REPORTS_GEN }}
-
-      ############################################
-      #### AUTOMATIC TESTING ON PULL REQUESTS ####
-      ############################################
-
-      #### Run tests ####
-
-      - name: "PR - Run Tests"
-        id: pr_run_tests
-        working-directory: "${{ github.workspace }}"
-        if: github.event_name == 'pull_request_target'
-        run: |
-          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
-          # (Optional) Add more tests here.
-
-          # Merge test results with 'merged-test-results.xml' from above.
-          # junit-merge <new-test-results> merged-test-results.xml
-
-      #### Create test summary ####
-
-      - name: "PR - Test Summary"
-        id: pr_test_summary_create
-        if: github.event_name == 'pull_request_target'
-        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
-        with:
-          paths: "${{ github.workspace }}/merged-test-results.xml"
-          output: test-summary.md
-
-      - name: "PR - Upload Test Summary"
-        id: pr_test_summary_upload
-        if: github.event_name == 'pull_request_target'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: test-summary
-          path: test-summary.md
-
-      #### Update PR request ####
-
-      - name: "PR - Update comment"
-        id: pr_update_comment
-        if: github.event_name == 'pull_request_target'
-        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
-        with:
-          filePath: test-summary.md
-
-      ########################
-      #### MANUAL TESTING ####
-      ########################
-
-      #### Run tests ####
-
-      - name: "Manual - Run Tests: Prep"
-        id: manual_run_tests
-        working-directory: "${{ github.workspace }}"
-        if: github.event_name == 'workflow_dispatch'
-        run: |
-          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
-
-          #TODO Use this when collection errors are resolved
-          # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
-
-          # (Optional) Add more tests here.
-
-          # Merge test results with 'merged-test-results.xml' from above.
-          # junit-merge <new-test-results> merged-test-results.xml
-
-      #### Create test summary ####
-
-      - name: "Manual - Test Summary"
-        id: manual_test_summary
-        if: always() && github.event_name == 'workflow_dispatch'
-        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
-        with:
-          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -7,11 +7,20 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - 'tests/integration/**'
+      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow
+      - '.github/actions/setup-ollama/action.yml'
+  schedule:
+    - cron: '0 0 * * *'  # Daily at 12 AM UTC
+  workflow_dispatch:
+    inputs:
+      test-all-client-versions:
+        description: 'Test against both the latest and published versions'
+        type: boolean
+        default: false

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@ -45,6 +54,7 @@ jobs:
        test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
        client-type: [library, server]
        python-version: ["3.12", "3.13"]
+        client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}

    steps:
      - name: Checkout repository
@ -54,13 +64,14 @@ jobs:
        uses: ./.github/actions/setup-runner
        with:
          python-version: ${{ matrix.python-version }}
+          client-version: ${{ matrix.client-version }}

      - name: Setup ollama
        uses: ./.github/actions/setup-ollama

      - name: Build Llama Stack
        run: |
-          uv run llama stack build --template starter --image-type venv
+          uv run llama stack build --template ci-tests --image-type venv

      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
@ -81,15 +92,15 @@ jobs:
        shell: bash
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
-            stack_config="starter"
+            stack_config="ci-tests"
          else
-            stack_config="server:starter"
+            stack_config="server:ci-tests"
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
            --text-model="ollama/llama3.2:3b-instruct-fp16" \
            --embedding-model=all-MiniLM-L6-v2 \
-            --safety-shield=ollama \
+            --safety-shield=$SAFETY_MODEL \
            --color=yes \
            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log

@ -108,7 +119,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
          path: |
            *.log
          retention-days: 1
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -93,7 +93,7 @@ jobs:

      - name: Build Llama Stack
        run: |
-          uv run llama stack build --template starter --image-type venv
+          uv run llama stack build --template ci-tests --image-type venv

      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -97,9 +97,9 @@ jobs:

      - name: Build a single provider
        run: |
-          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
-          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
+          yq -i '.image_type = "container"' llama_stack/templates/ci-tests/build.yaml
+          yq -i '.image_name = "test"' llama_stack/templates/ci-tests/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml

      - name: Inspect the container image entrypoint
        run: |
@ -126,14 +126,14 @@ jobs:
            .image_type    = "container" |
            .image_name    = "ubi9-test" |
            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/templates/starter/build.yaml
+          ' llama_stack/templates/ci-tests/build.yaml

      - name: Build dev container (UBI9)
        env:
          USE_COPY_NOT_MOUNT: "true"
          LLAMA_STACK_DIR: "."
        run: |
-          uv run llama stack build --config llama_stack/templates/starter/build.yaml
+          uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml

      - name: Inspect UBI9 image
        run: |
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -20,7 +20,7 @@ jobs:
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

    - name: Install uv
-      uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1
+      uses: astral-sh/setup-uv@7edac99f961f18b581bbd960d59d049f04c0002f # v6.4.1
      with:
        python-version: ${{ matrix.python-version }}
        activate-environment: true
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -1,69 +0,0 @@
-name: auto-tests
-
-on:
-  # pull_request:
-  workflow_dispatch:
-    inputs:
-      commit_sha:
-        description: 'Specific Commit SHA to trigger on'
-        required: false
-        default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
-
-jobs:
-  test-llama-stack-as-library:
-    runs-on: ubuntu-latest
-    env:
-      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
-      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
-      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
-    strategy:
-      matrix:
-        provider: [fireworks, together]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          ref: ${{ github.event.inputs.commit_sha }}
-
-      - name: Echo commit SHA
-        run: |
-          echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
-          git rev-parse HEAD
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt pytest
-          pip install -e .
-
-      - name: Build providers
-        run: |
-          llama stack build --template ${{ matrix.provider }} --image-type venv
-
-      - name: Install the latest llama-stack-client & llama-models packages
-        run: |
-          pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
-          pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
-
-      - name: Run client-sdk test
-        working-directory: "${{ github.workspace }}"
-        env:
-          REPORT_OUTPUT: md_report.md
-        shell: bash
-        run: |
-          pip install --upgrade pytest-md-report
-          echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
-
-          export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-          LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
-
-      - name: Output reports to the job summary
-        if: always()
-        shell: bash
-        run: |
-          if [ -f "$REPORT_FILE" ]; then
-            echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            echo "</details>" >> $GITHUB_STEP_SUMMARY
-          fi
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -36,7 +36,7 @@ jobs:

      - name: Run unit tests
        run: |
-          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
+          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml

      - name: Upload test results
        if: always()
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -129,6 +129,22 @@ repos:
        require_serial: true
        always_run: true
        files: ^llama_stack/.*$
+      - id: forbid-pytest-asyncio
+        name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
+        entry: bash
+        language: system
+        types: [python]
+        pass_filenames: true
+        args:
+          - -c
+          - |
+            grep -EnH '^[^#]*@pytest\.mark\.asyncio|@pytest_asyncio\.fixture' "$@" && {
+              echo;
+              echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
+              echo "   pytest is already configured with async-mode=auto."
+              echo;
+              exit 1;
+            } || true

 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -112,7 +112,7 @@ uv run pre-commit run --all-files

 ## Running tests

-You can find the Llama Stack testing documentation here [here](tests/README.md).
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).

 ## Adding a new dependency to the project

--- a/README.md
+++ b/README.md
@ -6,6 +6,7 @@
 [![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
 [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
+![coverage badge](./coverage.svg)

 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

--- a/coverage.svg
+++ b/coverage.svg
@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
+    <linearGradient id="b" x2="0" y2="100%">
+        <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
+        <stop offset="1" stop-opacity=".1"/>
+    </linearGradient>
+    <mask id="a">
+        <rect width="99" height="20" rx="3" fill="#fff"/>
+    </mask>
+    <g mask="url(#a)">
+        <path fill="#555" d="M0 0h63v20H0z"/>
+        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
+        <path fill="url(#b)" d="M0 0h99v20H0z"/>
+    </g>
+    <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
+        <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
+        <text x="31.5" y="14">coverage</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
+        <text x="80" y="14">44%</text>
+    </g>
+</svg>
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -11340,6 +11340,9 @@
                    },
                    "embedding_dimension": {
                        "type": "integer"
+                    },
+                    "vector_db_name": {
+                        "type": "string"
                    }
                },
                "additionalProperties": false,
@ -13590,10 +13593,6 @@
                    "provider_id": {
                        "type": "string",
                        "description": "The ID of the provider to use for this vector store."
-                    },
-                    "provider_vector_db_id": {
-                        "type": "string",
-                        "description": "The provider-specific vector database ID."
                    }
                },
                "additionalProperties": false,
@ -14471,28 +14470,31 @@
            "DPOAlignmentConfig": {
                "type": "object",
                "properties": {
-                    "reward_scale": {
+                    "beta": {
                        "type": "number"
                    },
-                    "reward_clip": {
-                        "type": "number"
-                    },
-                    "epsilon": {
-                        "type": "number"
-                    },
-                    "gamma": {
-                        "type": "number"
+                    "loss_type": {
+                        "$ref": "#/components/schemas/DPOLossType",
+                        "default": "sigmoid"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "reward_scale",
-                    "reward_clip",
-                    "epsilon",
-                    "gamma"
+                    "beta",
+                    "loss_type"
                ],
                "title": "DPOAlignmentConfig"
            },
+            "DPOLossType": {
+                "type": "string",
+                "enum": [
+                    "sigmoid",
+                    "hinge",
+                    "ipo",
+                    "kto_pair"
+                ],
+                "title": "DPOLossType"
+            },
            "DataConfig": {
                "type": "object",
                "properties": {
@ -15634,6 +15636,10 @@
                        "type": "string",
                        "description": "The identifier of the provider."
                    },
+                    "vector_db_name": {
+                        "type": "string",
+                        "description": "The name of the vector database."
+                    },
                    "provider_vector_db_id": {
                        "type": "string",
                        "description": "The identifier of the vector database in the provider."
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -7984,6 +7984,8 @@ components:
          type: string
        embedding_dimension:
          type: integer
+        vector_db_name:
+          type: string
      additionalProperties: false
      required:
        - identifier
@ -9494,10 +9496,6 @@ components:
          type: string
          description: >-
            The ID of the provider to use for this vector store.
-        provider_vector_db_id:
-          type: string
-          description: >-
-            The provider-specific vector database ID.
      additionalProperties: false
      required:
        - name
@ -10113,21 +10111,24 @@ components:
    DPOAlignmentConfig:
      type: object
      properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
          type: number
+        loss_type:
+          $ref: '#/components/schemas/DPOLossType'
+          default: sigmoid
      additionalProperties: false
      required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
      title: DPOAlignmentConfig
+    DPOLossType:
+      type: string
+      enum:
+        - sigmoid
+        - hinge
+        - ipo
+        - kto_pair
+      title: DPOLossType
    DataConfig:
      type: object
      properties:
@ -10945,6 +10946,9 @@ components:
        provider_id:
          type: string
          description: The identifier of the provider.
+        vector_db_name:
+          type: string
+          description: The name of the vector database.
        provider_vector_db_id:
          type: string
          description: >-
--- a/docs/source/advanced_apis/eval/index.md
+++ b/docs/source/advanced_apis/eval/index.md
@ -0,0 +1,6 @@
+# Eval Providers
+
+This section contains documentation for all available providers for the **eval** API.
+
+- [inline::meta-reference](inline_meta-reference.md)
+- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/eval/inline_meta-reference.md
+++ b/docs/source/advanced_apis/eval/inline_meta-reference.md
@ -0,0 +1,21 @@
+# inline::meta-reference
+
+## Description
+
+Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |
+
+## Sample Configuration
+
+```yaml
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
+
+```
+
--- a/docs/source/advanced_apis/eval/remote_nvidia.md
+++ b/docs/source/advanced_apis/eval/remote_nvidia.md
@ -0,0 +1,19 @@
+# remote::nvidia
+
+## Description
+
+NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
+
+## Sample Configuration
+
+```yaml
+evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
+
+```
+
--- a/docs/source/advanced_apis/evaluation_concepts.md
+++ b/docs/source/advanced_apis/evaluation_concepts.md
--- a/docs/source/advanced_apis/index.md
+++ b/docs/source/advanced_apis/index.md
@ -0,0 +1,33 @@
+# Advanced APIs
+
+## Post-training
+Fine-tunes a model.
+
+```{toctree}
+:maxdepth: 1
+
+post_training/index
+```
+
+## Eval
+Generates outputs (via Inference or Agents) and performs scoring.
+
+```{toctree}
+:maxdepth: 1
+
+eval/index
+```
+
+```{include} evaluation_concepts.md
+:start-after: ## Evaluation Concepts
+```
+
+## Scoring
+Evaluates the outputs of the system.
+
+```{toctree}
+:maxdepth: 1
+
+scoring/index
+```
+
--- a/docs/source/advanced_apis/post_training/huggingface.md
+++ b/docs/source/advanced_apis/post_training/huggingface.md
--- a/docs/source/advanced_apis/post_training/index.md
+++ b/docs/source/advanced_apis/post_training/index.md
@ -0,0 +1,7 @@
+# Post_Training Providers
+
+This section contains documentation for all available providers for the **post_training** API.
+
+- [inline::huggingface](inline_huggingface.md)
+- [inline::torchtune](inline_torchtune.md)
+- [remote::nvidia](remote_nvidia.md)
--- a/docs/source/advanced_apis/post_training/inline_huggingface.md
+++ b/docs/source/advanced_apis/post_training/inline_huggingface.md
@ -0,0 +1,33 @@
+# inline::huggingface
+
+## Description
+
+HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `device` | `<class 'str'>` | No | cuda |  |
+| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No |  |  |
+| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface |  |
+| `chat_template` | `<class 'str'>` | No |  |  |
+| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} |  |
+| `max_seq_length` | `<class 'int'>` | No | 2048 |  |
+| `gradient_checkpointing` | `<class 'bool'>` | No | False |  |
+| `save_total_limit` | `<class 'int'>` | No | 3 |  |
+| `logging_steps` | `<class 'int'>` | No | 10 |  |
+| `warmup_ratio` | `<class 'float'>` | No | 0.1 |  |
+| `weight_decay` | `<class 'float'>` | No | 0.01 |  |
+| `dataloader_num_workers` | `<class 'int'>` | No | 4 |  |
+| `dataloader_pin_memory` | `<class 'bool'>` | No | True |  |
+
+## Sample Configuration
+
+```yaml
+checkpoint_format: huggingface
+distributed_backend: null
+device: cpu
+
+```
+
--- a/docs/source/advanced_apis/post_training/inline_torchtune.md
+++ b/docs/source/advanced_apis/post_training/inline_torchtune.md
@ -0,0 +1,20 @@
+# inline::torchtune
+
+## Description
+
+TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `torch_seed` | `int \| None` | No |  |  |
+| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta |  |
+
+## Sample Configuration
+
+```yaml
+checkpoint_format: meta
+
+```
+
--- a/docs/source/advanced_apis/post_training/nvidia_nemo.md
+++ b/docs/source/advanced_apis/post_training/nvidia_nemo.md
--- a/docs/source/advanced_apis/post_training/remote_nvidia.md
+++ b/docs/source/advanced_apis/post_training/remote_nvidia.md
@ -0,0 +1,28 @@
+# remote::nvidia
+
+## Description
+
+NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `api_key` | `str \| None` | No |  | The NVIDIA API key. |
+| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
+| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
+| `customizer_url` | `str \| None` | No |  | Base URL for the NeMo Customizer API |
+| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
+| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
+| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
+
+## Sample Configuration
+
+```yaml
+api_key: ${env.NVIDIA_API_KEY:=}
+dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
+project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
+customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
+
+```
+
--- a/docs/source/advanced_apis/post_training/torchtune.md
+++ b/docs/source/advanced_apis/post_training/torchtune.md
--- a/docs/source/advanced_apis/scoring/index.md
+++ b/docs/source/advanced_apis/scoring/index.md
@ -0,0 +1,7 @@
+# Scoring Providers
+
+This section contains documentation for all available providers for the **scoring** API.
+
+- [inline::basic](inline_basic.md)
+- [inline::braintrust](inline_braintrust.md)
+- [inline::llm-as-judge](inline_llm-as-judge.md)
--- a/docs/source/advanced_apis/scoring/inline_basic.md
+++ b/docs/source/advanced_apis/scoring/inline_basic.md
@ -0,0 +1,13 @@
+# inline::basic
+
+## Description
+
+Basic scoring provider for simple evaluation metrics and scoring functions.
+
+## Sample Configuration
+
+```yaml
+{}
+
+```
+
--- a/docs/source/advanced_apis/scoring/inline_braintrust.md
+++ b/docs/source/advanced_apis/scoring/inline_braintrust.md
@ -0,0 +1,19 @@
+# inline::braintrust
+
+## Description
+
+Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `openai_api_key` | `str \| None` | No |  | The OpenAI API Key |
+
+## Sample Configuration
+
+```yaml
+openai_api_key: ${env.OPENAI_API_KEY:=}
+
+```
+
--- a/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
+++ b/docs/source/advanced_apis/scoring/inline_llm-as-judge.md
@ -0,0 +1,13 @@
+# inline::llm-as-judge
+
+## Description
+
+LLM-as-judge scoring provider that uses language models to evaluate and score responses.
+
+## Sample Configuration
+
+```yaml
+{}
+
+```
+
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# Building AI Applications (Examples)
+# AI Application Examples

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@ -27,4 +27,5 @@ tools
 evals
 telemetry
 safety
-```
+playground/index
+```
--- a/docs/source/building_applications/playground/index.md
+++ b/docs/source/building_applications/playground/index.md
@ -1,4 +1,4 @@
-# Llama Stack Playground
+## Llama Stack Playground

 ```{note}
 The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
 - Demo **end-to-end** application code to help users get started to build their own applications
 - Provide an **UI** to help users inspect and understand Llama Stack API providers and resources

-## Key Features
+### Key Features

 #### Playground
 Interactive pages for users to play with and explore Llama Stack API capabilities.
@ -90,7 +90,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
  - Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
  - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.

-## Starting the Llama Stack Playground
+### Starting the Llama Stack Playground

 To start the Llama Stack Playground, run the following commands:

--- a/docs/source/concepts/architecture.md
+++ b/docs/source/concepts/architecture.md
@ -1,31 +1,39 @@
-# Why Llama Stack?
+## Llama Stack architecture

-Building production AI applications today requires solving multiple challenges:
-
-**Infrastructure Complexity**
- Running large language models efficiently requires specialized infrastructure.
- Different deployment scenarios (local development, cloud, edge) need different solutions.
- Moving from development to production often requires significant rework.
-
-**Essential Capabilities**
- Safety guardrails and content filtering are necessary in an enterprise setting.
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
- Nearly any application needs composable multi-step workflows.
- Finally, without monitoring, observability and evaluation, you end up operating in the dark.
-
-**Lack of Flexibility and Choice**
- Directly integrating with multiple providers creates tight coupling.
- Different providers have different APIs and abstractions.
- Changing providers requires significant code changes.
-
-
-### Our Solution: A Universal Stack
+Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.

 ```{image} ../../_static/llama-stack.png
 :alt: Llama Stack
 :width: 400px
 ```

+### Benefits of Llama Stack
+
+#### Current challenges in custom AI applications
+
+Building production AI applications today requires solving multiple challenges:
+
+**Infrastructure Complexity**
+
+- Running large language models efficiently requires specialized infrastructure.
+- Different deployment scenarios (local development, cloud, edge) need different solutions.
+- Moving from development to production often requires significant rework.
+
+**Essential Capabilities**
+
+- Safety guardrails and content filtering are necessary in an enterprise setting.
+- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
+- Nearly any application needs composable multi-step workflows.
+- Without monitoring, observability and evaluation, you end up operating in the dark.
+
+**Lack of Flexibility and Choice**
+
+- Directly integrating with multiple providers creates tight coupling.
+- Different providers have different APIs and abstractions.
+- Changing providers requires significant code changes.
+
+#### Our Solution: A Universal Stack
+
 Llama Stack addresses these challenges through a service-oriented, API-first approach:

 **Develop Anywhere, Deploy Everywhere**
@ -59,4 +67,4 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
 - **Turnkey Solutions**: Easy to deploy built in solutions for popular deployment scenarios


-With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
+With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -2,6 +2,10 @@

 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.

+```{include} architecture.md
+:start-after: ## Llama Stack architecture
+```
+
 ```{include} apis.md
 :start-after: ## APIs
 ```
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
 :start-after: ## API Providers
 ```

-```{include} resources.md
-:start-after: ## Resources
-```
-
 ```{include} distributions.md
 :start-after: ## Distributions
 ```

-```{include} evaluation_concepts.md
-:start-after: ## Evaluation Concepts
+```{include} resources.md
+:start-after: ## Resources
 ```
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -52,7 +52,18 @@ extensions = [
    "sphinxcontrib.redoc",
    "sphinxcontrib.mermaid",
    "sphinxcontrib.video",
+    "sphinx_reredirects"
 ]
+
+redirects = {
+    "providers/post_training/index": "../../advanced_apis/post_training/index.html",
+    "providers/eval/index": "../../advanced_apis/eval/index.html",
+    "providers/scoring/index": "../../advanced_apis/scoring/index.html",
+    "playground/index": "../../building_applications/playground/index.html",
+    "openai/index": "../../providers/index.html#openai-api-compatibility",
+    "introduction/index": "../concepts/index.html#llama-stack-architecture"
+}
+
 myst_enable_extensions = ["colon_fence"]

 html_theme = "sphinx_rtd_theme"
--- a/docs/source/deploying/index.md
+++ b/docs/source/deploying/index.md
@ -0,0 +1,4 @@
+# Deployment Examples
+
+```{include} kubernetes_deployment.md
+```
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@ -1,4 +1,4 @@
-# Kubernetes Deployment Guide
+## Kubernetes Deployment Guide

 Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.

@ -222,10 +222,21 @@ llama-stack-client --endpoint http://localhost:5000 inference chat-completion --

 ## Deploying Llama Stack Server in AWS EKS

-We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster. Once you have an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html), you can run the following script to deploy the Llama Stack server.
+We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster.
+
+Prerequisites:
+- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html).
+- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) and get the client ID and client secret.
+  - Set the `Authorization callback URL` to `http://<your-llama-stack-ui-url>/api/auth/callback/`


+Run the following script to deploy the Llama Stack server:
 ```
+export HF_TOKEN=<your-huggingface-token>
+export GITHUB_CLIENT_ID=<your-github-client-id>
+export GITHUB_CLIENT_SECRET=<your-github-client-secret>
+export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
+
 cd docs/source/distributions/eks
 ./apply.sh
 ```
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -6,14 +6,9 @@ This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
 :maxdepth: 3
-
+list_of_distributions
+building_distro
+customizing_run_yaml
 importing_as_library
 configuration
-customizing_run_yaml
-list_of_distributions
-kubernetes_deployment
-building_distro
-on_device_distro
-remote_hosted_distro
-self_hosted_distro
 ```
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -21,6 +21,24 @@ else
  exit 1
 fi

+if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
+  echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+  exit 1
+fi
+
+
+
+
 set -euo pipefail
 set -x

--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@ -122,6 +122,9 @@ data:
      provider_id: rag-runtime
    server:
      port: 8321
+      auth:
+        provider_config:
+          type: github_token
 kind: ConfigMap
 metadata:
  creationTimestamp: null
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -27,7 +27,7 @@ spec:
    spec:
      containers:
      - name: llama-stack
-        image: llamastack/distribution-remote-vllm:latest
+        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@ -119,3 +119,6 @@ tool_groups:
  provider_id: rag-runtime
 server:
  port: 8321
+  auth:
+    provider_config:
+      type: github_token
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@ -26,6 +26,12 @@ spec:
          value: "http://llama-stack-service:8321"
        - name: LLAMA_STACK_UI_PORT
          value: "8322"
+        - name: GITHUB_CLIENT_ID
+          value: "${GITHUB_CLIENT_ID}"
+        - name: GITHUB_CLIENT_SECRET
+          value: "${GITHUB_CLIENT_SECRET}"
+        - name: NEXTAUTH_URL
+          value: "${LLAMA_STACK_UI_URL}:8322"
        args:
          - -c
          - |
--- a/docs/source/distributions/self_hosted_distro/starter.md
+++ b/docs/source/distributions/self_hosted_distro/starter.md
@ -167,7 +167,7 @@ When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`),

 ## Running the Distribution

-You can run the starter distribution via Docker or Conda.
+You can run the starter distribution via Docker, Conda, or venv.

 ### Via Docker

@ -186,17 +186,12 @@ docker run \
  --port $LLAMA_STACK_PORT
 ```

-### Via Conda
+### Via Conda or venv

-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
+Ensure you have configured the starter distribution using the environment variables explained above.

 ```bash
-llama stack build --template starter --image-type conda
-llama stack run distributions/starter/run.yaml \
-  --port 8321 \
-  --env OPENAI_API_KEY=your_openai_key \
-  --env FIREWORKS_API_KEY=your_fireworks_key \
-  --env TOGETHER_API_KEY=your_together_key
+uv run --with llama-stack llama stack build --template starter --image-type <conda|venv> --run
 ```

 ## Example Usage
--- a/docs/source/distributions/starting_llama_stack_server.md
+++ b/docs/source/distributions/starting_llama_stack_server.md
@ -28,5 +28,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste

 importing_as_library
 configuration
-kubernetes_deployment
 ```
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@ -1,4 +1,4 @@
-# Detailed Tutorial
+## Detailed Tutorial

 In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
@ -10,7 +10,7 @@ Llama Stack is a stateful service with REST APIs to support seamless transition
 In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
 as the inference [provider](../providers/index.md#inference) for a Llama Model.

-## Step 1: Installation and Setup
+### Step 1: Installation and Setup

 Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
 download Llama 3.2 3B model, and then start the Ollama service.
@ -45,7 +45,7 @@ Setup your virtual environment.
 uv sync --python 3.12
 source .venv/bin/activate
 ```
-## Step 2:  Run Llama Stack
+### Step 2:  Run Llama Stack
 Llama Stack is a server that exposes multiple APIs, you connect with it using the Llama Stack client SDK.

 ::::{tab-set}
@ -132,7 +132,7 @@ Now you can use the Llama Stack client to run inference and build agents!
 You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
 Note that the client package is already included in the `llama-stack` package.

-## Step 3: Run Client CLI
+### Step 3: Run Client CLI

 Open a new terminal and navigate to the same directory you started the server from. Then set up a new or activate your
 existing server virtual environment.
@ -232,7 +232,7 @@ OpenAIChatCompletion(
 )
 ```

-## Step 4: Run the Demos
+### Step 4: Run the Demos

 Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
 Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
@ -242,7 +242,7 @@ Other SDKs are also available, please refer to the [Client SDK](../index.md#clie
 :::{tab-item} Basic Inference
 Now you can run inference using the Llama Stack client SDK.

-### i. Create the Script
+#### i. Create the Script

 Create a file `inference.py` and add the following code:
 ```python
@ -269,7 +269,7 @@ response = client.chat.completions.create(
 print(response)
 ```

-### ii. Run the Script
+#### ii. Run the Script
 Let's run the script using `uv`
 ```bash
 uv run python inference.py
@ -283,7 +283,7 @@ OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices

 :::{tab-item} Build a Simple Agent
 Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
-### i. Create the Script
+#### i. Create the Script
 Create a file `agent.py` and add the following code:

 ```python
@ -455,7 +455,7 @@ uv run python agent.py

 For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
 in a vector database.
-### i. Create the Script
+#### i. Create the Script
 Create a file `rag_agent.py` and add the following code:

 ```python
@ -533,7 +533,7 @@ for t in turns:
    for event in AgentEventLogger().log(stream):
        event.print()
 ```
-### ii. Run the Script
+#### ii. Run the Script
 Let's run the script using `uv`
 ```bash
 uv run python rag_agent.py
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -1,123 +1,13 @@
-# Quickstart
+# Getting Started

-Get started with Llama Stack in minutes!
-
-Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
-environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
-
-In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
-as the inference [provider](../providers/inference/index) for a Llama Model.
-
-**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
-
-#### Step 1: Install and setup
-1. Install [uv](https://docs.astral.sh/uv/)
-2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
-```bash
-ollama run llama3.2:3b --keepalive 60m
+```{include} quickstart.md
+:start-after: ## Quickstart
 ```
-#### Step 2: Run the Llama Stack server
-We will use `uv` to run the Llama Stack server.
-```bash
-INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
+
+```{include} libraries.md
+:start-after: ## Libraries (SDKs)
 ```
-#### Step 3: Run the demo
-Now open up a new terminal and copy the following script into a file named `demo_script.py`.

-```python
-from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
-
-vector_db_id = "my_demo_vector_db"
-client = LlamaStackClient(base_url="http://localhost:8321")
-
-models = client.models.list()
-
-# Select the first LLM and first embedding models
-model_id = next(m for m in models if m.model_type == "llm").identifier
-embedding_model_id = (
-    em := next(m for m in models if m.model_type == "embedding")
-).identifier
-embedding_dimension = em.metadata["embedding_dimension"]
-
-_ = client.vector_dbs.register(
-    vector_db_id=vector_db_id,
-    embedding_model=embedding_model_id,
-    embedding_dimension=embedding_dimension,
-    provider_id="faiss",
-)
-source = "https://www.paulgraham.com/greatwork.html"
-print("rag_tool> Ingesting document:", source)
-document = RAGDocument(
-    document_id="document_1",
-    content=source,
-    mime_type="text/html",
-    metadata={},
-)
-client.tool_runtime.rag_tool.insert(
-    documents=[document],
-    vector_db_id=vector_db_id,
-    chunk_size_in_tokens=50,
-)
-agent = Agent(
-    client,
-    model=model_id,
-    instructions="You are a helpful assistant",
-    tools=[
-        {
-            "name": "builtin::rag/knowledge_search",
-            "args": {"vector_db_ids": [vector_db_id]},
-        }
-    ],
-)
-
-prompt = "How do you do great work?"
-print("prompt>", prompt)
-
-response = agent.create_turn(
-    messages=[{"role": "user", "content": prompt}],
-    session_id=agent.create_session("rag_session"),
-    stream=True,
-)
-
-for log in AgentEventLogger().log(response):
-    log.print()
+```{include} detailed_tutorial.md
+:start-after: ## Detailed Tutorial
 ```
-We will use `uv` to run the script
-```
-uv run --with llama-stack-client,fire,requests demo_script.py
-```
-And you should see output like below.
-```
-rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
-
-prompt> How do you do great work?
-
-inference> [knowledge_search(query="What is the key to doing great work")]
-
-tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
-
-tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
-
-inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
-
-To further clarify, I would suggest that doing great work involves:
-
-* Completing tasks with high quality and attention to detail
-* Expanding on existing knowledge or ideas
-* Making a positive impact on others through your work
-* Striving for excellence and continuous improvement
-
-Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
-```
-Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
-
-## Next Steps
-
-Now you're ready to dive deeper into Llama Stack!
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
- Learn about Llama Stack [Concepts](../concepts/index.md).
- Discover how to [Build Llama Stacks](../distributions/index.md).
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
--- a/docs/source/getting_started/libraries.md
+++ b/docs/source/getting_started/libraries.md
@ -0,0 +1,10 @@
+## Libraries (SDKs)
+
+We have a number of client-side SDKs available for different languages.
+
+|  **Language** |  **Client SDK** | **Package** |
+| :----: | :----: | :----: |
+| Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
+| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
+| Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
+| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@ -0,0 +1,129 @@
+## Quickstart
+
+Get started with Llama Stack in minutes!
+
+Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
+environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
+
+In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
+as the inference [provider](../providers/inference/index) for a Llama Model.
+
+**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
+
+#### Step 1: Install and setup
+1. Install [uv](https://docs.astral.sh/uv/)
+2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
+```bash
+ollama run llama3.2:3b --keepalive 60m
+```
+#### Step 2: Run the Llama Stack server
+We will use `uv` to run the Llama Stack server.
+```bash
+ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
+```
+#### Step 3: Run the demo
+Now open up a new terminal and copy the following script into a file named `demo_script.py`.
+
+```python
+from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
+
+vector_db_id = "my_demo_vector_db"
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+models = client.models.list()
+
+# Select the first LLM and first embedding models
+model_id = next(m for m in models if m.model_type == "llm").identifier
+embedding_model_id = (
+    em := next(m for m in models if m.model_type == "embedding")
+).identifier
+embedding_dimension = em.metadata["embedding_dimension"]
+
+_ = client.vector_dbs.register(
+    vector_db_id=vector_db_id,
+    embedding_model=embedding_model_id,
+    embedding_dimension=embedding_dimension,
+    provider_id="faiss",
+)
+source = "https://www.paulgraham.com/greatwork.html"
+print("rag_tool> Ingesting document:", source)
+document = RAGDocument(
+    document_id="document_1",
+    content=source,
+    mime_type="text/html",
+    metadata={},
+)
+client.tool_runtime.rag_tool.insert(
+    documents=[document],
+    vector_db_id=vector_db_id,
+    chunk_size_in_tokens=50,
+)
+agent = Agent(
+    client,
+    model=model_id,
+    instructions="You are a helpful assistant",
+    tools=[
+        {
+            "name": "builtin::rag/knowledge_search",
+            "args": {"vector_db_ids": [vector_db_id]},
+        }
+    ],
+)
+
+prompt = "How do you do great work?"
+print("prompt>", prompt)
+
+response = agent.create_turn(
+    messages=[{"role": "user", "content": prompt}],
+    session_id=agent.create_session("rag_session"),
+    stream=True,
+)
+
+for log in AgentEventLogger().log(response):
+    log.print()
+```
+We will use `uv` to run the script:
+```bash
+uv run --with llama-stack-client,fire,requests demo_script.py
+```
+And you should see output like below.
+```
+rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
+
+prompt> How do you do great work?
+
+inference> [knowledge_search(query="What is the key to doing great work")]
+
+tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
+
+tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent:  work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
+
+inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
+
+To further clarify, I would suggest that doing great work involves:
+
+* Completing tasks with high quality and attention to detail
+* Expanding on existing knowledge or ideas
+* Making a positive impact on others through your work
+* Striving for excellence and continuous improvement
+
+Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
+```
+Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
+
+```{admonition} HuggingFace access
+:class: tip
+
+If you are getting a **401 Client Error** from HuggingFace for the **all-MiniLM-L6-v2** model, try setting **HF_TOKEN** to a valid HuggingFace token in your environment.
+```
+
+### Next Steps
+
+Now you're ready to dive deeper into Llama Stack!
+- Explore the [Detailed Tutorial](./detailed_tutorial.md).
+- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
+- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
+- Learn about Llama Stack [Concepts](../concepts/index.md).
+- Discover how to [Build Llama Stacks](../distributions/index.md).
+- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
+- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -40,17 +40,6 @@ Kotlin.
 - Ready to build? Check out the [Quick Start](getting_started/index) to get started.
 - Want to contribute? See the [Contributing](contributing/index) guide.

-## Client SDKs
-
-We have a number of client-side SDKs available for different languages.
-
-|  **Language** |  **Client SDK** | **Package** |
-| :----: | :----: | :----: |
-| Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
-| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
-| Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
-| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
-
 ## Supported Llama Stack Implementations

 A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
@ -133,14 +122,12 @@ A number of "adapters" are available for some popular Inference and Vector Store

 self
 getting_started/index
-getting_started/detailed_tutorial
-introduction/index
 concepts/index
-openai/index
 providers/index
 distributions/index
+advanced_apis/index
 building_applications/index
-playground/index
+deploying/index
 contributing/index
 references/index
 ```
--- a/docs/source/providers/index.md
+++ b/docs/source/providers/index.md
@ -1,4 +1,4 @@
-# Providers Overview
+# API Providers Overview

 The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
 - LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),
@ -13,13 +13,25 @@ Providers come in two flavors:
 Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.

 ## External Providers
-
 Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.

 ```{toctree}
 :maxdepth: 1

-external
+external.md
+```
+
+```{include} openai.md
+:start-after: ## OpenAI API Compatibility
+```
+
+## Inference
+Runs inference with an LLM.
+
+```{toctree}
+:maxdepth: 1
+
+inference/index
 ```

 ## Agents
@ -40,33 +52,6 @@ Interfaces with datasets and data loaders.
 datasetio/index
 ```

-## Eval
-Generates outputs (via Inference or Agents) and perform scoring.
-
-```{toctree}
-:maxdepth: 1
-
-eval/index
-```
-
-## Inference
-Runs inference with an LLM.
-
-```{toctree}
-:maxdepth: 1
-
-inference/index
-```
-
-## Post Training
-Fine-tunes a model.
-
-```{toctree}
-:maxdepth: 1
-
-post_training/index
-```
-
 ## Safety
 Applies safety policies to the output at a Systems (not only model) level.

@ -76,15 +61,6 @@ Applies safety policies to the output at a Systems (not only model) level.
 safety/index
 ```

-## Scoring
-Evaluates the outputs of the system.
-
-```{toctree}
-:maxdepth: 1
-
-scoring/index
-```
-
 ## Telemetry
 Collects telemetry data from the system.

@ -94,15 +70,6 @@ Collects telemetry data from the system.
 telemetry/index
 ```

-## Tool Runtime
-Is associated with the ToolGroup resouces.
-
-```{toctree}
-:maxdepth: 1
-
-tool_runtime/index
-```
-
 ## Vector IO

 Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
@ -114,3 +81,12 @@ io and database are used to store and retrieve documents for retrieval.

 vector_io/index
 ```
+
+## Tool Runtime
+Is associated with the ToolGroup resources.
+
+```{toctree}
+:maxdepth: 1
+
+tool_runtime/index
+```
--- a/docs/source/providers/inference/index.md
+++ b/docs/source/providers/inference/index.md
@ -4,7 +4,6 @@ This section contains documentation for all available providers for the **infere

 - [inline::meta-reference](inline_meta-reference.md)
 - [inline::sentence-transformers](inline_sentence-transformers.md)
- [inline::vllm](inline_vllm.md)
 - [remote::anthropic](remote_anthropic.md)
 - [remote::bedrock](remote_bedrock.md)
 - [remote::cerebras](remote_cerebras.md)
--- a/docs/source/providers/inference/inline_vllm.md
+++ b/docs/source/providers/inference/inline_vllm.md
@ -1,29 +0,0 @@
-# inline::vllm
-
-## Description
-
-vLLM inference provider for high-performance model serving with PagedAttention and continuous batching.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `tensor_parallel_size` | `<class 'int'>` | No | 1 | Number of tensor parallel replicas (number of GPUs to use). |
-| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
-| `max_model_len` | `<class 'int'>` | No | 4096 | Maximum context length to use during serving. |
-| `max_num_seqs` | `<class 'int'>` | No | 4 | Maximum parallel batch size for generation. |
-| `enforce_eager` | `<class 'bool'>` | No | False | Whether to use eager mode for inference (otherwise cuda graphs are used). |
-| `gpu_memory_utilization` | `<class 'float'>` | No | 0.3 | How much GPU memory will be allocated when this provider has finished loading, including memory that was already allocated before loading. |
-
-## Sample Configuration
-
-```yaml
-tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:=1}
-max_tokens: ${env.MAX_TOKENS:=4096}
-max_model_len: ${env.MAX_MODEL_LEN:=4096}
-max_num_seqs: ${env.MAX_NUM_SEQS:=4}
-enforce_eager: ${env.ENFORCE_EAGER:=False}
-gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:=0.3}
-
-```
-
--- a/docs/source/providers/inference/remote_ollama.md
+++ b/docs/source/providers/inference/remote_ollama.md
@ -9,6 +9,8 @@ Ollama inference provider for running local models through the Ollama runtime.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | http://localhost:11434 |  |
+| `refresh_models` | `<class 'bool'>` | No | False | refresh and re-register models periodically |
+| `refresh_models_interval` | `<class 'int'>` | No | 300 | interval in seconds to refresh models |

 ## Sample Configuration

--- a/docs/source/providers/inference/remote_vllm.md
+++ b/docs/source/providers/inference/remote_vllm.md
@ -12,11 +12,13 @@ Remote vLLM inference provider for connecting to vLLM servers.
 | `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
 | `api_token` | `str \| None` | No | fake | The API token |
 | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
+| `refresh_models_interval` | `<class 'int'>` | No | 300 | Interval in seconds to refresh models |

 ## Sample Configuration

 ```yaml
-url: ${env.VLLM_URL}
+url: ${env.VLLM_URL:=}
 max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
 api_token: ${env.VLLM_API_TOKEN:=fake}
 tls_verify: ${env.VLLM_TLS_VERIFY:=true}
--- a/docs/source/providers/openai.md
+++ b/docs/source/providers/openai.md
@ -1,14 +1,14 @@
-# OpenAI API Compatibility
+## OpenAI API Compatibility

-## Server path
+### Server path

 Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.

-## Clients
+### Clients

 You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.

-### Llama Stack Client
+#### Llama Stack Client

 When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.

@ -18,7 +18,7 @@ from llama_stack_client import LlamaStackClient
 client = LlamaStackClient(base_url="http://localhost:8321")
 ```

-### OpenAI Client
+#### OpenAI Client

 When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.

@ -30,9 +30,9 @@ client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

 Regardless of the client you choose, the following code examples should all work the same.

-## APIs implemented
+### APIs implemented

-### Models
+#### Models

 Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:

@ -40,13 +40,13 @@ Many of the APIs require you to pass in a model parameter. To see the list of mo
 models = client.models.list()
 ```

-### Responses
+#### Responses

 :::{note}
 The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
 :::

-#### Simple inference
+##### Simple inference

 Request:

@ -66,7 +66,7 @@ Syntax whispers secrets sweet
 Code's gentle silence
 ```

-#### Structured Output
+##### Structured Output

 Request:

@ -106,9 +106,9 @@ Example output:
 { "participants": ["Alice", "Bob"] }
 ```

-### Chat Completions
+#### Chat Completions

-#### Simple inference
+##### Simple inference

 Request:

@ -129,7 +129,7 @@ Logic flows like a river
 Code's gentle beauty
 ```

-#### Structured Output
+##### Structured Output

 Request:

@ -170,9 +170,9 @@ Example output:
 { "participants": ["Alice", "Bob"] }
 ```

-### Completions
+#### Completions

-#### Simple inference
+##### Simple inference

 Request:

--- a/docs/source/providers/vector_io/remote_milvus.md
+++ b/docs/source/providers/vector_io/remote_milvus.md
@ -114,7 +114,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 | `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
 | `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
 | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No |  | Config for KV store backend (SQLite only for now) |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
 | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

 > **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
@ -124,6 +124,9 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 ```yaml
 uri: ${env.MILVUS_ENDPOINT}
 token: ${env.MILVUS_TOKEN}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db

 ```

--- a/docs/source/providers/vector_io/remote_pgvector.md
+++ b/docs/source/providers/vector_io/remote_pgvector.md
@ -40,6 +40,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
 | `db` | `str \| None` | No | postgres |  |
 | `user` | `str \| None` | No | postgres |  |
 | `password` | `str \| None` | No | mysecretpassword |  |
+| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration

@ -49,6 +50,9 @@ port: ${env.PGVECTOR_PORT:=5432}
 db: ${env.PGVECTOR_DB}
 user: ${env.PGVECTOR_USER}
 password: ${env.PGVECTOR_PASSWORD}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db

 ```

--- a/docs/source/providers/vector_io/remote_weaviate.md
+++ b/docs/source/providers/vector_io/remote_weaviate.md
@ -36,7 +36,9 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
 ## Sample Configuration

 ```yaml
-{}
+kvstore:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db

 ```

--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@ -9,7 +9,8 @@ The `llama-stack-client` CLI allows you to query information about the distribut
 llama-stack-client
 Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...

-  Welcome to the LlamaStackClient CLI
+  Welcome to the llama-stack-client CLI - a command-line interface for
+  interacting with Llama Stack

 Options:
  --version        Show the version and exit.
@ -35,6 +36,7 @@ Commands:
 ```

 ### `llama-stack-client configure`
+Configure Llama Stack Client CLI.
 ```bash
 llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
@ -42,7 +44,24 @@ llama-stack-client configure
 Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```

+Optional arguments:
+- `--endpoint`: Llama Stack distribution endpoint
+- `--api-key`: Llama Stack distribution API key
+
+
+
+### `llama-stack-client inspect version`
+Inspect server configuration.
+```bash
+llama-stack-client inspect version
+```
+```bash
+VersionInfo(version='0.2.14')
+```
+
+
 ### `llama-stack-client providers list`
+Show available providers on distribution endpoint
 ```bash
 llama-stack-client providers list
 ```
@ -66,9 +85,74 @@ llama-stack-client providers list
 +-----------+----------------+-----------------+
 ```

+### `llama-stack-client providers inspect`
+Show specific provider configuration on distribution endpoint
+```bash
+llama-stack-client providers inspect <provider_id>
+```
+
+
+## Inference
+Inference (chat).
+
+
+### `llama-stack-client inference chat-completion`
+Show available inference chat completion endpoints on distribution endpoint
+```bash
+llama-stack-client inference chat-completion --message <message> [--stream] [--session] [--model-id]
+```
+```bash
+OpenAIChatCompletion(
+    id='chatcmpl-aacd11f3-8899-4ec5-ac5b-e655132f6891',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='The captain of the whaleship Pequod in Nathaniel Hawthorne\'s novel "Moby-Dick" is Captain
+Ahab. He\'s a vengeful and obsessive old sailor who\'s determined to hunt down and kill the white sperm whale
+Moby-Dick, whom he\'s lost his leg to in a previous encounter.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1752578797,
+    model='llama3.2:3b-instruct-fp16',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 67,
+        'prompt_tokens': 33,
+        'total_tokens': 100,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    }
+)
+```
+
+Required arguments:
+**Note:** At least one of these parameters is required for chat completion
+- `--message`: Message
+- `--session`: Start a Chat Session
+
+Optional arguments:
+- `--stream`: Stream
+- `--model-id`: Model ID
+
 ## Model Management
+Manage GenAI models.
+

 ### `llama-stack-client models list`
+Show available llama models at distribution endpoint
 ```bash
 llama-stack-client models list
 ```
@ -85,6 +169,7 @@ Total models: 1
 ```

 ### `llama-stack-client models get`
+Show details of a specific model at the distribution endpoint
 ```bash
 llama-stack-client models get Llama3.1-8B-Instruct
 ```
@ -105,69 +190,92 @@ Model RandomModel is not found at distribution endpoint host:port. Please ensure
 ```

 ### `llama-stack-client models register`
-
+Register a new model at distribution endpoint
 ```bash
-llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
+llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>] [--model-type <model_type>]
 ```

-### `llama-stack-client models update`
+Required arguments:
+- `MODEL_ID`: Model ID
+- `--provider-id`: Provider ID for the model

+Optional arguments:
+- `--provider-model-id`: Provider's model ID
+- `--metadata`: JSON metadata for the model
+- `--model-type`: Model type: `llm`, `embedding`
+
+
+### `llama-stack-client models unregister`
+Unregister a model from distribution endpoint
 ```bash
-llama-stack-client models update <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
-```
-
-### `llama-stack-client models delete`
-
-```bash
-llama-stack-client models delete <model_id>
+llama-stack-client models unregister <model_id>
 ```

 ## Vector DB Management
+Manage vector databases.
+

 ### `llama-stack-client vector_dbs list`
+Show available vector dbs on distribution endpoint
 ```bash
 llama-stack-client vector_dbs list
 ```
 ```
-+--------------+----------------+---------------------+---------------+------------------------+
-| identifier   | provider_id    | provider_resource_id| vector_db_type| params                |
-+==============+================+=====================+===============+========================+
-| test_bank    | meta-reference | test_bank          | vector        | embedding_model: all-MiniLM-L6-v2
-                                                                      embedding_dimension: 384|
-+--------------+----------------+---------------------+---------------+------------------------+
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ identifier               ┃ provider_id ┃ provider_resource_id     ┃ vector_db_type ┃ params                            ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ my_demo_vector_db        │ faiss       │ my_demo_vector_db        │                │ embedding_dimension: 384          │
+│                          │             │                          │                │ embedding_model: all-MiniLM-L6-v2 │
+│                          │             │                          │                │ type: vector_db                   │
+│                          │             │                          │                │                                   │
+└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
 ```

 ### `llama-stack-client vector_dbs register`
+Create a new vector db
 ```bash
 llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
 ```

+
+Required arguments:
+- `VECTOR_DB_ID`: Vector DB ID
+
 Optional arguments:
 - `--provider-id`: Provider ID for the vector db
 - `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
+- `--embedding-model`: Embedding model to use. Default: `all-MiniLM-L6-v2`
 - `--embedding-dimension`: Dimension of embeddings. Default: 384

 ### `llama-stack-client vector_dbs unregister`
+Delete a vector db
 ```bash
 llama-stack-client vector_dbs unregister <vector-db-id>
 ```

+
+Required arguments:
+- `VECTOR_DB_ID`: Vector DB ID
+
+
 ## Shield Management
+Manage safety shield services.
 ### `llama-stack-client shields list`
+Show available safety shields on distribution endpoint
 ```bash
 llama-stack-client shields list
 ```

 ```
-+--------------+----------+----------------+-------------+
-| identifier   | params   | provider_id    | type        |
-+==============+==========+================+=============+
-| llama_guard  | {}       | meta-reference | llama_guard |
-+--------------+----------+----------------+-------------+
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ identifier                       ┃ provider_alias                                                        ┃ params                ┃ provider_id                        ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ ollama                           │ ollama/llama-guard3:1b                                                │                       │ llama-guard                        │
+└──────────────────────────────────┴───────────────────────────────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────┘
 ```

 ### `llama-stack-client shields register`
+Register a new safety shield
 ```bash
 llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
 ```
@ -180,41 +288,29 @@ Optional arguments:
 - `--provider-shield-id`: Provider's shield ID
 - `--params`: JSON configuration parameters for the shield

-## Eval Task Management
-
-### `llama-stack-client benchmarks list`
-```bash
-llama-stack-client benchmarks list
-```
-
-### `llama-stack-client benchmarks register`
-```bash
-llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
-```
-
-Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: One or more scoring functions to use for evaluation
-
-Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
- `--metadata`: Metadata for the eval task in JSON format

 ## Eval execution
+Run evaluation tasks.
+
+
 ### `llama-stack-client eval run-benchmark`
+Run an evaluation benchmark task
 ```bash
-llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> --model-id <model-id> [--num-examples <num>] [--visualize] [--repeat-penalty <repeat-penalty>] [--top-p <top-p>] [--max-tokens <max-tokens>] [--temperature <temperature>]
 ```

 Required arguments:
 - `--eval-task-config`: Path to the eval task config file in JSON format
 - `--output-dir`: Path to the directory where evaluation results will be saved
+- `--model-id`: model id to run the benchmark eval on

 Optional arguments:
 - `--num-examples`: Number of examples to evaluate (useful for debugging)
 - `--visualize`: If set, visualizes evaluation results after completion
+- `--repeat-penalty`: repeat-penalty in the sampling params to run generation
+- `--top-p`: top-p in the sampling params to run generation
+- `--max-tokens`: max-tokens in the sampling params to run generation
+- `--temperature`: temperature in the sampling params to run generation

 Example benchmark_config.json:
 ```json
@ -231,21 +327,55 @@ Example benchmark_config.json:
 ```

 ### `llama-stack-client eval run-scoring`
+Run scoring from application datasets
 ```bash
-llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+llama-stack-client eval run-scoring <eval-task-id> --output-dir <output-dir> [--num-examples <num>] [--visualize]
 ```

 Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
 - `--output-dir`: Path to the directory where scoring results will be saved

 Optional arguments:
 - `--num-examples`: Number of examples to evaluate (useful for debugging)
 - `--visualize`: If set, visualizes scoring results after completion
+- `--scoring-params-config`: Path to the scoring params config file in JSON format
+- `--dataset-id`: Pre-registered dataset_id to score (from llama-stack-client datasets list)
+- `--dataset-path`: Path to the dataset file to score
+
+
+## Eval Tasks
+Manage evaluation tasks.
+
+### `llama-stack-client eval_tasks list`
+Show available eval tasks on distribution endpoint
+```bash
+llama-stack-client eval_tasks list
+```
+
+
+### `llama-stack-client eval_tasks register`
+Register a new eval task
+```bash
+llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-functions> [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
+```
+
+
+Required arguments:
+- `--eval-task-id`: ID of the eval task
+- `--dataset-id`: ID of the dataset to evaluate
+- `--scoring-functions`: Scoring functions to use for evaluation
+
+Optional arguments:
+- `--provider-id`: Provider ID for the eval task
+- `--provider-eval-task-id`: Provider's eval task ID
+

 ## Tool Group Management
+Manage available tool groups.
+

 ### `llama-stack-client toolgroups list`
+Show available llama toolgroups at distribution endpoint
 ```bash
 llama-stack-client toolgroups list
 ```
@ -260,17 +390,28 @@ llama-stack-client toolgroups list
 ```

 ### `llama-stack-client toolgroups get`
+Get available llama toolgroups by id
 ```bash
 llama-stack-client toolgroups get <toolgroup_id>
 ```

 Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.

+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
+
 ### `llama-stack-client toolgroups register`
+Register a new toolgroup at distribution endpoint
 ```bash
 llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
 ```

+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
 Optional arguments:
 - `--provider-id`: Provider ID for the toolgroup
 - `--provider-toolgroup-id`: Provider's toolgroup ID
@ -278,6 +419,172 @@ Optional arguments:
 - `--args`: JSON arguments for the toolgroup

 ### `llama-stack-client toolgroups unregister`
+Unregister a toolgroup from distribution endpoint
 ```bash
 llama-stack-client toolgroups unregister <toolgroup_id>
 ```
+
+
+Required arguments:
+- `TOOLGROUP_ID`: ID of the tool group
+
+
+## Datasets Management
+Manage datasets.
+
+
+### `llama-stack-client datasets list`
+Show available datasets on distribution endpoint
+```bash
+llama-stack-client datasets list
+```
+
+
+### `llama-stack-client datasets register`
+```bash
+llama-stack-client datasets register --dataset_id <dataset_id> --purpose <purpose> [--url <url>] [--dataset-path <dataset-path>] [--dataset-id <dataset-id>] [--metadata <metadata>]
+```
+
+Required arguments:
+- `--dataset_id`: Id of the dataset
+- `--purpose`: Purpose of the dataset
+
+Optional arguments:
+- `--metadata`: Metadata of the dataset
+- `--url`: URL of the dataset
+- `--dataset-path`: Local file path to the dataset. If specified, upload dataset via URL
+
+
+### `llama-stack-client datasets unregister`
+Remove a dataset
+```bash
+llama-stack-client datasets unregister <dataset-id>
+```
+
+
+Required arguments:
+- `DATASET_ID`: Id of the dataset
+
+
+## Scoring Functions Management
+Manage scoring functions.
+
+### `llama-stack-client scoring_functions list`
+Show available scoring functions on distribution endpoint
+```bash
+llama-stack-client scoring_functions list
+```
+```
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
+┃ identifier                                 ┃ provider_id  ┃ description                                                   ┃ type             ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
+│ basic::bfcl                                │ basic        │ BFCL complex scoring                                          │ scoring_function │
+│ basic::docvqa                              │ basic        │ DocVQA Visual Question & Answer scoring function              │ scoring_function │
+│ basic::equality                            │ basic        │ Returns 1.0 if the input is equal to the target, 0.0          │ scoring_function │
+│                                            │              │ otherwise.                                                    │                  │
+└────────────────────────────────────────────┴──────────────┴───────────────────────────────────────────────────────────────┴──────────────────┘
+```
+
+
+### `llama-stack-client scoring_functions register`
+Register a new scoring function
+```bash
+llama-stack-client scoring_functions register --scoring-fn-id <scoring-fn-id> --description <description> --return-type <return-type> [--provider-id <provider-id>] [--provider-scoring-fn-id <provider-scoring-fn-id>] [--params <params>]
+```
+
+
+Required arguments:
+- `--scoring-fn-id`: Id of the scoring function
+- `--description`: Description of the scoring function
+- `--return-type`: Return type of the scoring function
+
+Optional arguments:
+- `--provider-id`: Provider ID for the scoring function
+- `--provider-scoring-fn-id`: Provider's scoring function ID
+- `--params`: Parameters for the scoring function in JSON format
+
+
+## Post Training Management
+Manage post-training jobs.
+
+### `llama-stack-client post_training list`
+Show the list of available post training jobs
+```bash
+llama-stack-client post_training list
+```
+```bash
+["job-1", "job-2", "job-3"]
+```
+
+
+### `llama-stack-client post_training artifacts`
+Get the training artifacts of a specific post training job
+```bash
+llama-stack-client post_training artifacts --job-uuid <job-uuid>
+```
+```bash
+JobArtifactsResponse(checkpoints=[], job_uuid='job-1')
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+
+
+### `llama-stack-client post_training supervised_fine_tune`
+Kick off a supervised fine tune job
+```bash
+llama-stack-client post_training supervised_fine_tune --job-uuid <job-uuid> --model <model> --algorithm-config <algorithm-config> --training-config <training-config> [--checkpoint-dir <checkpoint-dir>]
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+- `--model`: Model ID
+- `--algorithm-config`: Algorithm Config
+- `--training-config`: Training Config
+
+Optional arguments:
+- `--checkpoint-dir`: Checkpoint Config
+
+
+### `llama-stack-client post_training status`
+Show the status of a specific post training job
+```bash
+llama-stack-client post_training status --job-uuid <job-uuid>
+```
+```bash
+JobStatusResponse(
+    checkpoints=[],
+    job_uuid='job-1',
+    status='completed',
+    completed_at="",
+    resources_allocated="",
+    scheduled_at="",
+    started_at=""
+)
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
+
+
+### `llama-stack-client post_training cancel`
+Cancel the training job
+```bash
+llama-stack-client post_training cancel --job-uuid <job-uuid>
+```
+```bash
+# This functionality is not yet implemented for llama-stack-client
+╭────────────────────────────────────────────────────────────╮
+│ Failed to post_training cancel_training_job                │
+│                                                            │
+│ Error Type: InternalServerError                            │
+│ Details: Error code: 501 - {'detail': 'Not implemented: '} │
+╰────────────────────────────────────────────────────────────╯
+```
+
+
+Required arguments:
+- `--job-uuid`: Job UUID
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -819,6 +819,12 @@ class OpenAIEmbeddingsResponse(BaseModel):
 class ModelStore(Protocol):
    async def get_model(self, identifier: str) -> Model: ...

+    async def update_registered_llm_models(
+        self,
+        provider_id: str,
+        models: list[Model],
+    ) -> None: ...
+

 class TextTruncation(Enum):
    """Config for how to truncate text for embedding when text is longer than the model's max sequence length. Start and End semantics depend on whether the language is left-to-right or right-to-left.
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@ -7,7 +7,7 @@
 from enum import StrEnum
 from typing import Any, Literal, Protocol, runtime_checkable

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator

 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -36,13 +36,21 @@ class Model(CommonModelFields, Resource):
        return self.identifier

    @property
-    def provider_model_id(self) -> str | None:
+    def provider_model_id(self) -> str:
+        assert self.provider_resource_id is not None, "Provider resource ID must be set"
        return self.provider_resource_id

    model_config = ConfigDict(protected_namespaces=())

    model_type: ModelType = Field(default=ModelType.llm)

+    @field_validator("provider_resource_id")
+    @classmethod
+    def validate_provider_resource_id(cls, v):
+        if v is None:
+            raise ValueError("provider_resource_id cannot be None")
+        return v
+

 class ModelInput(CommonModelFields):
    model_id: str
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@ -104,12 +104,18 @@ class RLHFAlgorithm(Enum):
    dpo = "dpo"


+@json_schema_type
+class DPOLossType(Enum):
+    sigmoid = "sigmoid"
+    hinge = "hinge"
+    ipo = "ipo"
+    kto_pair = "kto_pair"
+
+
@json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: DPOLossType = DPOLossType.sigmoid


@json_schema_type
--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@ -19,6 +19,7 @@ class VectorDB(Resource):

    embedding_model: str
    embedding_dimension: int
+    vector_db_name: str | None = None

    @property
    def vector_db_id(self) -> str:
@ -70,6 +71,7 @@ class VectorDBs(Protocol):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> VectorDB:
        """Register a vector database.
@ -78,6 +80,7 @@ class VectorDBs(Protocol):
        :param embedding_model: The embedding model to use.
        :param embedding_dimension: The dimension of the embedding model.
        :param provider_id: The identifier of the provider.
+        :param vector_db_name: The name of the vector database.
        :param provider_vector_db_id: The identifier of the vector database in the provider.
        :returns: A VectorDB.
        """
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@ -346,7 +346,6 @@ class VectorIO(Protocol):
        embedding_model: str | None = None,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        """Creates a vector store.

@ -358,7 +357,6 @@ class VectorIO(Protocol):
        :param embedding_model: The embedding model to use for this vector store.
        :param embedding_dimension: The dimension of the embedding vectors (default: 384).
        :param provider_id: The ID of the provider to use for this vector store.
-        :param provider_vector_db_id: The provider-specific vector database ID.
        :returns: A VectorStoreObject representing the created vector store.
        """
        ...
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@ -47,8 +47,7 @@ class StackRun(Subcommand):
        self.parser.add_argument(
            "--image-name",
            type=str,
-            default=os.environ.get("CONDA_DEFAULT_ENV"),
-            help="Name of the image to run. Defaults to the current environment",
+            help="Name of the image to run.",
        )
        self.parser.add_argument(
            "--env",
--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@ -17,7 +17,7 @@ from llama_stack.distribution.distribution import (
    builtin_automatically_routed_apis,
    get_provider_registry,
 )
-from llama_stack.distribution.stack import replace_env_vars
+from llama_stack.distribution.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
@ -164,7 +164,8 @@ def upgrade_from_routing_table(
 def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
    version = config_dict.get("version", None)
    if version == LLAMA_STACK_RUN_CONFIG_VERSION:
-        return StackRunConfig(**replace_env_vars(config_dict))
+        processed_config_dict = replace_env_vars(config_dict)
+        return StackRunConfig(**cast_image_name_to_string(processed_config_dict))

    if "routing_table" in config_dict:
        logger.info("Upgrading config...")
@ -175,4 +176,5 @@ def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfi
    if not config_dict.get("external_providers_dir", None):
        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR

-    return StackRunConfig(**replace_env_vars(config_dict))
+    processed_config_dict = replace_env_vars(config_dict)
+    return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@ -12,11 +12,13 @@ import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
 from typing import Any, TypeVar, Union, get_args, get_origin

 import httpx
 import yaml
+from fastapi import Response as FastAPIResponse
 from llama_stack_client import (
    NOT_GIVEN,
    APIResponse,
@ -112,6 +114,27 @@ def convert_to_pydantic(annotation: Any, value: Any) -> Any:
        raise ValueError(f"Failed to convert parameter {value} into {annotation}: {e}") from e


+class LibraryClientUploadFile:
+    """LibraryClient UploadFile object that mimics FastAPI's UploadFile interface."""
+
+    def __init__(self, filename: str, content: bytes):
+        self.filename = filename
+        self.content = content
+        self.content_type = "application/octet-stream"
+
+    async def read(self) -> bytes:
+        return self.content
+
+
+class LibraryClientHttpxResponse:
+    """LibraryClient httpx Response object for FastAPI Response conversion."""
+
+    def __init__(self, response):
+        self.content = response.body if isinstance(response.body, bytes) else response.body.encode()
+        self.status_code = response.status_code
+        self.headers = response.headers
+
+
 class LlamaStackAsLibraryClient(LlamaStackClient):
    def __init__(
        self,
@ -128,6 +151,8 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
        self.skip_logger_removal = skip_logger_removal
        self.provider_data = provider_data

+        self.loop = asyncio.new_event_loop()
+
    def initialize(self):
        if in_notebook():
            import nest_asyncio
@ -136,7 +161,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            if not self.skip_logger_removal:
                self._remove_root_logger_handlers()

-        return asyncio.run(self.async_client.initialize())
+        return self.loop.run_until_complete(self.async_client.initialize())

    def _remove_root_logger_handlers(self):
        """
@ -149,10 +174,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

    def request(self, *args, **kwargs):
-        # NOTE: We are using AsyncLlamaStackClient under the hood
-        # A new event loop is needed to convert the AsyncStream
-        # from async client into SyncStream return type for streaming
-        loop = asyncio.new_event_loop()
+        loop = self.loop
        asyncio.set_event_loop(loop)

        if kwargs.get("stream"):
@ -169,7 +191,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                    pending = asyncio.all_tasks(loop)
                    if pending:
                        loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                    loop.close()

            return sync_generator()
        else:
@ -179,7 +200,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                pending = asyncio.all_tasks(loop)
                if pending:
                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                loop.close()
            return result


@ -295,6 +315,31 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                )
            return response

+    def _handle_file_uploads(self, options: Any, body: dict) -> tuple[dict, list[str]]:
+        """Handle file uploads from OpenAI client and add them to the request body."""
+        if not (hasattr(options, "files") and options.files):
+            return body, []
+
+        if not isinstance(options.files, list):
+            return body, []
+
+        field_names = []
+        for file_tuple in options.files:
+            if not (isinstance(file_tuple, tuple) and len(file_tuple) >= 2):
+                continue
+
+            field_name = file_tuple[0]
+            file_object = file_tuple[1]
+
+            if isinstance(file_object, BytesIO):
+                file_object.seek(0)
+                file_content = file_object.read()
+                filename = getattr(file_object, "name", "uploaded_file")
+                field_names.append(field_name)
+                body[field_name] = LibraryClientUploadFile(filename, file_content)
+
+        return body, field_names
+
    async def _call_non_streaming(
        self,
        *,
@ -310,15 +355,23 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        matched_func, path_params, route = find_matching_route(options.method, path, self.route_impls)
        body |= path_params
-        body = self._convert_body(path, options.method, body)
+
+        body, field_names = self._handle_file_uploads(options, body)
+
+        body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
        await start_trace(route, {"__location__": "library_client"})
        try:
            result = await matched_func(**body)
        finally:
            await end_trace()

+        # Handle FastAPI Response objects (e.g., from file content retrieval)
+        if isinstance(result, FastAPIResponse):
+            return LibraryClientHttpxResponse(result)
+
        json_content = json.dumps(convert_pydantic_to_json_value(result))

+        filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)}
        mock_response = httpx.Response(
            status_code=httpx.codes.OK,
            content=json_content.encode("utf-8"),
@ -330,7 +383,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                url=options.url,
                params=options.params,
                headers=options.headers or {},
-                json=convert_pydantic_to_json_value(body),
+                json=convert_pydantic_to_json_value(filtered_body),
            ),
        )
        response = APIResponse(
@ -404,13 +457,17 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        )
        return await response.parse()

-    def _convert_body(self, path: str, method: str, body: dict | None = None) -> dict:
+    def _convert_body(
+        self, path: str, method: str, body: dict | None = None, exclude_params: set[str] | None = None
+    ) -> dict:
        if not body:
            return {}

        if self.route_impls is None:
            raise ValueError("Client not initialized")

+        exclude_params = exclude_params or set()
+
        func, _, _ = find_matching_route(method, path, self.route_impls)
        sig = inspect.signature(func)

@ -422,6 +479,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        for param_name, param in sig.parameters.items():
            if param_name in body:
                value = body.get(param_name)
-                converted_body[param_name] = convert_to_pydantic(param.annotation, value)
+                if param_name in exclude_params:
+                    converted_body[param_name] = value
+                else:
+                    converted_body[param_name] = convert_to_pydantic(param.annotation, value)

        return converted_body
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -200,7 +200,7 @@ def validate_and_prepare_providers(
        specs = {}
        for provider in providers:
            if not provider.provider_id or provider.provider_id == "__disabled__":
-                logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
+                logger.debug(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
                continue

            validate_provider(provider, api, provider_registry)
--- a/llama_stack/distribution/routers/vector_io.py
+++ b/llama_stack/distribution/routers/vector_io.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import asyncio
+import uuid
 from typing import Any

 from llama_stack.apis.common.content_types import (
@ -81,6 +82,7 @@ class VectorIORouter(VectorIO):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> None:
        logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
@ -89,6 +91,7 @@ class VectorIORouter(VectorIO):
            embedding_model,
            embedding_dimension,
            provider_id,
+            vector_db_name,
            provider_vector_db_id,
        )

@ -123,7 +126,6 @@ class VectorIORouter(VectorIO):
        embedding_model: str | None = None,
        embedding_dimension: int | None = None,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_create_vector_store: name={name}, provider_id={provider_id}")

@ -135,17 +137,17 @@ class VectorIORouter(VectorIO):
            embedding_model, embedding_dimension = embedding_model_info
            logger.info(f"No embedding model specified, using first available: {embedding_model}")

-        vector_db_id = name
+        vector_db_id = f"vs_{uuid.uuid4()}"
        registered_vector_db = await self.routing_table.register_vector_db(
-            vector_db_id,
-            embedding_model,
-            embedding_dimension,
-            provider_id,
-            provider_vector_db_id,
+            vector_db_id=vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=embedding_dimension,
+            provider_id=provider_id,
+            provider_vector_db_id=vector_db_id,
+            vector_db_name=name,
        )
-
        return await self.routing_table.get_provider_impl(registered_vector_db.identifier).openai_create_vector_store(
-            vector_db_id,
+            name=name,
            file_ids=file_ids,
            expires_after=expires_after,
            chunking_strategy=chunking_strategy,
--- a/llama_stack/distribution/routing_tables/models.py
+++ b/llama_stack/distribution/routing_tables/models.py
@ -80,3 +80,38 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
        if existing_model is None:
            raise ValueError(f"Model {model_id} not found")
        await self.unregister_object(existing_model)
+
+    async def update_registered_llm_models(
+        self,
+        provider_id: str,
+        models: list[Model],
+    ) -> None:
+        existing_models = await self.get_all_with_type("model")
+
+        # we may have an alias for the model registered by the user (or during initialization
+        # from run.yaml) that we need to keep track of
+        model_ids = {}
+        for model in existing_models:
+            # we leave embeddings models alone because often we don't get metadata
+            # (embedding dimension, etc.) from the provider
+            if model.provider_id == provider_id and model.model_type == ModelType.llm:
+                model_ids[model.provider_resource_id] = model.identifier
+                logger.debug(f"unregistering model {model.identifier}")
+                await self.unregister_object(model)
+
+        for model in models:
+            if model.model_type != ModelType.llm:
+                continue
+            if model.provider_resource_id in model_ids:
+                model.identifier = model_ids[model.provider_resource_id]
+
+            logger.debug(f"registering model {model.identifier} ({model.provider_resource_id})")
+            await self.register_object(
+                ModelWithOwner(
+                    identifier=model.identifier,
+                    provider_resource_id=model.provider_resource_id,
+                    provider_id=provider_id,
+                    metadata=model.metadata,
+                    model_type=model.model_type,
+                )
+            )
--- a/llama_stack/distribution/routing_tables/vector_dbs.py
+++ b/llama_stack/distribution/routing_tables/vector_dbs.py
@ -36,6 +36,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
        provider_vector_db_id: str | None = None,
+        vector_db_name: str | None = None,
    ) -> VectorDB:
        if provider_vector_db_id is None:
            provider_vector_db_id = vector_db_id
@ -62,6 +63,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
            "provider_resource_id": provider_vector_db_id,
            "embedding_model": embedding_model,
            "embedding_dimension": model.metadata["embedding_dimension"],
+            "vector_db_name": vector_db_name,
        }
        vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
        await self.register_object(vector_db)
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -47,6 +47,7 @@ from llama_stack.distribution.server.routes import (
    initialize_route_impls,
 )
 from llama_stack.distribution.stack import (
+    cast_image_name_to_string,
    construct_stack,
    replace_env_vars,
    validate_env_pair,
@ -439,14 +440,12 @@ def main(args: argparse.Namespace | None = None):
                    logger.error(f"Error: {str(e)}")
                    sys.exit(1)
        config = replace_env_vars(config_contents)
-        config = StackRunConfig(**config)
+        config = StackRunConfig(**cast_image_name_to_string(config))

    # now that the logger is initialized, print the line about which type of config we are using.
    logger.info(log_line)

-    logger.info("Run configuration:")
-    safe_config = redact_sensitive_fields(config.model_dump(mode="json"))
-    logger.info(yaml.dump(safe_config, indent=2))
+    _log_run_config(run_config=config)

    app = FastAPI(
        lifespan=lifespan,
@ -454,6 +453,7 @@ def main(args: argparse.Namespace | None = None):
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )
+
    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
        app.add_middleware(ClientVersionMiddleware)

@ -492,7 +492,13 @@ def main(args: argparse.Namespace | None = None):
        )

    try:
-        impls = asyncio.run(construct_stack(config))
+        # Create and set the event loop that will be used for both construction and server runtime
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Construct the stack in the persistent event loop
+        impls = loop.run_until_complete(construct_stack(config))
+
    except InvalidProviderError as e:
        logger.error(f"Error: {str(e)}")
        sys.exit(1)
@ -590,7 +596,16 @@ def main(args: argparse.Namespace | None = None):
    if ssl_config:
        uvicorn_config.update(ssl_config)

-    uvicorn.run(**uvicorn_config)
+    # Run uvicorn in the existing event loop to preserve background tasks
+    loop.run_until_complete(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
+
+
+def _log_run_config(run_config: StackRunConfig):
+    """Log the run configuration as YAML.
+
+    The config is dumped in JSON mode, passed through ``redact_sensitive_fields``
+    and then through ``remove_disabled_providers`` before it is written to the log.
+
+    :param run_config: The stack run configuration to log.
+    """
+    logger.info("Run configuration:")
+    dumped = run_config.model_dump(mode="json")
+    printable = remove_disabled_providers(redact_sensitive_fields(dumped))
+    logger.info(yaml.dump(printable, indent=2))


 def extract_path_params(route: str) -> list[str]:
@ -601,5 +616,20 @@ def extract_path_params(route: str) -> list[str]:
    return params


+def remove_disabled_providers(obj):
+    """Recursively strip entries marked as disabled from a nested structure.
+
+    A dict whose ``provider_id``, ``shield_id`` or ``provider_model_id`` equals
+    ``"__disabled__"`` is replaced by ``None``. ``None`` results are dropped from
+    the enclosing dict or list, so values that were already ``None`` are omitted
+    from the output as well. Anything that is neither a dict nor a list is
+    returned unchanged.
+    """
+    marker_keys = ("provider_id", "shield_id", "provider_model_id")
+
+    if isinstance(obj, list):
+        kept_items = []
+        for element in obj:
+            cleaned = remove_disabled_providers(element)
+            if cleaned is not None:
+                kept_items.append(cleaned)
+        return kept_items
+
+    if not isinstance(obj, dict):
+        return obj
+
+    # The whole dict is discarded as soon as one marker key is disabled.
+    if any(obj.get(key) == "__disabled__" for key in marker_keys):
+        return None
+
+    kept_entries = {}
+    for key, value in obj.items():
+        cleaned = remove_disabled_providers(value)
+        if cleaned is not None:
+            kept_entries[key] = cleaned
+    return kept_entries
+
+
 if __name__ == "__main__":
    main()
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -172,7 +172,6 @@ def replace_env_vars(config: Any, path: str = "") -> Any:
                            # Create a copy with resolved provider_id but original config
                            disabled_provider = v.copy()
                            disabled_provider["provider_id"] = resolved_provider_id
-                            result.append(disabled_provider)
                            continue
                    except EnvVarError:
                        # If we can't resolve the provider_id, continue with normal processing
@ -267,6 +266,13 @@ def _convert_string_to_proper_type(value: str) -> Any:
    return value


+def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Coerce a non-None ``image_name`` entry of ``config_dict`` to ``str``.
+
+    The dict is modified in place and the same object is returned. A missing
+    or ``None`` ``image_name`` is left untouched.
+    """
+    image_name = config_dict.get("image_name")
+    if image_name is not None:
+        config_dict["image_name"] = str(image_name)
+    return config_dict
+
+
 def validate_env_pair(env_pair: str) -> tuple[str, str]:
    """Validate and split an environment variable key-value pair."""
    try:
--- a/llama_stack/models/llama/llama3/chat_format.py
+++ b/llama_stack/models/llama/llama3/chat_format.py
@ -8,6 +8,7 @@ import io
 import json
 import uuid
 from dataclasses import dataclass
+from typing import Any

 from PIL import Image as PIL_Image

@ -184,16 +185,26 @@ class ChatFormat:
            content = content[: -len("<|eom_id|>")]
            stop_reason = StopReason.end_of_message

-        tool_name = None
-        tool_arguments = {}
+        tool_name: str | BuiltinTool | None = None
+        tool_arguments: dict[str, Any] = {}

        custom_tool_info = ToolUtils.maybe_extract_custom_tool_call(content)
        if custom_tool_info is not None:
-            tool_name, tool_arguments = custom_tool_info
+            # Type guard: ensure custom_tool_info is a tuple of correct types
+            if isinstance(custom_tool_info, tuple) and len(custom_tool_info) == 2:
+                extracted_tool_name, extracted_tool_arguments = custom_tool_info
+                # Handle both dict and str return types from the function
+                if isinstance(extracted_tool_arguments, dict):
+                    tool_name, tool_arguments = extracted_tool_name, extracted_tool_arguments
+                else:
+                    # If it's a string, treat it as a query parameter
+                    tool_name, tool_arguments = extracted_tool_name, {"query": extracted_tool_arguments}
+            else:
+                tool_name, tool_arguments = None, {}
            # Sometimes when agent has custom tools alongside builin tools
            # Agent responds for builtin tool calls in the format of the custom tools
            # This code tries to handle that case
-            if tool_name in BuiltinTool.__members__:
+            if tool_name is not None and tool_name in BuiltinTool.__members__:
                tool_name = BuiltinTool[tool_name]
                if isinstance(tool_arguments, dict):
                    tool_arguments = {
--- a/llama_stack/providers/inline/files/localfs/init.py
+++ b/llama_stack/providers/inline/files/localfs/init.py
@ -6,7 +6,7 @@

 from typing import Any

-from llama_stack.distribution.datatypes import Api
+from llama_stack.distribution.datatypes import AccessRule, Api

 from .config import LocalfsFilesImplConfig
 from .files import LocalfsFilesImpl
@ -14,7 +14,7 @@ from .files import LocalfsFilesImpl
 __all__ = ["LocalfsFilesImpl", "LocalfsFilesImplConfig"]


-async def get_provider_impl(config: LocalfsFilesImplConfig, deps: dict[Api, Any]):
-    impl = LocalfsFilesImpl(config)
+async def get_provider_impl(config: LocalfsFilesImplConfig, deps: dict[Api, Any], policy: list[AccessRule]):
+    """Build a ``LocalfsFilesImpl`` from ``config`` and ``policy`` and return it once initialized.
+
+    ``deps`` is accepted but not used in this function.
+    """
+    impl = LocalfsFilesImpl(config, policy)
    await impl.initialize()
    return impl
--- a/llama_stack/providers/inline/files/localfs/files.py
+++ b/llama_stack/providers/inline/files/localfs/files.py
@ -19,16 +19,19 @@ from llama_stack.apis.files import (
    OpenAIFileObject,
    OpenAIFilePurpose,
 )
+from llama_stack.distribution.datatypes import AccessRule
 from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
-from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl
+from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
+from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl

 from .config import LocalfsFilesImplConfig


 class LocalfsFilesImpl(Files):
-    def __init__(self, config: LocalfsFilesImplConfig) -> None:
+    def __init__(self, config: LocalfsFilesImplConfig, policy: list[AccessRule]) -> None:
+        """Store the provider config and the access policy.
+
+        ``sql_store`` starts as ``None``; it is assigned in ``initialize()``.
+        """
        self.config = config
-        self.sql_store: SqlStore | None = None
+        self.policy = policy
+        self.sql_store: AuthorizedSqlStore | None = None

    async def initialize(self) -> None:
        """Initialize the files provider by setting up storage directory and metadata database."""
@ -37,7 +40,7 @@ class LocalfsFilesImpl(Files):
        storage_path.mkdir(parents=True, exist_ok=True)

        # Initialize SQL store for metadata
-        self.sql_store = sqlstore_impl(self.config.metadata_store)
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.config.metadata_store))
        await self.sql_store.create_table(
            "openai_files",
            {
@ -51,6 +54,9 @@ class LocalfsFilesImpl(Files):
            },
        )

+    async def shutdown(self) -> None:
+        """Shutdown hook for the provider; currently does nothing."""
+
    def _generate_file_id(self) -> str:
        """Generate a unique file ID for OpenAI API."""
        return f"file-{uuid.uuid4().hex}"
@ -123,6 +129,7 @@ class LocalfsFilesImpl(Files):

        paginated_result = await self.sql_store.fetch_all(
            table="openai_files",
+            policy=self.policy,
            where=where_conditions if where_conditions else None,
            order_by=[("created_at", order.value)],
            cursor=("id", after) if after else None,
@ -153,7 +160,7 @@ class LocalfsFilesImpl(Files):
        if not self.sql_store:
            raise RuntimeError("Files provider not initialized")

-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
+        row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id})
        if not row:
            raise ValueError(f"File with id {file_id} not found")

@ -171,7 +178,7 @@ class LocalfsFilesImpl(Files):
        if not self.sql_store:
            raise RuntimeError("Files provider not initialized")

-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
+        row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id})
        if not row:
            raise ValueError(f"File with id {file_id} not found")

@ -194,7 +201,7 @@ class LocalfsFilesImpl(Files):
            raise RuntimeError("Files provider not initialized")

        # Get file metadata
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
+        row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id})
        if not row:
            raise ValueError(f"File with id {file_id} not found")

--- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
@ -98,7 +98,7 @@ class ProcessingMessageWrapper(BaseModel):


 def mp_rank_0() -> bool:
-    return get_model_parallel_rank() == 0
+    """Return whether ``get_model_parallel_rank()`` reports rank 0."""
+    # NOTE(review): the bool() wrap presumably exists so the declared ``bool``
+    # return type holds for type checkers — confirm.
+    return bool(get_model_parallel_rank() == 0)


 def encode_msg(msg: ProcessingMessage) -> bytes:
@ -125,7 +125,7 @@ def retrieve_requests(reply_socket_url: str):
        reply_socket.send_multipart([client_id, encode_msg(obj)])

    while True:
-        tasks = [None]
+        tasks: list[ProcessingMessage | None] = [None]
        if mp_rank_0():
            client_id, maybe_task_json = maybe_get_work(reply_socket)
            if maybe_task_json is not None:
@ -152,7 +152,7 @@ def retrieve_requests(reply_socket_url: str):
                    break

                for obj in out:
-                    updates = [None]
+                    updates: list[ProcessingMessage | None] = [None]
                    if mp_rank_0():
                        _, update_json = maybe_get_work(reply_socket)
                        update = maybe_parse_message(update_json)
--- a/llama_stack/providers/inline/inference/vllm/init.py
+++ b/llama_stack/providers/inline/inference/vllm/init.py
@ -1,17 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from .config import VLLMConfig
-
-
-async def get_provider_impl(config: VLLMConfig, _deps: dict[str, Any]):
-    from .vllm import VLLMInferenceImpl
-
-    impl = VLLMInferenceImpl(config)
-    await impl.initialize()
-    return impl
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@ -1,53 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type
-
-
-@json_schema_type
-class VLLMConfig(BaseModel):
-    """Configuration for the vLLM inference provider.
-
-    Note that the model name is no longer part of this static configuration.
-    You can bind an instance of this provider to a specific model with the
-    ``models.register()`` API call."""
-
-    tensor_parallel_size: int = Field(
-        default=1,
-        description="Number of tensor parallel replicas (number of GPUs to use).",
-    )
-    max_tokens: int = Field(
-        default=4096,
-        description="Maximum number of tokens to generate.",
-    )
-    max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
-    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
-    enforce_eager: bool = Field(
-        default=False,
-        description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
-    )
-    gpu_memory_utilization: float = Field(
-        default=0.3,
-        description=(
-            "How much GPU memory will be allocated when this provider has finished "
-            "loading, including memory that was already allocated before loading."
-        ),
-    )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:=1}",
-            "max_tokens": "${env.MAX_TOKENS:=4096}",
-            "max_model_len": "${env.MAX_MODEL_LEN:=4096}",
-            "max_num_seqs": "${env.MAX_NUM_SEQS:=4}",
-            "enforce_eager": "${env.ENFORCE_EAGER:=False}",
-            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:=0.3}",
-        }
--- a/llama_stack/providers/inline/inference/vllm/openai_utils.py
+++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py
@ -1,170 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import vllm
-
-from llama_stack.apis.inference import (
-    ChatCompletionRequest,
-    GrammarResponseFormat,
-    JsonSchemaResponseFormat,
-    Message,
-    ToolChoice,
-    ToolDefinition,
-    UserMessage,
-)
-from llama_stack.models.llama.datatypes import BuiltinTool
-from llama_stack.providers.utils.inference.openai_compat import (
-    convert_message_to_openai_dict,
-    get_sampling_options,
-)
-
-###############################################################################
-# This file contains OpenAI compatibility code that is currently only used
-# by the inline vLLM connector. Some or all of this code may be moved to a
-# central location at a later date.
-
-
-def _merge_context_into_content(message: Message) -> Message:  # type: ignore
-    """
-    Merge the ``context`` field of a Llama Stack ``Message`` object into
-    the content field for compabilitiy with OpenAI-style APIs.
-
-    Generates a content string that emulates the current behavior
-    of ``llama_models.llama3.api.chat_format.encode_message()``.
-
-    :param message: Message that may include ``context`` field
-
-    :returns: A version of ``message`` with any context merged into the
-     ``content`` field.
-    """
-    if not isinstance(message, UserMessage):  # Separate type check for linter
-        return message
-    if message.context is None:
-        return message
-    return UserMessage(
-        role=message.role,
-        # Emumate llama_models.llama3.api.chat_format.encode_message()
-        content=message.content + "\n\n" + message.context,
-        context=None,
-    )
-
-
-def _llama_stack_tools_to_openai_tools(
-    tools: list[ToolDefinition] | None = None,
-) -> list[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]:
-    """
-    Convert the list of available tools from Llama Stack's format to vLLM's
-    version of OpenAI's format.
-    """
-    if tools is None:
-        return []
-
-    result = []
-    for t in tools:
-        if isinstance(t.tool_name, BuiltinTool):
-            raise NotImplementedError("Built-in tools not yet implemented")
-        if t.parameters is None:
-            parameters = None
-        else:  # if t.parameters is not None
-            # Convert the "required" flags to a list of required params
-            required_params = [k for k, v in t.parameters.items() if v.required]
-            parameters = {
-                "type": "object",  # Mystery value that shows up in OpenAI docs
-                "properties": {
-                    k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items()
-                },
-                "required": required_params,
-            }
-
-        function_def = vllm.entrypoints.openai.protocol.FunctionDefinition(
-            name=t.tool_name, description=t.description, parameters=parameters
-        )
-
-        # Every tool definition is double-boxed in a ChatCompletionToolsParam
-        result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def))
-    return result
-
-
-async def llama_stack_chat_completion_to_openai_chat_completion_dict(
-    request: ChatCompletionRequest,
-) -> dict:
-    """
-    Convert a chat completion request in Llama Stack format into an
-    equivalent set of arguments to pass to an OpenAI-compatible
-    chat completions API.
-
-    :param request: Bundled request parameters in Llama Stack format.
-
-    :returns: Dictionary of key-value pairs to use as an initializer
-     for a dataclass or to be converted directly to JSON and sent
-     over the wire.
-    """
-
-    converted_messages = [
-        # This mystery async call makes the parent function also be async
-        await convert_message_to_openai_dict(_merge_context_into_content(m), download=True)
-        for m in request.messages
-    ]
-    converted_tools = _llama_stack_tools_to_openai_tools(request.tools)
-
-    # Llama will try to use built-in tools with no tool catalog, so don't enable
-    # tool choice unless at least one tool is enabled.
-    converted_tool_choice = "none"
-    if (
-        request.tool_config is not None
-        and request.tool_config.tool_choice == ToolChoice.auto
-        and request.tools is not None
-        and len(request.tools) > 0
-    ):
-        converted_tool_choice = "auto"
-
-    # TODO: Figure out what to do with the tool_prompt_format argument.
-    #  Other connectors appear to drop it quietly.
-
-    # Use Llama Stack shared code to translate sampling parameters.
-    sampling_options = get_sampling_options(request.sampling_params)
-
-    # get_sampling_options() translates repetition penalties to an option that
-    # OpenAI's APIs don't know about.
-    # vLLM's OpenAI-compatible API also handles repetition penalties wrong.
-    # For now, translate repetition penalties into a format that vLLM's broken
-    # API will handle correctly. Two wrongs make a right...
-    if "repeat_penalty" in sampling_options:
-        del sampling_options["repeat_penalty"]
-    if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0:
-        sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty
-
-    # Convert a single response format into four different parameters, per
-    # the OpenAI spec
-    guided_decoding_options = dict()
-    if request.response_format is None:
-        # Use defaults
-        pass
-    elif isinstance(request.response_format, JsonSchemaResponseFormat):
-        guided_decoding_options["guided_json"] = request.response_format.json_schema
-    elif isinstance(request.response_format, GrammarResponseFormat):
-        guided_decoding_options["guided_grammar"] = request.response_format.bnf
-    else:
-        raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'")
-
-    logprob_options = dict()
-    if request.logprobs is not None:
-        logprob_options["logprobs"] = request.logprobs.top_k
-
-    # Marshall together all the arguments for a ChatCompletionRequest
-    request_options = {
-        "model": request.model,
-        "messages": converted_messages,
-        "tools": converted_tools,
-        "tool_choice": converted_tool_choice,
-        "stream": request.stream,
-        **sampling_options,
-        **guided_decoding_options,
-        **logprob_options,
-    }
-
-    return request_options
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@ -1,811 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import re
-import uuid
-from collections.abc import AsyncGenerator, AsyncIterator
-
-# These vLLM modules contain names that overlap with Llama Stack names, so we import
-# fully-qualified names
-import vllm.entrypoints.openai.protocol
-import vllm.sampling_params
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
-
-from llama_stack.apis.common.content_types import (
-    InterleavedContent,
-    InterleavedContentItem,
-    TextDelta,
-    ToolCallDelta,
-)
-from llama_stack.apis.inference import (
-    ChatCompletionRequest,
-    ChatCompletionResponse,
-    ChatCompletionResponseEvent,
-    ChatCompletionResponseEventType,
-    ChatCompletionResponseStreamChunk,
-    CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
-    GrammarResponseFormat,
-    Inference,
-    JsonSchemaResponseFormat,
-    LogProbConfig,
-    Message,
-    OpenAIEmbeddingsResponse,
-    ResponseFormat,
-    SamplingParams,
-    TextTruncation,
-    TokenLogProbs,
-    ToolChoice,
-    ToolConfig,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
-)
-from llama_stack.apis.models import Model
-from llama_stack.log import get_logger
-from llama_stack.models.llama import sku_list
-from llama_stack.models.llama.datatypes import (
-    StopReason,
-    ToolCall,
-    ToolDefinition,
-    ToolPromptFormat,
-)
-from llama_stack.models.llama.llama3.chat_format import ChatFormat
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.remote.inference.vllm.vllm import build_hf_repo_model_entries
-from llama_stack.providers.utils.inference.model_registry import (
-    ModelRegistryHelper,
-    ModelsProtocolPrivate,
-)
-from llama_stack.providers.utils.inference.openai_compat import (
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompatCompletionChoice,
-    OpenAICompatCompletionResponse,
-    OpenAICompletionToLlamaStackMixin,
-    get_stop_reason,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-
-from .config import VLLMConfig
-from .openai_utils import llama_stack_chat_completion_to_openai_chat_completion_dict
-
-# Map from Hugging Face model architecture name to appropriate tool parser.
-# See vllm.entrypoints.openai.tool_parsers.ToolParserManager.tool_parsers for the full list of
-# available parsers.
-# TODO: Expand this list
-CONFIG_TYPE_TO_TOOL_PARSER = {
-    "GraniteConfig": "granite",
-    "MllamaConfig": "llama3_json",
-    "LlamaConfig": "llama3_json",
-}
-DEFAULT_TOOL_PARSER = "pythonic"
-
-
-logger = get_logger(__name__, category="inference")
-
-
-def _random_uuid_str() -> str:
-    return str(uuid.uuid4().hex)
-
-
-def _response_format_to_guided_decoding_params(
-    response_format: ResponseFormat | None,  # type: ignore
-) -> vllm.sampling_params.GuidedDecodingParams:
-    """
-    Translate constrained decoding parameters from Llama Stack's format to vLLM's format.
-
-    :param response_format: Llama Stack version of constrained decoding info. Can be ``None``,
-     indicating no constraints.
-    :returns: The equivalent dataclass object for the low-level inference layer of vLLM.
-    """
-    if response_format is None:
-        # As of vLLM 0.6.3, the default constructor for GuidedDecodingParams() returns an invalid
-        # value that crashes the executor on some code paths. Use ``None`` instead.
-        return None
-
-    # Llama Stack currently implements fewer types of constrained decoding than vLLM does.
-    # Translate the types that exist and detect if Llama Stack adds new ones.
-    if isinstance(response_format, JsonSchemaResponseFormat):
-        return vllm.sampling_params.GuidedDecodingParams(json=response_format.json_schema)
-    elif isinstance(response_format, GrammarResponseFormat):
-        # BNF grammar.
-        # Llama Stack uses the parse tree of the grammar, while vLLM uses the string
-        # representation of the grammar.
-        raise TypeError(
-            "Constrained decoding with BNF grammars is not currently implemented, because the "
-            "reference implementation does not implement it."
-        )
-    else:
-        raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(response_format)}'")
-
-
-def _convert_sampling_params(
-    sampling_params: SamplingParams | None,
-    response_format: ResponseFormat | None,  # type: ignore
-    log_prob_config: LogProbConfig | None,
-) -> vllm.SamplingParams:
-    """Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's
-    format."""
-    # In the absence of provided config values, use Llama Stack defaults as encoded in the Llama
-    # Stack dataclasses. These defaults are different from vLLM's defaults.
-    if sampling_params is None:
-        sampling_params = SamplingParams()
-    if log_prob_config is None:
-        log_prob_config = LogProbConfig()
-
-    if isinstance(sampling_params.strategy, TopKSamplingStrategy):
-        if sampling_params.strategy.top_k == 0:
-            # vLLM treats "k" differently for top-k sampling
-            vllm_top_k = -1
-        else:
-            vllm_top_k = sampling_params.strategy.top_k
-    else:
-        vllm_top_k = -1
-
-    if isinstance(sampling_params.strategy, TopPSamplingStrategy):
-        vllm_top_p = sampling_params.strategy.top_p
-        # Llama Stack only allows temperature with top-P.
-        vllm_temperature = sampling_params.strategy.temperature
-    else:
-        vllm_top_p = 1.0
-        vllm_temperature = 0.0
-
-    # vLLM allows top-p and top-k at the same time.
-    vllm_sampling_params = vllm.SamplingParams.from_optional(
-        max_tokens=(None if sampling_params.max_tokens == 0 else sampling_params.max_tokens),
-        temperature=vllm_temperature,
-        top_p=vllm_top_p,
-        top_k=vllm_top_k,
-        repetition_penalty=sampling_params.repetition_penalty,
-        guided_decoding=_response_format_to_guided_decoding_params(response_format),
-        logprobs=log_prob_config.top_k,
-    )
-    return vllm_sampling_params
-
-
-class VLLMInferenceImpl(
-    Inference,
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
-    ModelsProtocolPrivate,
-):
-    """
-    vLLM-based inference model adapter for Llama Stack with support for multiple models.
-
-    Requires the configuration parameters documented in the :class:`VllmConfig2` class.
-    """
-
-    config: VLLMConfig
-    register_helper: ModelRegistryHelper
-    model_ids: set[str]
-    resolved_model_id: str | None
-    engine: AsyncLLMEngine | None
-    chat: OpenAIServingChat | None
-    is_meta_llama_model: bool
-
-    def __init__(self, config: VLLMConfig):
-        self.config = config
-        logger.info(f"Config is: {self.config}")
-
-        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
-        self.formatter = ChatFormat(Tokenizer.get_instance())
-
-        # The following are initialized when paths are bound to this provider
-        self.resolved_model_id = None
-        self.model_ids = set()
-        self.engine = None
-        self.chat = None
-        self.is_meta_llama_model = False
-
-    ###########################################################################
-    # METHODS INHERITED FROM IMPLICIT BASE CLASS.
-    # TODO: Make this class inherit from the new base class ProviderBase once that class exists.
-
-    async def initialize(self) -> None:
-        """
-        Callback that is invoked through many levels of indirection during provider class
-        instantiation, sometime after when __init__() is called and before any model registration
-        methods or methods connected to a REST API are called.
-
-        It's not clear what assumptions the class can make about the platform's initialization
-        state here that can't be made during __init__(), and vLLM can't be started until we know
-        what model it's supposed to be serving, so nothing happens here currently.
-        """
-        pass
-
-    async def shutdown(self) -> None:
-        logger.info(f"Shutting down inline vLLM inference provider {self}.")
-        if self.engine is not None:
-            self.engine.shutdown_background_loop()
-            self.engine = None
-            self.chat = None
-            self.model_ids = set()
-            self.resolved_model_id = None
-
-    ###########################################################################
-    # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE
-
-    # Note that the return type of the superclass method is WRONG
-    async def register_model(self, model: Model) -> Model:
-        """
-        Callback that is called when the server associates an inference endpoint with an
-        inference provider.
-
-        :param model: Object that encapsulates parameters necessary for identifying a specific
-         LLM.
-
-        :returns: The input ``Model`` object. It may or may not be permissible to change fields
-         before returning this object.
-        """
-        logger.debug(f"In register_model({model})")
-
-        # First attempt to interpret the model coordinates as a Llama model name
-        resolved_llama_model = sku_list.resolve_model(model.provider_model_id)
-        if resolved_llama_model is not None:
-            # Load from Hugging Face repo into default local cache dir
-            model_id_for_vllm = resolved_llama_model.huggingface_repo
-
-            # Detect a genuine Meta Llama model to trigger Meta-specific preprocessing.
-            # Don't set self.is_meta_llama_model until we actually load the model.
-            is_meta_llama_model = True
-        else:  # if resolved_llama_model is None
-            # Not a Llama model name. Pass the model id through to vLLM's loader
-            model_id_for_vllm = model.provider_model_id
-            is_meta_llama_model = False
-
-        if self.resolved_model_id is not None:
-            if model_id_for_vllm != self.resolved_model_id:
-                raise ValueError(
-                    f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and "
-                    f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple "
-                    f"copies of the provider instead."
-                )
-            else:
-                # Model already loaded
-                logger.info(
-                    f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing."
-                )
-                self.model_ids.add(model.model_id)
-                return model
-
-        logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.")
-        if is_meta_llama_model:
-            logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.")
-        self.is_meta_llama_model = is_meta_llama_model
-
-        # If we get here, this is the first time registering a model.
-        # Preload so that the first inference request won't time out.
-        engine_args = AsyncEngineArgs(
-            model=model_id_for_vllm,
-            tokenizer=model_id_for_vllm,
-            tensor_parallel_size=self.config.tensor_parallel_size,
-            enforce_eager=self.config.enforce_eager,
-            gpu_memory_utilization=self.config.gpu_memory_utilization,
-            max_num_seqs=self.config.max_num_seqs,
-            max_model_len=self.config.max_model_len,
-        )
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-
-        # vLLM currently requires the user to specify the tool parser manually. To choose a tool
-        # parser, we need to determine what model architecture is being used. For now, we infer
-        # that information from what config class the model uses.
-        low_level_model_config = self.engine.engine.get_model_config()
-        hf_config = low_level_model_config.hf_config
-        hf_config_class_name = hf_config.__class__.__name__
-        if hf_config_class_name in CONFIG_TYPE_TO_TOOL_PARSER:
-            tool_parser = CONFIG_TYPE_TO_TOOL_PARSER[hf_config_class_name]
-        else:
-            # No info -- choose a default so we can at least attempt tool
-            # use.
-            tool_parser = DEFAULT_TOOL_PARSER
-        logger.debug(f"{hf_config_class_name=}")
-        logger.debug(f"{tool_parser=}")
-
-        # Wrap the lower-level engine in an OpenAI-compatible chat API
-        model_config = await self.engine.get_model_config()
-        self.chat = OpenAIServingChat(
-            engine_client=self.engine,
-            model_config=model_config,
-            models=OpenAIServingModels(
-                engine_client=self.engine,
-                model_config=model_config,
-                base_model_paths=[
-                    # The layer below us will only see resolved model IDs
-                    BaseModelPath(model_id_for_vllm, model_id_for_vllm)
-                ],
-            ),
-            response_role="assistant",
-            request_logger=None,  # Use default logging
-            chat_template=None,  # Use default template from model checkpoint
-            enable_auto_tools=True,
-            tool_parser=tool_parser,
-            chat_template_content_format="auto",
-        )
-        self.resolved_model_id = model_id_for_vllm
-        self.model_ids.add(model.model_id)
-
-        logger.info(f"Finished preloading model: {model_id_for_vllm}")
-
-        return model
-
-    async def unregister_model(self, model_id: str) -> None:
-        """
-        Callback that is called when the server removes an inference endpoint from an inference
-        provider.
-
-        :param model_id: The same external ID that the higher layers of the stack previously passed
-        to :func:`register_model()`
-        """
-        if model_id not in self.model_ids:
-            raise ValueError(
-                f"Attempted to unregister model ID '{model_id}', but that ID is not registered to this provider."
-            )
-        self.model_ids.remove(model_id)
-
-        if len(self.model_ids) == 0:
-            # Last model was just unregistered. Shut down the connection to vLLM and free up
-            # resources.
-            # Note that this operation may cause in-flight chat completion requests on the
-            # now-unregistered model to return errors.
-            self.resolved_model_id = None
-            self.chat = None
-            self.engine.shutdown_background_loop()
-            self.engine = None
-
-    ###########################################################################
-    # METHODS INHERITED FROM Inference INTERFACE
-
-    async def completion(
-        self,
-        model_id: str,
-        content: InterleavedContent,
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-    ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
-        if model_id not in self.model_ids:
-            raise ValueError(
-                f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
-            )
-        if not isinstance(content, str):
-            raise NotImplementedError("Multimodal input not currently supported")
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-
-        converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs)
-
-        logger.debug(f"{converted_sampling_params=}")
-
-        if stream:
-            return self._streaming_completion(content, converted_sampling_params)
-        else:
-            streaming_result = None
-            async for _ in self._streaming_completion(content, converted_sampling_params):
-                pass
-            return CompletionResponse(
-                content=streaming_result.delta,
-                stop_reason=streaming_result.stop_reason,
-                logprobs=streaming_result.logprobs,
-            )
-
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
-    async def openai_embeddings(
-        self,
-        model: str,
-        input: str | list[str],
-        encoding_format: str | None = "float",
-        dimensions: int | None = None,
-        user: str | None = None,
-    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
-
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: list[Message],  # type: ignore
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,  # type: ignore
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
-        sampling_params = sampling_params or SamplingParams()
-        if model_id not in self.model_ids:
-            raise ValueError(
-                f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
-            )
-
-        # Convert to Llama Stack internal format for consistency
-        request = ChatCompletionRequest(
-            model=self.resolved_model_id,
-            messages=messages,
-            sampling_params=sampling_params,
-            response_format=response_format,
-            tools=tools,
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-
-        if self.is_meta_llama_model:
-            # Bypass vLLM chat templating layer for Meta Llama models, because the
-            # templating layer in Llama Stack currently produces better results.
-            logger.debug(
-                f"Routing {self.resolved_model_id} chat completion through "
-                f"Llama Stack's templating layer instead of vLLM's."
-            )
-            return await self._chat_completion_for_meta_llama(request)
-
-        logger.debug(f"{self.resolved_model_id} is not a Meta Llama model")
-
-        # Arguments to the vLLM call must be packaged as a ChatCompletionRequest dataclass.
-        # Note that this dataclass has the same name as a similar dataclass in Llama Stack.
-        request_options = await llama_stack_chat_completion_to_openai_chat_completion_dict(request)
-        chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
-
-        logger.debug(f"Converted request: {chat_completion_request}")
-
-        vllm_result = await self.chat.create_chat_completion(chat_completion_request)
-        logger.debug(f"Result from vLLM: {vllm_result}")
-        if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
-            raise ValueError(f"Error from vLLM layer: {vllm_result}")
-
-        # Return type depends on "stream" argument
-        if stream:
-            if not isinstance(vllm_result, AsyncGenerator):
-                raise TypeError(f"Unexpected result type {type(vllm_result)} for streaming inference call")
-            # vLLM client returns a stream of strings, which need to be parsed.
-            # Stream comes in the form of an async generator.
-            return self._convert_streaming_results(vllm_result)
-        else:
-            if not isinstance(vllm_result, vllm.entrypoints.openai.protocol.ChatCompletionResponse):
-                raise TypeError(f"Unexpected result type {type(vllm_result)} for non-streaming inference call")
-            return self._convert_non_streaming_results(vllm_result)
-
-    ###########################################################################
-    # INTERNAL METHODS
-
-    async def _streaming_completion(
-        self, content: str, sampling_params: vllm.SamplingParams
-    ) -> AsyncIterator[CompletionResponseStreamChunk]:
-        """Internal implementation of :func:`completion()` API for the streaming case. Assumes
-        that arguments have been validated upstream.
-
-        :param content: Must be a string
-        :param sampling_params: Paramters from  public API's ``response_format``
-         and ``sampling_params`` arguments, converted to VLLM format
-        """
-        # We run agains the vLLM generate() call directly instead of using the OpenAI-compatible
-        # layer, because doing so simplifies the code here.
-
-        # The vLLM engine requires a unique identifier for each call to generate()
-        request_id = _random_uuid_str()
-
-        # The vLLM generate() API is streaming-only and returns an async generator.
-        # The generator returns objects of type vllm.RequestOutput.
-        results_generator = self.engine.generate(content, sampling_params, request_id)
-
-        # Need to know the model's EOS token ID for the conversion code below.
-        # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if
-        # we drill down to the LLMEngine inside the AsyncLLMEngine.
-        # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup,
-        # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup.
-        llm_engine = self.engine.engine
-        tokenizer_group = llm_engine.tokenizer
-        eos_token_id = tokenizer_group.tokenizer.eos_token_id
-
-        request_output: vllm.RequestOutput = None
-        async for request_output in results_generator:
-            # Check for weird inference failures
-            if request_output.outputs is None or len(request_output.outputs) == 0:
-                # This case also should never happen
-                raise ValueError("Inference produced empty result")
-
-            # If we get here, then request_output contains the final output of the generate() call.
-            # The result may include multiple alternate outputs, but Llama Stack APIs only allow
-            # us to return one.
-            output: vllm.CompletionOutput = request_output.outputs[0]
-            completion_string = output.text
-
-            # Convert logprobs from vLLM's format to Llama Stack's format
-            logprobs = [
-                TokenLogProbs(logprobs_by_token={v.decoded_token: v.logprob for _, v in logprob_dict.items()})
-                for logprob_dict in output.logprobs
-            ]
-
-            # The final output chunk should be labeled with the reason that the overall generate()
-            # call completed.
-            logger.debug(f"{output.stop_reason=}; {type(output.stop_reason)=}")
-            if output.stop_reason is None:
-                stop_reason = None  # Still going
-            elif output.stop_reason == "stop":
-                stop_reason = StopReason.end_of_turn
-            elif output.stop_reason == "length":
-                stop_reason = StopReason.out_of_tokens
-            elif isinstance(output.stop_reason, int):
-                # If the model config specifies multiple end-of-sequence tokens, then vLLM
-                # will return the token ID of the EOS token in the stop_reason field.
-                stop_reason = StopReason.end_of_turn
-            else:
-                raise ValueError(f"Unrecognized stop reason '{output.stop_reason}'")
-
-            # vLLM's protocol outputs the stop token, then sets end of message on the next step for
-            # some reason.
-            if request_output.outputs[-1].token_ids[-1] == eos_token_id:
-                stop_reason = StopReason.end_of_message
-
-            yield CompletionResponseStreamChunk(delta=completion_string, stop_reason=stop_reason, logprobs=logprobs)
-
-        # Llama Stack requires that the last chunk have a stop reason, but vLLM doesn't always
-        # provide one if it runs out of tokens.
-        if stop_reason is None:
-            yield CompletionResponseStreamChunk(
-                delta=completion_string,
-                stop_reason=StopReason.out_of_tokens,
-                logprobs=logprobs,
-            )
-
-    def _convert_non_streaming_results(
-        self, vllm_result: vllm.entrypoints.openai.protocol.ChatCompletionResponse
-    ) -> ChatCompletionResponse:
-        """
-        Subroutine to convert the non-streaming output of vLLM's OpenAI-compatible API into an
-        equivalent Llama Stack object.
-
-        The result from vLLM's non-streaming API is a dataclass with the same name as the Llama
-        Stack ChatCompletionResponse dataclass, but with more and different field names. We ignore
-        the fields that aren't currently present in the Llama Stack dataclass.
-        """
-
-        # There may be multiple responses, but we can only pass through the first one.
-        if len(vllm_result.choices) == 0:
-            raise ValueError("Don't know how to convert response object without any responses")
-        vllm_message = vllm_result.choices[0].message
-        vllm_finish_reason = vllm_result.choices[0].finish_reason
-
-        converted_message = CompletionMessage(
-            role=vllm_message.role,
-            # Llama Stack API won't accept None for content field.
-            content=("" if vllm_message.content is None else vllm_message.content),
-            stop_reason=get_stop_reason(vllm_finish_reason),
-            tool_calls=[
-                ToolCall(
-                    call_id=t.id,
-                    tool_name=t.function.name,
-                    # vLLM function args come back as a string. Llama Stack expects JSON.
-                    arguments=json.loads(t.function.arguments),
-                    arguments_json=t.function.arguments,
-                )
-                for t in vllm_message.tool_calls
-            ],
-        )
-
-        # TODO: Convert logprobs
-
-        logger.debug(f"Converted message: {converted_message}")
-
-        return ChatCompletionResponse(
-            completion_message=converted_message,
-        )
-
-    async def _chat_completion_for_meta_llama(
-        self, request: ChatCompletionRequest
-    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
-        """
-        Subroutine that routes chat completions for Meta Llama models through Llama Stack's
-        chat template instead of using vLLM's version of that template. The Llama Stack version
-        of the chat template currently produces more reliable outputs.
-
-        Once vLLM's support for Meta Llama models has matured more, we should consider routing
-        Meta Llama requests through the vLLM chat completions API instead of using this method.
-        """
-        formatter = ChatFormat(Tokenizer.get_instance())
-
-        # Note that this function call modifies `request` in place.
-        prompt = await chat_completion_request_to_prompt(request, self.resolved_model_id)
-
-        model_id = list(self.model_ids)[0]  # Any model ID will do here
-        completion_response_or_iterator = await self.completion(
-            model_id=model_id,
-            content=prompt,
-            sampling_params=request.sampling_params,
-            response_format=request.response_format,
-            stream=request.stream,
-            logprobs=request.logprobs,
-        )
-
-        if request.stream:
-            if not isinstance(completion_response_or_iterator, AsyncIterator):
-                raise TypeError(
-                    f"Received unexpected result type {type(completion_response_or_iterator)}for streaming request."
-                )
-            return self._chat_completion_for_meta_llama_streaming(completion_response_or_iterator, request)
-
-        # elsif not request.stream:
-        if not isinstance(completion_response_or_iterator, CompletionResponse):
-            raise TypeError(
-                f"Received unexpected result type {type(completion_response_or_iterator)}for non-streaming request."
-            )
-        completion_response: CompletionResponse = completion_response_or_iterator
-        raw_message = formatter.decode_assistant_message_from_content(
-            completion_response.content, completion_response.stop_reason
-        )
-        return ChatCompletionResponse(
-            completion_message=CompletionMessage(
-                content=raw_message.content,
-                stop_reason=raw_message.stop_reason,
-                tool_calls=raw_message.tool_calls,
-            ),
-            logprobs=completion_response.logprobs,
-        )
-
-    async def _chat_completion_for_meta_llama_streaming(
-        self, results_iterator: AsyncIterator, request: ChatCompletionRequest
-    ) -> AsyncIterator:
-        """
-        Code from :func:`_chat_completion_for_meta_llama()` that needs to be a separate
-        method to keep asyncio happy.
-        """
-
-        # Convert to OpenAI format, then use shared code to convert to Llama Stack format.
-        async def _generate_and_convert_to_openai_compat():
-            chunk: CompletionResponseStreamChunk  # Make Pylance happy
-            last_text_len = 0
-            async for chunk in results_iterator:
-                if chunk.stop_reason == StopReason.end_of_turn:
-                    finish_reason = "stop"
-                elif chunk.stop_reason == StopReason.end_of_message:
-                    finish_reason = "eos"
-                elif chunk.stop_reason == StopReason.out_of_tokens:
-                    finish_reason = "length"
-                else:
-                    finish_reason = None
-
-                # Convert delta back to an actual delta
-                text_delta = chunk.delta[last_text_len:]
-                last_text_len = len(chunk.delta)
-
-                logger.debug(f"{text_delta=}; {finish_reason=}")
-
-                yield OpenAICompatCompletionResponse(
-                    choices=[OpenAICompatCompletionChoice(finish_reason=finish_reason, text=text_delta)]
-                )
-
-        stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, request):
-            logger.debug(f"Returning chunk: {chunk}")
-            yield chunk
-
-    async def _convert_streaming_results(self, vllm_result: AsyncIterator) -> AsyncIterator:
-        """
-        Subroutine that wraps the streaming outputs of vLLM's OpenAI-compatible
-        API into a second async iterator that returns Llama Stack objects.
-
-        :param vllm_result: Stream of strings that need to be parsed
-        """
-        # Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up
-        # those chunks and output them at the end.
-        # This data structure holds the current set of partial tool calls.
-        index_to_tool_call: dict[int, dict] = dict()
-
-        # The Llama Stack event stream must always start with a start event. Use an empty one to
-        # simplify logic below
-        yield ChatCompletionResponseStreamChunk(
-            event=ChatCompletionResponseEvent(
-                event_type=ChatCompletionResponseEventType.start,
-                delta=TextDelta(text=""),
-                stop_reason=None,
-            )
-        )
-
-        converted_stop_reason = None
-        async for chunk_str in vllm_result:
-            # Due to OpenAI compatibility, each event in the stream will start with "data: " and
-            # end with "\n\n".
-            _prefix = "data: "
-            _suffix = "\n\n"
-            if not chunk_str.startswith(_prefix) or not chunk_str.endswith(_suffix):
-                raise ValueError(f"Can't parse result string from vLLM: '{re.escape(chunk_str)}'")
-
-            # In between the "data: " and newlines is an event record
-            data_str = chunk_str[len(_prefix) : -len(_suffix)]
-
-            # The end of the stream is indicated with "[DONE]"
-            if data_str == "[DONE]":
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.complete,
-                        delta=TextDelta(text=""),
-                        stop_reason=converted_stop_reason,
-                    )
-                )
-                return
-
-            # Anything that is not "[DONE]" should be a JSON record
-            parsed_chunk = json.loads(data_str)
-
-            logger.debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
-
-            # The result may contain multiple completions, but Llama Stack APIs only support
-            # returning one.
-            first_choice = parsed_chunk["choices"][0]
-            converted_stop_reason = get_stop_reason(first_choice["finish_reason"])
-            delta_record = first_choice["delta"]
-
-            if "content" in delta_record:
-                # Text delta
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.progress,
-                        delta=TextDelta(text=delta_record["content"]),
-                        stop_reason=converted_stop_reason,
-                    )
-                )
-            elif "tool_calls" in delta_record:
-                # Tool call(s). Llama Stack APIs do not have a clear way to return partial tool
-                # calls, so buffer until we get a "tool calls" stop reason
-                for tc in delta_record["tool_calls"]:
-                    index = tc["index"]
-                    if index not in index_to_tool_call:
-                        # First time this tool call is showing up
-                        index_to_tool_call[index] = dict()
-                    tool_call = index_to_tool_call[index]
-                    if "id" in tc:
-                        tool_call["call_id"] = tc["id"]
-                    if "function" in tc:
-                        if "name" in tc["function"]:
-                            tool_call["tool_name"] = tc["function"]["name"]
-                        if "arguments" in tc["function"]:
-                            # Arguments comes in as pieces of a string
-                            if "arguments_str" not in tool_call:
-                                tool_call["arguments_str"] = ""
-                            tool_call["arguments_str"] += tc["function"]["arguments"]
-            else:
-                raise ValueError(f"Don't know how to parse event delta: {delta_record}")
-
-            if first_choice["finish_reason"] == "tool_calls":
-                # Special OpenAI code for "tool calls complete".
-                # Output the buffered tool calls. Llama Stack requires a separate event per tool
-                # call.
-                for tool_call_record in index_to_tool_call.values():
-                    # Arguments come in as a string. Parse the completed string.
-                    tool_call_record["arguments"] = json.loads(tool_call_record["arguments_str"])
-                    del tool_call_record["arguments_str"]
-
-                    yield ChatCompletionResponseStreamChunk(
-                        event=ChatCompletionResponseEvent(
-                            event_type=ChatCompletionResponseEventType.progress,
-                            delta=ToolCallDelta(tool_call=tool_call_record, parse_status="succeeded"),
-                            stop_reason=converted_stop_reason,
-                        )
-                    )
-
-        # If we get here, we've lost the connection with the vLLM event stream before it ended
-        # normally.
-        raise ValueError("vLLM event stream ended without [DONE] message.")
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@ -181,8 +181,8 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
            )
            self.cache[vector_db.identifier] = index

-        # Load existing OpenAI vector stores using the mixin method
-        self.openai_vector_stores = await self._load_openai_vector_stores()
+        # Load existing OpenAI vector stores into the in-memory cache
+        await self.initialize_openai_vector_stores()

    async def shutdown(self) -> None:
        # Cleanup if needed
@ -261,42 +261,6 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr

        return await index.query_chunks(query, params)

-    # OpenAI Vector Store Mixin abstract method implementations
-    async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
-        """Save vector store metadata to kvstore."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.set(key=key, value=json.dumps(store_info))
-        self.openai_vector_stores[store_id] = store_info
-
-    async def _load_openai_vector_stores(self) -> dict[str, dict[str, Any]]:
-        """Load all vector store metadata from kvstore."""
-        assert self.kvstore is not None
-        start_key = OPENAI_VECTOR_STORES_PREFIX
-        end_key = f"{OPENAI_VECTOR_STORES_PREFIX}\xff"
-        stored_openai_stores = await self.kvstore.values_in_range(start_key, end_key)
-
-        stores = {}
-        for store_data in stored_openai_stores:
-            store_info = json.loads(store_data)
-            stores[store_info["id"]] = store_info
-        return stores
-
-    async def _update_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
-        """Update vector store metadata in kvstore."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.set(key=key, value=json.dumps(store_info))
-        self.openai_vector_stores[store_id] = store_info
-
-    async def _delete_openai_vector_store_from_storage(self, store_id: str) -> None:
-        """Delete vector store metadata from kvstore."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.delete(key)
-        if store_id in self.openai_vector_stores:
-            del self.openai_vector_stores[store_id]
-
    async def _save_openai_vector_store_file(
        self, store_id: str, file_id: str, file_info: dict[str, Any], file_contents: list[dict[str, Any]]
    ) -> None:
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -7,6 +7,7 @@
 import asyncio
 import json
 import logging
+import re
 import sqlite3
 import struct
 from typing import Any
@ -117,6 +118,10 @@ def _rrf_rerank(
    return rrf_scores


+def _make_sql_identifier(name: str) -> str:
+    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
+
+
 class SQLiteVecIndex(EmbeddingIndex):
    """
    An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@ -130,9 +135,9 @@ class SQLiteVecIndex(EmbeddingIndex):
        self.dimension = dimension
        self.db_path = db_path
        self.bank_id = bank_id
-        self.metadata_table = f"chunks_{bank_id}".replace("-", "_")
-        self.vector_table = f"vec_chunks_{bank_id}".replace("-", "_")
-        self.fts_table = f"fts_chunks_{bank_id}".replace("-", "_")
+        self.metadata_table = _make_sql_identifier(f"chunks_{bank_id}")
+        self.vector_table = _make_sql_identifier(f"vec_chunks_{bank_id}")
+        self.fts_table = _make_sql_identifier(f"fts_chunks_{bank_id}")
        self.kvstore = kvstore

    @classmethod
@ -148,14 +153,14 @@ class SQLiteVecIndex(EmbeddingIndex):
            try:
                # Create the table to store chunk metadata.
                cur.execute(f"""
-                    CREATE TABLE IF NOT EXISTS {self.metadata_table} (
+                    CREATE TABLE IF NOT EXISTS [{self.metadata_table}] (
                        id TEXT PRIMARY KEY,
                        chunk TEXT
                    );
                """)
                # Create the virtual table for embeddings.
                cur.execute(f"""
-                    CREATE VIRTUAL TABLE IF NOT EXISTS {self.vector_table}
+                    CREATE VIRTUAL TABLE IF NOT EXISTS [{self.vector_table}]
                    USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
                """)
                connection.commit()
@ -163,7 +168,7 @@ class SQLiteVecIndex(EmbeddingIndex):
                # based on query. Implementation of the change on client side will allow passing the search_mode option
                # during initialization to make it easier to create the table that is required.
                cur.execute(f"""
-                            CREATE VIRTUAL TABLE IF NOT EXISTS {self.fts_table}
+                            CREATE VIRTUAL TABLE IF NOT EXISTS [{self.fts_table}]
                            USING fts5(id, content);
                        """)
                connection.commit()
@ -178,9 +183,9 @@ class SQLiteVecIndex(EmbeddingIndex):
            connection = _create_sqlite_connection(self.db_path)
            cur = connection.cursor()
            try:
-                cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
-                cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
-                cur.execute(f"DROP TABLE IF EXISTS {self.fts_table};")
+                cur.execute(f"DROP TABLE IF EXISTS [{self.metadata_table}];")
+                cur.execute(f"DROP TABLE IF EXISTS [{self.vector_table}];")
+                cur.execute(f"DROP TABLE IF EXISTS [{self.fts_table}];")
                connection.commit()
            finally:
                cur.close()
@ -212,7 +217,7 @@ class SQLiteVecIndex(EmbeddingIndex):
                    metadata_data = [(chunk.chunk_id, chunk.model_dump_json()) for chunk in batch_chunks]
                    cur.executemany(
                        f"""
-                        INSERT INTO {self.metadata_table} (id, chunk)
+                        INSERT INTO [{self.metadata_table}] (id, chunk)
                        VALUES (?, ?)
                        ON CONFLICT(id) DO UPDATE SET chunk = excluded.chunk;
                        """,
@ -230,7 +235,7 @@ class SQLiteVecIndex(EmbeddingIndex):
                        for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
                    ]
                    cur.executemany(
-                        f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);",
+                        f"INSERT INTO [{self.vector_table}] (id, embedding) VALUES (?, ?);",
                        embedding_data,
                    )

@ -238,13 +243,13 @@ class SQLiteVecIndex(EmbeddingIndex):
                    fts_data = [(chunk.chunk_id, chunk.content) for chunk in batch_chunks]
                    # DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT)
                    cur.executemany(
-                        f"DELETE FROM {self.fts_table} WHERE id = ?;",
+                        f"DELETE FROM [{self.fts_table}] WHERE id = ?;",
                        [(row[0],) for row in fts_data],
                    )

                    # INSERT new entries
                    cur.executemany(
-                        f"INSERT INTO {self.fts_table} (id, content) VALUES (?, ?);",
+                        f"INSERT INTO [{self.fts_table}] (id, content) VALUES (?, ?);",
                        fts_data,
                    )

@ -280,8 +285,8 @@ class SQLiteVecIndex(EmbeddingIndex):
                emb_blob = serialize_vector(emb_list)
                query_sql = f"""
                    SELECT m.id, m.chunk, v.distance
-                    FROM {self.vector_table} AS v
-                    JOIN {self.metadata_table} AS m ON m.id = v.id
+                    FROM [{self.vector_table}] AS v
+                    JOIN [{self.metadata_table}] AS m ON m.id = v.id
                    WHERE v.embedding MATCH ? AND k = ?
                    ORDER BY v.distance;
                """
@ -322,9 +327,9 @@ class SQLiteVecIndex(EmbeddingIndex):
            cur = connection.cursor()
            try:
                query_sql = f"""
-                    SELECT DISTINCT m.id, m.chunk, bm25({self.fts_table}) AS score
-                    FROM {self.fts_table} AS f
-                    JOIN {self.metadata_table} AS m ON m.id = f.id
+                    SELECT DISTINCT m.id, m.chunk, bm25([{self.fts_table}]) AS score
+                    FROM [{self.fts_table}] AS f
+                    JOIN [{self.metadata_table}] AS m ON m.id = f.id
                    WHERE f.content MATCH ?
                    ORDER BY score ASC
                    LIMIT ?;
@ -452,8 +457,8 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
            )
            self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

-        # load any existing OpenAI vector stores
-        self.openai_vector_stores = await self._load_openai_vector_stores()
+        # Load existing OpenAI vector stores into the in-memory cache
+        await self.initialize_openai_vector_stores()

    async def shutdown(self) -> None:
        # nothing to do since we don't maintain a persistent connection
@ -501,41 +506,6 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
        await self.cache[vector_db_id].index.delete()
        del self.cache[vector_db_id]

-    # OpenAI Vector Store Mixin abstract method implementations
-    async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
-        """Save vector store metadata to SQLite database."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.set(key=key, value=json.dumps(store_info))
-        self.openai_vector_stores[store_id] = store_info
-
-    async def _load_openai_vector_stores(self) -> dict[str, dict[str, Any]]:
-        """Load all vector store metadata from SQLite database."""
-        assert self.kvstore is not None
-        start_key = OPENAI_VECTOR_STORES_PREFIX
-        end_key = f"{OPENAI_VECTOR_STORES_PREFIX}\xff"
-        stored_openai_stores = await self.kvstore.values_in_range(start_key, end_key)
-        stores = {}
-        for store_data in stored_openai_stores:
-            store_info = json.loads(store_data)
-            stores[store_info["id"]] = store_info
-        return stores
-
-    async def _update_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
-        """Update vector store metadata in SQLite database."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.set(key=key, value=json.dumps(store_info))
-        self.openai_vector_stores[store_id] = store_info
-
-    async def _delete_openai_vector_store_from_storage(self, store_id: str) -> None:
-        """Delete vector store metadata from SQLite database."""
-        assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
-        await self.kvstore.delete(key)
-        if store_id in self.openai_vector_stores:
-            del self.openai_vector_stores[store_id]
-
    async def _save_openai_vector_store_file(
        self, store_id: str, file_id: str, file_info: dict[str, Any], file_contents: list[dict[str, Any]]
    ) -> None:
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -37,16 +37,6 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceInferenceConfig",
            description="Meta's reference implementation of inference with support for various model formats and optimization techniques.",
        ),
-        InlineProviderSpec(
-            api=Api.inference,
-            provider_type="inline::vllm",
-            pip_packages=[
-                "vllm",
-            ],
-            module="llama_stack.providers.inline.inference.vllm",
-            config_class="llama_stack.providers.inline.inference.vllm.VLLMConfig",
-            description="vLLM inference provider for high-performance model serving with PagedAttention and continuous batching.",
-        ),
        InlineProviderSpec(
            api=Api.inference,
            provider_type="inline::sentence-transformers",
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@ -3,16 +3,17 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import logging

-from llama_stack.providers.remote.inference.llama_openai_compat.config import (
-    LlamaCompatConfig,
-)
-from llama_stack.providers.utils.inference.litellm_openai_mixin import (
-    LiteLLMOpenAIMixin,
-)
+from llama_api_client import AsyncLlamaAPIClient, NotFoundError
+
+from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin

 from .models import MODEL_ENTRIES

+logger = logging.getLogger(__name__)
+

 class LlamaCompatInferenceAdapter(LiteLLMOpenAIMixin):
    _config: LlamaCompatConfig
@ -27,8 +28,32 @@ class LlamaCompatInferenceAdapter(LiteLLMOpenAIMixin):
        )
        self.config = config

+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Report whether the Llama API knows about the given model.
+
+        Every failure is logged and reported as "not available"; nothing is
+        raised to the caller.
+
+        :param model: The model identifier to check.
+        :return: True if the model is available dynamically, False otherwise.
+        """
+        is_available = False
+        try:
+            client = self._get_llama_api_client()
+            retrieved_model = await client.models.retrieve(model)
+            logger.info(f"Model {retrieved_model.id} is available from Llama API")
+            is_available = True
+        except NotFoundError:
+            # The lookup itself worked, but the model is unknown to the API.
+            logger.error(f"Model {model} is not available from Llama API")
+        except Exception as e:
+            # Any other failure is treated as "unavailable" instead of propagating.
+            logger.error(f"Failed to check model availability from Llama API: {e}")
+        return is_available
+
    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
+
+    def _get_llama_api_client(self) -> AsyncLlamaAPIClient:
+        """Build a new Llama API client from this adapter's API key and configured base URL."""
+        client_kwargs = {
+            "api_key": self.get_api_key(),
+            "base_url": self.config.openai_compat_api_base,
+        }
+        return AsyncLlamaAPIClient(**client_kwargs)
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -7,10 +7,9 @@
 import logging
 import warnings
 from collections.abc import AsyncIterator
-from functools import lru_cache
 from typing import Any

-from openai import APIConnectionError, AsyncOpenAI, BadRequestError
+from openai import APIConnectionError, AsyncOpenAI, BadRequestError, NotFoundError

 from llama_stack.apis.common.content_types import (
    InterleavedContent,
@ -41,11 +40,7 @@ from llama_stack.apis.inference import (
    ToolChoice,
    ToolConfig,
 )
-from llama_stack.apis.models import Model, ModelType
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolPromptFormat
-from llama_stack.providers.utils.inference import (
-    ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR,
-)
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
@ -93,41 +88,37 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):

        self._config = config

-    @lru_cache  # noqa: B019
-    def _get_client(self, provider_model_id: str) -> AsyncOpenAI:
+    async def check_model_availability(self, model: str) -> bool:
        """
-        For hosted models, https://integrate.api.nvidia.com/v1 is the primary base_url. However,
-        some models are hosted on different URLs. This function returns the appropriate client
-        for the given provider_model_id.
+        Check if a specific model is available.

-        This relies on lru_cache and self._default_client to avoid creating a new client for each request
-        or for each model that is hosted on https://integrate.api.nvidia.com/v1.
+        :param model: The model identifier to check.
+        :return: True if the model is available dynamically, False otherwise.
+        """
+        try:
+            await self._client.models.retrieve(model)
+            return True
+        except NotFoundError:
+            logger.error(f"Model {model} is not available")
+        except Exception as e:
+            logger.error(f"Failed to check model availability: {e}")
+        return False
+
+    @property
+    def _client(self) -> AsyncOpenAI:
+        """
+        Returns an OpenAI client for the configured NVIDIA API endpoint.

-        :param provider_model_id: The provider model ID
        :return: An OpenAI client
        """

-        @lru_cache  # noqa: B019
-        def _get_client_for_base_url(base_url: str) -> AsyncOpenAI:
-            """
-            Maintain a single OpenAI client per base_url.
-            """
-            return AsyncOpenAI(
-                base_url=base_url,
-                api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"),
-                timeout=self._config.timeout,
-            )
-
-        special_model_urls = {
-            "meta/llama-3.2-11b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct",
-            "meta/llama-3.2-90b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct",
-        }
-
        base_url = f"{self._config.url}/v1" if self._config.append_api_version else self._config.url

-        if _is_nvidia_hosted(self._config) and provider_model_id in special_model_urls:
-            base_url = special_model_urls[provider_model_id]
-        return _get_client_for_base_url(base_url)
+        return AsyncOpenAI(
+            base_url=base_url,
+            api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"),
+            timeout=self._config.timeout,
+        )

    async def _get_provider_model_id(self, model_id: str) -> str:
        if not self.model_store:
@ -169,7 +160,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            response = await self._get_client(provider_model_id).completions.create(**request)
+            response = await self._client.completions.create(**request)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

@ -222,7 +213,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
            extra_body["input_type"] = task_type_options[task_type]

        try:
-            response = await self._get_client(provider_model_id).embeddings.create(
+            response = await self._client.embeddings.create(
                model=provider_model_id,
                input=input,
                extra_body=extra_body,
@ -283,7 +274,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            response = await self._get_client(provider_model_id).chat.completions.create(**request)
+            response = await self._client.chat.completions.create(**request)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

@ -339,7 +330,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            return await self._get_client(provider_model_id).completions.create(**params)
+            return await self._client.completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

@ -398,47 +389,6 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            return await self._get_client(provider_model_id).chat.completions.create(**params)
+            return await self._client.chat.completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
-
-    async def register_model(self, model: Model) -> Model:
-        """
-        Allow non-llama model registration.
-
-        Non-llama model registration: API Catalogue models, post-training models, etc.
-            client = LlamaStackAsLibraryClient("nvidia")
-            client.models.register(
-                    model_id="mistralai/mixtral-8x7b-instruct-v0.1",
-                    model_type=ModelType.llm,
-                    provider_id="nvidia",
-                    provider_model_id="mistralai/mixtral-8x7b-instruct-v0.1"
-            )
-
-            NOTE: Only supports models endpoints compatible with AsyncOpenAI base_url format.
-        """
-        if model.model_type == ModelType.embedding:
-            # embedding models are always registered by their provider model id and does not need to be mapped to a llama model
-            provider_resource_id = model.provider_resource_id
-        else:
-            provider_resource_id = self.get_provider_model_id(model.provider_resource_id)
-
-        if provider_resource_id:
-            model.provider_resource_id = provider_resource_id
-        else:
-            llama_model = model.metadata.get("llama_model")
-            existing_llama_model = self.get_llama_model(model.provider_resource_id)
-            if existing_llama_model:
-                if existing_llama_model != llama_model:
-                    raise ValueError(
-                        f"Provider model id '{model.provider_resource_id}' is already registered to a different llama model: '{existing_llama_model}'"
-                    )
-            else:
-                # not llama model
-                if llama_model in ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR:
-                    self.provider_id_to_llama_model_map[model.provider_resource_id] = (
-                        ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[llama_model]
-                    )
-                else:
-                    self.alias_to_provider_id_map[model.provider_model_id] = model.provider_model_id
-        return model
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -6,13 +6,15 @@

 from typing import Any

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 DEFAULT_OLLAMA_URL = "http://localhost:11434"


 class OllamaImplConfig(BaseModel):
    url: str = DEFAULT_OLLAMA_URL
+    refresh_models: bool = Field(default=False, description="refresh and re-register models periodically")
+    refresh_models_interval: int = Field(default=300, description="interval in seconds to refresh models")

    @classmethod
    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
--- a/llama_stack/providers/remote/inference/ollama/models.py
+++ b/llama_stack/providers/remote/inference/ollama/models.py
@ -12,6 +12,19 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_model_entry,
 )

+# Llama Guard (safety) model entries, kept in their own list and appended to
+# MODEL_ENTRIES at the end of this module.
+SAFETY_MODELS_ENTRIES = [
+    # The Llama Guard models don't have their full fp16 versions
+    # so we are going to alias their default version to the canonical SKU
+    build_hf_repo_model_entry(
+        "llama-guard3:8b",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama-guard3:1b",
+        CoreModelId.llama_guard_3_1b.value,
+    ),
+]
+
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "llama3.1:8b-instruct-fp16",
@ -73,16 +86,6 @@ MODEL_ENTRIES = [
        "llama3.3:70b",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
-    # The Llama Guard models don't have their full fp16 versions
-    # so we are going to alias their default version to the canonical SKU
-    build_hf_repo_model_entry(
-        "llama-guard3:8b",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-    build_hf_repo_model_entry(
-        "llama-guard3:1b",
-        CoreModelId.llama_guard_3_1b.value,
-    ),
    ProviderModelEntry(
        provider_model_id="all-minilm:l6-v2",
        aliases=["all-minilm"],
@ -100,4 +103,4 @@ MODEL_ENTRIES = [
            "context_length": 8192,
        },
    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.


+import asyncio
 import base64
 import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
@ -89,23 +90,88 @@ class OllamaInferenceAdapter(
    InferenceProvider,
    ModelRegistryHelper,
 ):
+    # automatically set by the resolver when instantiating the provider
+    __provider_id__: str
+
    def __init__(self, config: OllamaImplConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
-        self.url = config.url
+        self.config = config
+        self._client = None
+        self._openai_client = None

    @property
    def client(self) -> AsyncClient:
-        return AsyncClient(host=self.url)
+        if self._client is None:
+            self._client = AsyncClient(host=self.config.url)
+        return self._client

    @property
    def openai_client(self) -> AsyncOpenAI:
-        return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
+        if self._openai_client is None:
+            self._openai_client = AsyncOpenAI(base_url=f"{self.config.url}/v1", api_key="ollama")
+        return self._openai_client

    async def initialize(self) -> None:
-        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
+        logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
        health_response = await self.health()
        if health_response["status"] == HealthStatus.ERROR:
-            raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            logger.warning(
+                "Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
+            )
+
+        if self.config.refresh_models:
+            logger.debug("ollama starting background model refresh task")
+            self._refresh_task = asyncio.create_task(self._refresh_models())
+
+            def cb(task):
+                if task.cancelled():
+                    import traceback
+
+                    logger.error(f"ollama background refresh task canceled:\n{''.join(traceback.format_stack())}")
+                elif task.exception():
+                    logger.error(f"ollama background refresh task died: {task.exception()}")
+                else:
+                    logger.error("ollama background refresh task completed unexpectedly")
+
+            self._refresh_task.add_done_callback(cb)
+
+    async def _refresh_models(self) -> None:
+        """
+        Background loop that periodically re-registers the LLMs served by Ollama.
+
+        Waits up to 60 seconds for ``self.model_store`` to be set, then, every
+        ``config.refresh_models_interval`` seconds, lists the models known to the
+        Ollama server and hands the non-embedding ones to
+        ``model_store.update_registered_llm_models``.
+
+        A failure in a single refresh cycle is logged and the loop keeps going, so
+        one transient error does not stop refreshing for good. Task cancellation
+        still ends the loop because ``asyncio.CancelledError`` is not an
+        ``Exception`` subclass.
+
+        :raises ValueError: if the model store is still unset after 60 seconds.
+        """
+        # Wait for model store to be available (with timeout)
+        waited_time = 0
+        while not self.model_store and waited_time < 60:
+            await asyncio.sleep(1)
+            waited_time += 1
+
+        if not self.model_store:
+            raise ValueError("Model store not set after waiting 60 seconds")
+
+        provider_id = self.__provider_id__
+        while True:
+            try:
+                response = await self.client.list()
+            except Exception as e:
+                logger.warning(f"Failed to list models: {e}")
+                await asyncio.sleep(self.config.refresh_models_interval)
+                continue
+
+            try:
+                models = []
+                for m in response.models:
+                    # NOTE(review): `details` is presumably optional in the Ollama
+                    # list response; a missing value is treated as "not an
+                    # embedding model" instead of raising AttributeError.
+                    family = getattr(m.details, "family", None)
+                    if family in ["bert"]:
+                        # embedding models are skipped; only LLMs are refreshed here
+                        continue
+                    models.append(
+                        Model(
+                            identifier=m.model,
+                            provider_resource_id=m.model,
+                            provider_id=provider_id,
+                            metadata={},
+                            model_type=ModelType.llm,
+                        )
+                    )
+                await self.model_store.update_registered_llm_models(provider_id, models)
+                logger.debug(f"ollama refreshed model list ({len(models)} models)")
+            except Exception as e:
+                # Keep the background task alive; the next cycle will retry.
+                logger.error(f"ollama background refresh task failed: {e}")
+
+            await asyncio.sleep(self.config.refresh_models_interval)

    async def health(self) -> HealthResponse:
        """
@ -157,7 +223,12 @@ class OllamaInferenceAdapter(
        return available_models

    async def shutdown(self) -> None:
-        pass
+        if hasattr(self, "_refresh_task") and not self._refresh_task.done():
+            logger.debug("ollama cancelling background refresh task")
+            self._refresh_task.cancel()
+
+        self._client = None
+        self._openai_client = None

    async def unregister_model(self, model_id: str) -> None:
        pass
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@ -8,7 +8,7 @@ import logging
 from collections.abc import AsyncIterator
 from typing import Any

-from openai import AsyncOpenAI
+from openai import AsyncOpenAI, NotFoundError

 from llama_stack.apis.inference import (
    OpenAIChatCompletion,
@ -60,6 +60,27 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
        # litellm specific model names, an abstraction leak.
        self.is_openai_compat = True

+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Report whether OpenAI knows about the given model.
+
+        Every failure is logged and reported as "not available"; nothing is
+        raised to the caller.
+
+        :param model: The model identifier to check.
+        :return: True if the model is available dynamically, False otherwise.
+        """
+        is_available = False
+        try:
+            client = self._get_openai_client()
+            retrieved_model = await client.models.retrieve(model)
+            logger.info(f"Model {retrieved_model.id} is available from OpenAI")
+            is_available = True
+        except NotFoundError:
+            # The lookup itself worked, but the model is unknown to OpenAI.
+            logger.error(f"Model {model} is not available from OpenAI")
+        except Exception as e:
+            # Any other failure is treated as "unavailable" instead of propagating.
+            logger.error(f"Failed to check model availability from OpenAI: {e}")
+        return is_available
+
    async def initialize(self) -> None:
        await super().initialize()

--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -29,6 +29,14 @@ class VLLMInferenceAdapterConfig(BaseModel):
        default=True,
        description="Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file.",
    )
+    refresh_models: bool = Field(
+        default=False,
+        description="Whether to refresh models periodically",
+    )
+    refresh_models_interval: int = Field(
+        default=300,
+        description="Interval in seconds to refresh models",
+    )

    @field_validator("tls_verify")
    @classmethod
@ -46,7 +54,7 @@ class VLLMInferenceAdapterConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.VLLM_URL}",
+        url: str = "${env.VLLM_URL:=}",
        **kwargs,
    ):
        return {
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -3,8 +3,8 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
 import json
-import logging
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any

@ -38,6 +38,7 @@ from llama_stack.apis.inference import (
    JsonSchemaResponseFormat,
    LogProbConfig,
    Message,
+    ModelStore,
    OpenAIChatCompletion,
    OpenAICompletion,
    OpenAIEmbeddingData,
@ -54,6 +55,7 @@ from llama_stack.apis.inference import (
    ToolPromptFormat,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
 from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.providers.datatypes import (
@ -84,7 +86,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (

 from .config import VLLMInferenceAdapterConfig

-log = logging.getLogger(__name__)
+log = get_logger(name=__name__, category="inference")


 def build_hf_repo_model_entries():
@ -288,16 +290,76 @@ async def _process_vllm_chat_completion_stream_response(


 class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
+    # automatically set by the resolver when instantiating the provider
+    __provider_id__: str
+    model_store: ModelStore | None = None
+    _refresh_task: asyncio.Task | None = None
+
    def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
        self.config = config
        self.client = None

    async def initialize(self) -> None:
-        pass
+        if not self.config.url:
+            # intentionally don't raise an error here, we want to allow the provider to be "dormant"
+            # or available in distributions like "starter" without causing a ruckus
+            return
+
+        if self.config.refresh_models:
+            self._refresh_task = asyncio.create_task(self._refresh_models())
+
+            def cb(task):
+                import traceback
+
+                if task.cancelled():
+                    log.error(f"vLLM background refresh task canceled:\n{''.join(traceback.format_stack())}")
+                elif task.exception():
+                    # print the stack trace for the exception
+                    exc = task.exception()
+                    log.error(f"vLLM background refresh task died: {exc}")
+                    traceback.print_exception(exc)
+                else:
+                    log.error("vLLM background refresh task completed unexpectedly")
+
+            self._refresh_task.add_done_callback(cb)
+
+    async def _refresh_models(self) -> None:
+        """
+        Background loop that periodically re-registers the models served by vLLM.
+
+        Waits up to 60 seconds for ``self.model_store`` to be set, initializes the
+        client, then every ``config.refresh_models_interval`` seconds lists the
+        server's models and passes them to
+        ``model_store.update_registered_llm_models``. Errors inside a refresh
+        cycle are logged and the loop continues.
+
+        :raises ValueError: if the model store is still unset after 60 seconds.
+        """
+        provider_id = self.__provider_id__
+        # Poll once per second, at most 60 times, until the model store is set.
+        for _ in range(60):
+            if self.model_store:
+                break
+            await asyncio.sleep(1)
+
+        if not self.model_store:
+            raise ValueError("Model store not set after waiting 60 seconds")
+
+        self._lazy_initialize_client()
+        assert self.client is not None  # mypy
+        while True:
+            try:
+                # unclear how to determine embedding vs. llm models, so every
+                # listed model is registered as an llm
+                models = [
+                    Model(
+                        identifier=m.id,
+                        provider_resource_id=m.id,
+                        provider_id=provider_id,
+                        metadata={},
+                        model_type=ModelType.llm,
+                    )
+                    async for m in self.client.models.list()
+                ]
+                await self.model_store.update_registered_llm_models(provider_id, models)
+                log.debug(f"vLLM refreshed model list ({len(models)} models)")
+            except Exception as e:
+                log.error(f"vLLM background refresh task failed: {e}")
+            await asyncio.sleep(self.config.refresh_models_interval)

    async def shutdown(self) -> None:
-        pass
+        if self._refresh_task:
+            self._refresh_task.cancel()
+            self._refresh_task = None

    async def unregister_model(self, model_id: str) -> None:
        pass
@ -312,6 +374,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            HealthResponse: A dictionary containing the health status.
        """
        try:
+            if not self.config.url:
+                return HealthResponse(status=HealthStatus.ERROR, message="vLLM URL is not set")
+
            client = self._create_client() if self.client is None else self.client
            _ = [m async for m in client.models.list()]  # Ensure the client is initialized
            return HealthResponse(status=HealthStatus.OK)
@ -327,6 +392,11 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        if self.client is not None:
            return

+        if not self.config.url:
+            raise ValueError(
+                "You must provide a vLLM URL in the run.yaml file (or set the VLLM_URL environment variable)"
+            )
+
        log.info(f"Initializing vLLM client with base_url={self.config.url}")
        self.client = self._create_client()

--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@ -217,7 +217,6 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        embedding_model: str | None = None,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")

--- a/llama_stack/providers/remote/vector_io/milvus/config.py
+++ b/llama_stack/providers/remote/vector_io/milvus/config.py
@ -8,7 +8,7 @@ from typing import Any

 from pydantic import BaseModel, ConfigDict, Field

-from llama_stack.providers.utils.kvstore.config import KVStoreConfig
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
 from llama_stack.schema_utils import json_schema_type


@ -17,7 +17,7 @@ class MilvusVectorIOConfig(BaseModel):
    uri: str = Field(description="The URI of the Milvus server")
    token: str | None = Field(description="The token of the Milvus server")
    consistency_level: str = Field(description="The consistency level of the Milvus server", default="Strong")
-    kvstore: KVStoreConfig | None = Field(description="Config for KV store backend (SQLite only for now)", default=None)
+    kvstore: KVStoreConfig = Field(description="Config for KV store backend")

    # This configuration allows additional fields to be passed through to the underlying Milvus client.
    # See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general.
@ -25,4 +25,11 @@ class MilvusVectorIOConfig(BaseModel):

    @classmethod
    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {"uri": "${env.MILVUS_ENDPOINT}", "token": "${env.MILVUS_TOKEN}"}
+        return {
+            "uri": "${env.MILVUS_ENDPOINT}",
+            "token": "${env.MILVUS_TOKEN}",
+            "kvstore": SqliteKVStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="milvus_remote_registry.db",
+            ),
+        }
--- a/Show more
+++ b/Show more