Merge branch 'main' into chroma

This commit is contained in:
Bwook (Byoungwook) Kim 2025-10-22 12:44:43 +09:00 committed by GitHub
commit 470adfc2df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
750 changed files with 243399 additions and 28283 deletions

19
.dockerignore Normal file

@ -0,0 +1,19 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp

1
.gitattributes vendored Normal file

@ -0,0 +1 @@
tests/**/recordings/** linguist-generated=true

View file

@ -82,11 +82,13 @@ runs:
echo "No recording changes" echo "No recording changes"
fi fi
- name: Write inference logs to file - name: Write docker logs to file
if: ${{ always() }} if: ${{ always() }}
shell: bash shell: bash
run: | run: |
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true # Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs - name: Upload logs
if: ${{ always() }} if: ${{ always() }}

View file

@ -57,7 +57,7 @@ runs:
echo "Building Llama Stack" echo "Building Llama Stack"
LLAMA_STACK_DIR=. \ LLAMA_STACK_DIR=. \
uv run --no-sync llama stack build --template ci-tests --image-type venv uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Configure git for commits - name: Configure git for commits
shell: bash shell: bash

View file

@ -14,6 +14,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks | | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR | | Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build | | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project | | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration | | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec | | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |

View file

@ -30,8 +30,11 @@ jobs:
- name: Build a single provider - name: Build a single provider
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \ docker build . \
llama stack build --template starter --image-type container --image-name test -f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter-ci
- name: Run installer end-to-end - name: Run installer end-to-end
run: | run: |

View file

@ -73,6 +73,24 @@ jobs:
image_name: kube image_name: kube
apis: [] apis: []
providers: {} providers: {}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: $run_dir/kvstore.db
sql_default:
type: sql_sqlite
db_path: $run_dir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
conversations:
table_name: openai_conversations
backend: sql_default
server: server:
port: 8321 port: 8321
EOF EOF

View file

@ -47,7 +47,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
client-type: [library, server] client-type: [library, server, docker]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
@ -82,7 +82,7 @@ jobs:
env: env:
OPENAI_API_KEY: dummy OPENAI_API_KEY: dummy
with: with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }} setup: ${{ matrix.config.setup }}
inference-mode: 'replay' inference-mode: 'replay'
suite: ${{ matrix.config.suite }} suite: ${{ matrix.config.suite }}

View file

@ -144,7 +144,7 @@ jobs:
- name: Build Llama Stack - name: Build Llama Stack
run: | run: |
uv run --no-sync llama stack build --template ci-tests --image-type venv uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Check Storage and Memory Available Before Tests - name: Check Storage and Memory Available Before Tests
if: ${{ always() }} if: ${{ always() }}
@ -169,9 +169,7 @@ jobs:
run: | run: |
uv run --no-sync \ uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \ pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io \ tests/integration/vector_io
--embedding-model nomic-ai/nomic-embed-text-v1.5 \
--embedding-dimension 768
- name: Check Storage and Memory Available After Tests - name: Check Storage and Memory Available After Tests
if: ${{ always() }} if: ${{ always() }}

View file

@ -37,7 +37,7 @@ jobs:
.pre-commit-config.yaml .pre-commit-config.yaml
- name: Set up Node.js - name: Set up Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: '20' node-version: '20'
cache: 'npm' cache: 'npm'

View file

@ -99,7 +99,7 @@ jobs:
owner: context.repo.owner, owner: context.repo.owner,
repo: context.repo.repo, repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }}, issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...` body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
}); });
- name: Checkout PR branch (same-repo) - name: Checkout PR branch (same-repo)
@ -141,7 +141,7 @@ jobs:
- name: Set up Node.js - name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true' if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: '20' node-version: '20'
cache: 'npm' cache: 'npm'

View file

@ -14,6 +14,8 @@ on:
- '.github/workflows/providers-build.yml' - '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**' - 'llama_stack/distributions/**'
- 'pyproject.toml' - 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
pull_request: pull_request:
paths: paths:
@ -24,6 +26,8 @@ on:
- '.github/workflows/providers-build.yml' - '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**' - 'llama_stack/distributions/**'
- 'pyproject.toml' - 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@ -60,15 +64,19 @@ jobs:
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
- name: Print build dependencies - name: Install distribution into venv
if: matrix.image-type == 'venv'
run: | run: |
uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Run Llama Stack Build - name: Build container image
if: matrix.image-type == 'container'
run: | run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead docker build . \
# LLAMA_STACK_DIR is set to the current directory so we are building from the source -f containers/Containerfile \
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=${{ matrix.distro }} \
--tag llama-stack:${{ matrix.distro }}-ci
- name: Print dependencies in the image - name: Print dependencies in the image
if: matrix.image-type == 'venv' if: matrix.image-type == 'venv'
@ -86,8 +94,8 @@ jobs:
- name: Build a single provider - name: Build a single provider
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama uv pip install -e .
uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
build-custom-container-distribution: build-custom-container-distribution:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -97,11 +105,16 @@ jobs:
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
- name: Build a single provider - name: Build container image
run: | run: |
yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml docker build . \
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml -f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests
- name: Inspect the container image entrypoint - name: Inspect the container image entrypoint
run: | run: |
@ -112,7 +125,7 @@ jobs:
fi fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint" echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct" echo "Entrypoint is not correct"
exit 1 exit 1
fi fi
@ -129,17 +142,19 @@ jobs:
- name: Pin distribution to UBI9 base - name: Pin distribution to UBI9 base
run: | run: |
yq -i ' yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest" .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/distributions/ci-tests/build.yaml ' llama_stack/distributions/ci-tests/build.yaml
- name: Build dev container (UBI9) - name: Build UBI9 container image
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: | run: |
uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests-ubi9
- name: Inspect UBI9 image - name: Inspect UBI9 image
run: | run: |
@ -150,7 +165,7 @@ jobs:
fi fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint" echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct" echo "Entrypoint is not correct"
exit 1 exit 1
fi fi

View file

@ -0,0 +1,105 @@
name: Test llama stack list-deps
run-name: Test llama stack list-deps
on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
pull_request:
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"
list-deps:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }}
- name: Install Distro using llama stack list-deps
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
show-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Show a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama
list-deps-from-config:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: list-deps from Config
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml

View file

@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@eb1897b8dc4b5d5bfe39a428a8f2304605e0983c # v7.0.0 uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
activate-environment: true activate-environment: true

View file

@ -46,9 +46,9 @@ jobs:
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
cat tests/external/ramalama-stack/run.yaml cat tests/external/ramalama-stack/run.yaml
- name: Build distro from config file - name: Install distribution dependencies
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background - name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv' if: ${{ matrix.image-type }} == 'venv'

View file

@ -44,11 +44,14 @@ jobs:
- name: Print distro dependencies - name: Print distro dependencies
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only uv run --no-sync llama stack list-deps tests/external/build.yaml
- name: Build distro from config file - name: Build distro from config file
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml uv venv ci-test
source ci-test/bin/activate
uv pip install -e .
LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background - name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv' if: ${{ matrix.image-type }} == 'venv'

View file

@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: ${{ matrix.node-version }} node-version: ${{ matrix.node-version }}
cache: 'npm' cache: 'npm'

View file

@ -11,14 +11,17 @@ You can install the dependencies by running:
```bash ```bash
cd llama-stack cd llama-stack
uv venv --python 3.12
uv sync --group dev uv sync --group dev
uv pip install -e . uv pip install -e .
source .venv/bin/activate source .venv/bin/activate
``` ```
```{note} ```{note}
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`). If you are making changes to Llama Stack, it is essential that you use Python 3.12 as shown above.
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. Llama Stack can work with Python 3.13 but the pre-commit hooks used to validate code changes only work with Python 3.12.
If you don't specify a Python version, `uv` will automatically select a Python version according to the `requires-python`
section of the `pyproject.toml`, which is fine for running Llama Stack but not for committing changes.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
``` ```
@ -42,17 +45,22 @@ uv run --env-file .env -- pytest -v tests/integration/inference/test_text_infere
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running: We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash ```bash
uv pip install pre-commit==4.3.0
uv run pre-commit install uv run pre-commit install
``` ```
After that, pre-commit hooks will run automatically before each commit. Note that the only version of pre-commit that works with the Llama Stack continuous integration is `4.3.0` so it is essential that you pull
that specific version as shown above. Once you have run these commands, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running: Alternatively, if you don't want to install the pre-commit hooks (or if you want to check if your changes are ready before committing),
you can run the checks manually by running:
```bash ```bash
uv run pre-commit run --all-files uv run pre-commit run --all-files -v
``` ```
The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.
```{caution} ```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully. Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
``` ```
@ -83,6 +91,7 @@ If you are new to the project, start by looking at the issues tagged with "good
leave a comment on the issue and a triager will assign it to you. leave a comment on the issue and a triager will assign it to you.
Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute. Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1-2 issues at a time, especially if you're still getting familiar with the codebase.
- Before taking an issue, check if it's already assigned or being actively discussed. - Before taking an issue, check if it's already assigned or being actively discussed.
- If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in. - If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
@ -158,9 +167,9 @@ under the LICENSE file in the root directory of this source tree.
Some tips about common tasks you work on while contributing to Llama Stack: Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build` ### Installing dependencies of distributions
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands. When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
Example: Example:
```bash ```bash
@ -168,7 +177,12 @@ cd work/
git clone https://github.com/llamastack/llama-stack.git git clone https://github.com/llamastack/llama-stack.git
git clone https://github.com/llamastack/llama-stack-client-python.git git clone https://github.com/llamastack/llama-stack-client-python.git
cd llama-stack cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
# Show dependencies for a distribution
llama stack list-deps <distro-name>
# Install dependencies
llama stack list-deps <distro-name> | xargs -L1 uv pip install
``` ```
### Updating distribution configurations ### Updating distribution configurations
@ -191,6 +205,7 @@ If you are making changes to the documentation at [https://llamastack.github.io/
```bash ```bash
# This rebuilds the documentation pages and the OpenAPI spec. # This rebuilds the documentation pages and the OpenAPI spec.
cd docs/
npm install npm install
npm run gen-api-docs all npm run gen-api-docs all
npm run build npm run build

View file

@ -27,8 +27,11 @@ MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com # get meta url from llama.com
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
# start a llama stack server # start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
# install client to interact with the server # install client to interact with the server
pip install llama-stack-client pip install llama-stack-client
@ -89,7 +92,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
To try Llama Stack locally, run: To try Llama Stack locally, run:
```bash ```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
``` ```
### Overview ### Overview

View file

@ -98,21 +98,30 @@ data:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768
@ -137,5 +146,4 @@ data:
port: 8323 port: 8323
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null
name: llama-stack-config name: llama-stack-config

View file

@ -95,21 +95,30 @@ providers:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768

View file

@ -0,0 +1,8 @@
These are the source-of-truth configuration files used by Stainless to generate the client SDKs.
- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
A small side note: notice the `.yml` suffixes; Stainless typically uses that suffix for its configuration files.
These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
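
As a hedged sketch of the regeneration step described above (the script's location in the repository is an assumption, not stated here), regenerating the spec might look like:

```bash
# Regenerates openapi.yml only; openapi.stainless.yml is maintained by hand.
# The path to run_openapi_generator.sh is assumed -- adjust to wherever it lives in the repo.
./run_openapi_generator.sh
```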

View file

@ -0,0 +1,610 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
agent_config: AgentConfig
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
message: Message
user_message: UserMessage
completion_message: CompletionMessage
tool_response_message: ToolResponseMessage
system_message: SystemMessage
tool_call: ToolCall
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
response_format: ResponseFormat
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: Model
list_models_response: ListModelsResponse
methods:
retrieve: get /v1/models/{model_id}
list:
endpoint: get /v1/models
paginated: false
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}
synthetic_data_generation:
models:
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
methods:
generate: post /v1/synthetic-data-generation/generate
telemetry:
models:
span_with_status: SpanWithStatus
trace: Trace
query_spans_response: QuerySpansResponse
event: Event
query_condition: QueryCondition
methods:
query_traces:
endpoint: post /v1alpha/telemetry/traces
skip_test_reason: 'unsupported query params in java / kotlin'
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
query_spans:
endpoint: post /v1alpha/telemetry/spans
skip_test_reason: 'unsupported query params in java / kotlin'
query_metrics:
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
skip_test_reason: 'unsupported query params in java / kotlin'
# log_event: post /v1alpha/telemetry/events
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
get_trace: get /v1alpha/telemetry/traces/{trace_id}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
files:
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
endpoint: get /v1alpha/post-training/jobs
paginated: false
eval:
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
agents:
methods:
create: post /v1alpha/agents
list: get /v1alpha/agents
retrieve: get /v1alpha/agents/{agent_id}
delete: delete /v1alpha/agents/{agent_id}
models:
inference_step: InferenceStep
tool_execution_step: ToolExecutionStep
tool_response: ToolResponse
shield_call_step: ShieldCallStep
memory_retrieval_step: MemoryRetrievalStep
subresources:
session:
models:
session: Session
methods:
list: get /v1alpha/agents/{agent_id}/sessions
create: post /v1alpha/agents/{agent_id}/session
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
steps:
methods:
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
turn:
models:
turn: Turn
turn_response_event: AgentTurnResponseEvent
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
methods:
create:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
resume:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [ data ]
openapi:
transformations:
- command: renameValue
reason: pydantic reserved name
args:
filter:
only:
- '$.components.schemas.InferenceStep.properties.model_response'
rename:
python:
property_name: 'inference_model_response'
# - command: renameValue
# reason: pydantic reserved name
# args:
# filter:
# only:
# - '$.components.schemas.Model.properties.model_type'
# rename:
# python:
# property_name: 'type'
- command: mergeObject
reason: Better return_type using enum
args:
target:
- '$.components.schemas'
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- '$.components.schemas.ScoringFn.properties.return_type'
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
- reason: For better names
command: extractToRefs
args:
ref:
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
name: '#/components/schemas/ToolCallOrString'
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

File diff suppressed because it is too large

137
containers/Containerfile Normal file

@ -0,0 +1,137 @@
# syntax=docker/dockerfile:1.6
#
# This Dockerfile is used to build the Llama Stack container image.
# Example:
# docker build \
# -f containers/Containerfile \
# --build-arg DISTRO_NAME=starter \
# --tag llama-stack:starter .
ARG BASE_IMAGE=python:3.12-slim
FROM ${BASE_IMAGE}
ARG INSTALL_MODE="pypi"
ARG LLAMA_STACK_DIR="/workspace"
ARG LLAMA_STACK_CLIENT_DIR=""
ARG PYPI_VERSION=""
ARG TEST_PYPI_VERSION=""
ARG KEEP_WORKSPACE=""
ARG DISTRO_NAME="starter"
ARG RUN_CONFIG_PATH=""
ARG UV_HTTP_TIMEOUT=500
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
ENV PYTHONDONTWRITEBYTECODE=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
WORKDIR /app
RUN set -eux; \
if command -v dnf >/dev/null 2>&1; then \
dnf -y update && \
dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
dnf clean all; \
elif command -v apt-get >/dev/null 2>&1; then \
apt-get update && \
apt-get install -y --no-install-recommends \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget git procps psmisc lsof traceroute bubblewrap \
gcc g++ && \
rm -rf /var/lib/apt/lists/*; \
else \
echo "Unsupported base image: expected dnf or apt-get" >&2; \
exit 1; \
fi
RUN pip install --no-cache-dir uv
ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE}
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
ENV PYPI_VERSION=${PYPI_VERSION}
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
ENV DISTRO_NAME=${DISTRO_NAME}
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
# Copy the repository so editable installs and run configurations are available.
COPY . /workspace
# Install the client package if it is provided
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
RUN set -eux; \
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \
fi; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
fi;
# Install llama-stack
RUN set -eux; \
if [ "$INSTALL_MODE" = "editable" ]; then \
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
exit 1; \
fi; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache-dir fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \
else \
if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
else \
uv pip install --no-cache-dir llama-stack; \
fi; \
fi;
# Install the dependencies for the distribution
RUN set -eux; \
if [ -z "$DISTRO_NAME" ]; then \
echo "DISTRO_NAME must be provided" >&2; \
exit 1; \
fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
fi
# Cleanup
RUN set -eux; \
pip uninstall -y uv; \
should_remove=1; \
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
case "$RUN_CONFIG_PATH" in \
/workspace*) should_remove=0 ;; \
esac; \
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
#!/bin/sh
set -e
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
exec llama stack run "$RUN_CONFIG_PATH" "$@"
fi
if [ -n "$DISTRO_NAME" ]; then
exec llama stack run "$DISTRO_NAME" "$@"
fi
exec llama stack run "$@"
EOF
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]
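
As a usage sketch (not part of the Containerfile itself): the header comment shows the build invocation, and running the resulting image with the default entrypoint might look like the following. The image tag and the 8321 port mapping are assumptions taken from the examples elsewhere in this diff.

```bash
# Build the starter image, as shown in the Containerfile's header comment.
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --tag llama-stack:starter

# Run it; the entrypoint script resolves to `llama stack run starter`.
# 8321 is the server port used by the run configurations in this diff (assumed default).
docker run --rm -p 8321:8321 llama-stack:starter
```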

View file

@ -51,8 +51,8 @@ device: cpu
You can access the HuggingFace trainer via the `starter` distribution: You can access the HuggingFace trainer via the `starter` distribution:
```bash ```bash
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
llama stack run ~/.llama/distributions/starter/starter-run.yaml llama stack run starter
``` ```
### Usage Example ### Usage Example

View file

@ -175,8 +175,7 @@ llama-stack-client benchmarks register \
**1. Start the Llama Stack API Server** **1. Start the Llama Stack API Server**
```bash ```bash
# Build and run a distribution (example: together) llama stack list-deps together | xargs -L1 uv pip install
llama stack build --distro together --image-type venv
llama stack run together llama stack run together
``` ```
@ -209,7 +208,7 @@ The playground works with any Llama Stack distribution. Popular options include:
<TabItem value="together" label="Together AI"> <TabItem value="together" label="Together AI">
```bash ```bash
llama stack build --distro together --image-type venv llama stack list-deps together | xargs -L1 uv pip install
llama stack run together llama stack run together
``` ```
@ -222,7 +221,7 @@ llama stack run together
<TabItem value="ollama" label="Ollama (Local)"> <TabItem value="ollama" label="Ollama (Local)">
```bash ```bash
llama stack build --distro ollama --image-type venv llama stack list-deps ollama | xargs -L1 uv pip install
llama stack run ollama llama stack run ollama
``` ```
@ -235,7 +234,7 @@ llama stack run ollama
<TabItem value="meta-reference" label="Meta Reference"> <TabItem value="meta-reference" label="Meta Reference">
```bash ```bash
llama stack build --distro meta-reference --image-type venv llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack run meta-reference llama stack run meta-reference
``` ```

View file

@ -10,358 +10,114 @@ import TabItem from '@theme/TabItem';
# Retrieval Augmented Generation (RAG) # Retrieval Augmented Generation (RAG)
RAG enables your applications to reference and recall information from previous interactions or external documents.
RAG enables your applications to reference and recall information from external documents. Llama Stack makes Agentic RAG available through OpenAI's Responses API.
## Quick Start
### 1. Start the Server
In one terminal, start the Llama Stack server:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### 2. Connect with OpenAI Client
In another terminal, use the standard OpenAI client with the Responses API:
```python
import io, requests
from openai import OpenAI
url = "https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Create vector store - auto-detects default embedding model
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)
resp = client.responses.create(
model="gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
)
print(resp.output[-1].content[-1].text)
```
Which should give output like:
```
Doing great work is about more than just hard work and ambition; it involves combining several elements:
1. **Pursue What Excites You**: Engage in projects that are both ambitious and exciting to you. It's important to work on something you have a natural aptitude for and a deep interest in.
2. **Explore and Discover**: Great work often feels like a blend of discovery and creation. Focus on seeing possibilities and let ideas take their natural shape, rather than just executing a plan.
3. **Be Bold Yet Flexible**: Take bold steps in your work without over-planning. An adaptable approach that evolves with new ideas can often lead to breakthroughs.
4. **Work on Your Own Projects**: Develop a habit of working on projects of your own choosing, as these often lead to great achievements. These should be projects you find exciting and that challenge you intellectually.
5. **Be Earnest and Authentic**: Approach your work with earnestness and authenticity. Trying to impress others with affectation can be counterproductive, as genuine effort and intellectual honesty lead to better work outcomes.
6. **Build a Supportive Environment**: Work alongside great colleagues who inspire you and enhance your work. Surrounding yourself with motivating individuals creates a fertile environment for great work.
7. **Maintain High Morale**: High morale significantly impacts your ability to do great work. Stay optimistic and protect your mental well-being to maintain progress and momentum.
8. **Balance**: While hard work is essential, overworking can lead to diminishing returns. Balance periods of intensive work with rest to sustain productivity over time.
This approach shows that great work is less about following a strict formula and more about aligning your interests, ambition, and environment to foster creativity and innovation.
```
## Architecture Overview ## Architecture Overview
Llama Stack organizes the APIs that enable RAG into three layers: Llama Stack provides OpenAI-compatible RAG capabilities through:
1. **Lower-Level APIs**: Deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon) - **Vector Stores API**: OpenAI-compatible vector storage with automatic embedding model detection
2. **RAG Tool**: A first-class tool as part of the [Tools API](./tools) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly - **Files API**: Document upload and processing using OpenAI's file format
3. **Agents API**: The top-level [Agents API](./agent) that allows you to create agents that can use the tools to answer questions, perform tasks, and more - **Responses API**: Enhanced chat completions with agentic tool calling via file search
![RAG System Architecture](/img/rag.png) ## Configuring Default Embedding Models
The RAG system uses lower-level storage for different types of data: To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:
- **Vector IO**: For semantic search and retrieval
- **Key-Value and Relational IO**: For structured data storage
:::info[Future Storage Types] ```yaml
We may add more storage types like Graph IO in the future. vector_stores:
::: default_provider_id: faiss
default_embedding_model:
## Setting up Vector Databases provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. Ollama is an LLM runtime that allows you to run Llama models locally.
Here's how to set up a vector database for RAG:
```python
# Create HTTP client
import os
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
# Register a vector database
vector_db_id = "my_documents"
response = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model="nomic-embed-text-v1.5",
embedding_dimension=768,
provider_id="faiss",
)
``` ```
## Document Ingestion With this configuration:
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
- The `default_provider_id` specifies which vector storage backend to use
- The `default_embedding_model` specifies both the inference provider and model for embeddings
You can ingest documents into the vector database using two methods: directly inserting pre-chunked documents or using the RAG Tool. ## Vector Store Operations
### Direct Document Insertion ### Creating Vector Stores
<Tabs> You can create vector stores with automatic or explicit embedding model selection:
<TabItem value="basic" label="Basic Insertion">
```python ```python
# You can insert a pre-chunked document directly into the vector db # Automatic - uses default configured embedding model and vector store provider
chunks = [ vs = client.vector_stores.create()
{
"content": "Your document text here",
"mime_type": "text/plain",
"metadata": {
"document_id": "doc1",
"author": "Jane Doe",
},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
```
</TabItem> # Explicit - specify embedding model and/or provider when you need specific ones
<TabItem value="embeddings" label="With Precomputed Embeddings"> vs = client.vector_stores.create(
extra_body={
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you want to customize the ingestion process. "provider_id": "faiss", # Optional: specify vector store provider
"embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
```python "embedding_dimension": 768 # Optional: will be auto-detected if not provided
chunks_with_embeddings = [ }
{
"content": "First chunk of text",
"mime_type": "text/plain",
"embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "introduction"},
},
{
"content": "Second chunk of text",
"mime_type": "text/plain",
"embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "methodology"},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```
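If you generate the vectors yourself, a minimal sketch using the OpenAI-compatible embeddings endpoint might look like the following; it assumes your client exposes `client.embeddings.create` and that the model name matches the embedding model registered for this vector database:

```python
# Sketch: compute embeddings with the OpenAI-compatible endpoint, then insert them.
# "nomic-embed-text-v1.5" is assumed to be the registered embedding model.
texts = ["First chunk of text", "Second chunk of text"]

embedding_response = client.embeddings.create(
    model="nomic-embed-text-v1.5",
    input=texts,
)

chunks_with_embeddings = [
    {
        "content": text,
        "mime_type": "text/plain",
        "embedding": item.embedding,
        "metadata": {"document_id": "doc1"},
    }
    for text, item in zip(texts, embedding_response.data)
]

client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```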
:::warning[Embedding Dimensions]
When providing precomputed embeddings, ensure the embedding dimension matches the `embedding_dimension` specified when registering the vector database.
:::
</TabItem>
</Tabs>
### Document Retrieval
You can query the vector database to retrieve documents based on their embeddings.
```python
# You can then query for these chunks
chunks_response = client.vector_io.query(
vector_db_id=vector_db_id,
query="What do you know about..."
) )
``` ```
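The response contains the chunks that matched the query. A minimal sketch of consuming it, assuming the response exposes a `chunks` list shaped like the dictionaries inserted above:

```python
# Sketch: iterate over the retrieved chunks (attribute names assumed).
for chunk in chunks_response.chunks:
    print(chunk.metadata)
    print(chunk.content)
```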
## Using the RAG Tool
:::danger[Deprecation Notice]
The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
:::
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, and other sources, and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples).
### OpenAI API Integration & Migration
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
### Migration Path
We recommend migrating to the OpenAI-compatible Search API for:
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
2. **Future-Proof**: Continued support and feature development
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
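As a rough migration sketch, the OpenAI-compatible flow uploads a document through the Files API, attaches it to a vector store, and searches that store directly; the method names follow the OpenAI-style client surface and may vary slightly with your client version:

```python
# Sketch of the OpenAI-compatible flow (Files + Vector Stores + Search).
# The file name and query below are illustrative.
vector_store = client.vector_stores.create(name="my_documents")

with open("memory_optimizations.rst", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id=uploaded.id,
)

results = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="How do I optimize memory usage?",
)
```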
### RAG Tool Example
```python
from llama_stack_client import RAGDocument
urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]
documents = [
RAGDocument(
document_id=f"num-{i}",
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
metadata={},
)
for i, url in enumerate(urls)
]
client.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=vector_db_id,
chunk_size_in_tokens=512,
)
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
### Custom Context Configuration
You can configure how the RAG tool adds metadata to the context if you find it useful for your application:
```python
# Query documents with custom template
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
query_config={
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
)
```
## Building RAG-Enhanced Agents
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
### Agent with Knowledge Search
```python
from llama_stack_client import Agent
# Create agent with memory
agent = Agent(
client,
model="meta-llama/Llama-3.3-70B-Instruct",
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {
"vector_db_ids": [vector_db_id],
# Defaults
"query_config": {
"chunk_size_in_tokens": 512,
"chunk_overlap_in_tokens": 0,
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
},
}
],
)
session_id = agent.create_session("rag_session")
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
response = agent.create_turn(
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
session_id=session_id,
)
```
:::tip[Agent Instructions]
The `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
:::
### Document-Aware Conversations
You can also pass documents along with the user's message and ask questions about them:
```python
# Initial document ingestion
response = agent.create_turn(
messages=[
{"role": "user", "content": "I am providing some documents for reference."}
],
documents=[
{
"content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Query with RAG
response = agent.create_turn(
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
session_id=session_id,
)
```
### Viewing Agent Responses
You can print the response with the following:
```python
from llama_stack_client import AgentEventLogger
for log in AgentEventLogger().log(response):
log.print()
```
## Vector Database Management
### Unregistering Vector DBs
If you need to clean up and unregister vector databases, you can do so as follows:
<Tabs>
<TabItem value="single" label="Single Database">
```python
# Unregister a specified vector database
vector_db_id = "my_vector_db_id"
print(f"Unregistering vector database: {vector_db_id}")
client.vector_dbs.unregister(vector_db_id=vector_db_id)
```
</TabItem>
<TabItem value="all" label="All Databases">
```python
# Unregister all vector databases
for vector_db_id in client.vector_dbs.list():
print(f"Unregistering vector database: {vector_db_id.identifier}")
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
```
</TabItem>
</Tabs>
## Best Practices
### 🎯 **Document Chunking**
- Use appropriate chunk sizes (512 tokens is often a good starting point)
- Consider overlap between chunks for better context preservation
- Experiment with different chunking strategies for your content type
### 🔍 **Embedding Strategy**
- Choose embedding models that match your domain
- Consider the trade-off between embedding dimension and performance
- Test different embedding models for your specific use case
### 📊 **Query Optimization**
- Use specific, well-formed queries for better retrieval
- Experiment with different search strategies
- Consider hybrid approaches (keyword + semantic search)
### 🛡️ **Error Handling**
- Implement proper error handling for failed document processing
- Monitor ingestion success rates
- Have fallback strategies for retrieval failures
## Appendix
### More RAGDocument Examples
Here are various ways to create RAGDocument objects for different content types:
```python
from llama_stack_client import RAGDocument
import base64
import requests
# File URI
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
# Plain text
RAGDocument(document_id="num-1", content="plain text")
# Explicit text input
RAGDocument(
document_id="num-2",
content={
"type": "text",
"text": "plain text input",
}, # for inputs that should be treated as text explicitly
)
# Image from URL
RAGDocument(
document_id="num-3",
content={
"type": "image",
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
},
)
# Base64 encoded image
B64_ENCODED_IMAGE = base64.b64encode(
requests.get(
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
).content
)
RAGDocument(
document_id="num-4",
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
)
```
For more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).

@ -10,58 +10,8 @@ import TabItem from '@theme/TabItem';
# Telemetry # Telemetry
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications. The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
## Event Types
The telemetry system supports three main types of events:
<Tabs>
<TabItem value="unstructured" label="Unstructured Logs">
Free-form log messages with severity levels for general application logging:
```python
unstructured_log_event = UnstructuredLogEvent(
message="This is a log message",
severity=LogSeverity.INFO
)
```
</TabItem>
<TabItem value="metrics" label="Metric Events">
Numerical measurements with units for tracking performance and usage:
```python
metric_event = MetricEvent(
metric="my_metric",
value=10,
unit="count"
)
```
</TabItem>
<TabItem value="structured" label="Structured Logs">
System events like span start/end that provide structured operation tracking:
```python
structured_log_event = SpanStartPayload(
name="my_span",
parent_span_id="parent_span_id"
)
```
</TabItem>
</Tabs>
## Spans and Traces
- **Spans**: Represent individual operations with timing information and hierarchical relationships
- **Traces**: Collections of related spans that form a complete request flow across your application
This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.
## Automatic Metrics Generation ## Automatic Metrics Generation
@ -129,21 +79,6 @@ Send events to an OpenTelemetry Collector for integration with observability pla
- Compatible with all OpenTelemetry collectors - Compatible with all OpenTelemetry collectors
- Supports both traces and metrics - Supports both traces and metrics
</TabItem>
<TabItem value="sqlite" label="SQLite">
Store events in a local SQLite database for direct querying:
**Use Cases:**
- Local development and debugging
- Custom analytics and reporting
- Offline analysis of application behavior
**Features:**
- Direct SQL querying capabilities
- Persistent local storage
- No external dependencies
</TabItem> </TabItem>
<TabItem value="console" label="Console"> <TabItem value="console" label="Console">
@ -174,9 +109,8 @@ telemetry:
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
service_name: "llama-stack-service" service_name: "llama-stack-service"
sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric'] sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: "http://localhost:4318" otel_exporter_otlp_endpoint: "http://localhost:4318"
sqlite_db_path: "/path/to/telemetry.db"
``` ```
### Environment Variables ### Environment Variables
@ -185,7 +119,7 @@ Configure telemetry behavior using environment variables:
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`) - **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string) - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`) - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
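For example, a shell setup that sends traces and metrics to a local collector (values taken from the defaults above; adjust for your environment) might be:

```bash
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
export OTEL_SERVICE_NAME=llama-stack-service
export TELEMETRY_SINKS=otel_trace,otel_metric
```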
### Quick Setup: Complete Telemetry Stack ### Quick Setup: Complete Telemetry Stack
@ -248,37 +182,10 @@ Forward metrics to other observability systems:
</TabItem> </TabItem>
</Tabs> </Tabs>
## SQLite Querying
The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
### Example Queries
```sql
-- Query recent traces
SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
-- Analyze span durations
SELECT name, AVG(duration_ms) as avg_duration
FROM spans
GROUP BY name
ORDER BY avg_duration DESC;
-- Find slow operations
SELECT * FROM spans
WHERE duration_ms > 1000
ORDER BY duration_ms DESC;
```
:::tip[Advanced Analytics]
Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
:::
## Best Practices ## Best Practices
### 🔍 **Monitoring Strategy** ### 🔍 **Monitoring Strategy**
- Use OpenTelemetry for production environments - Use OpenTelemetry for production environments
- Combine multiple sinks for development (console + SQLite)
- Set up alerts on key metrics like token usage and error rates - Set up alerts on key metrics like token usage and error rates
### 📊 **Metrics Analysis** ### 📊 **Metrics Analysis**
@ -293,45 +200,8 @@ Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stac
### 🔧 **Configuration Management** ### 🔧 **Configuration Management**
- Use environment variables for flexible deployment - Use environment variables for flexible deployment
- Configure appropriate retention policies for SQLite
- Ensure proper network access to OpenTelemetry collectors - Ensure proper network access to OpenTelemetry collectors
## Integration Examples
### Basic Telemetry Setup
```python
from llama_stack_client import LlamaStackClient
# Client with telemetry headers
client = LlamaStackClient(
base_url="http://localhost:8000",
extra_headers={
"X-Telemetry-Service": "my-ai-app",
"X-Telemetry-Version": "1.0.0"
}
)
# All API calls will be automatically traced
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
### Custom Telemetry Context
```python
# Add custom span attributes for better tracking
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("custom_operation") as span:
span.set_attribute("user_id", "user123")
span.set_attribute("operation_type", "chat_completion")
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
## Related Resources ## Related Resources

@ -62,6 +62,10 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate. When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
### Deprecated APIs
Deprecated APIs are those that are no longer actively maintained or supported. They are marked with the flag `deprecated = True` in the OpenAPI spec and will be removed in a future release.
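As an illustration, a deprecated endpoint shows up in the generated OpenAPI document roughly like this (the path and summary are illustrative, not taken from the actual spec):

```yaml
paths:
  /v1/example/old-endpoint:
    post:
      deprecated: true
      summary: Old endpoint kept for backwards compatibility
```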
### API Stability vs. Provider Stability ### API Stability vs. Provider Stability
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API. The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.

@ -158,17 +158,16 @@ under the LICENSE file in the root directory of this source tree.
Some tips about common tasks you work on while contributing to Llama Stack: Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build` ### Setup for development
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash ```bash
cd work/
git clone https://github.com/meta-llama/llama-stack.git git clone https://github.com/meta-llama/llama-stack.git
git clone https://github.com/meta-llama/llama-stack-client-python.git
cd llama-stack cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...> uv run llama stack list-deps <distro-name> | xargs -L1 uv pip install
# (Optional) If you are developing the llama-stack-client-python package, you can add it as an editable package.
git clone https://github.com/meta-llama/llama-stack-client-python.git
uv add --editable ../llama-stack-client-python
``` ```
### Updating distribution configurations ### Updating distribution configurations

@ -67,7 +67,7 @@ def get_base_url(self) -> str:
## Testing the Provider ## Testing the Provider
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`. Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
### 1. Integration Testing ### 1. Integration Testing

@ -5,225 +5,80 @@ sidebar_label: Build your own Distribution
sidebar_position: 3 sidebar_position: 3
--- ---
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. This guide walks you through inspecting existing distributions, customizing their configuration, and building runnable artifacts for your own deployment.
### Explore existing distributions
### Setting your log level All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
In order to specify the proper logging level, users can apply the following environment variable `LLAMA_STACK_LOGGING` with the following format: - `build.yaml`: the distribution specification (providers, additional dependencies, optional external provider directories).
- `run.yaml`: sample run configuration (when provided).
- Documentation fragments that power this site.
`LLAMA_STACK_LOGGING=server=debug;core=info` Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
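For example, a minimal way to seed a new distribution from the starter template might be (the target directory name is illustrative):

```bash
# Copy an existing distribution as a starting point, then edit its build.yaml
cp -R llama_stack/distributions/starter llama_stack/distributions/my-distro
$EDITOR llama_stack/distributions/my-distro/build.yaml
```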
Where each category in the following list:
- all
- core
- server
- router
- inference
- agents
- safety
- eval
- tools
- client
Can be set to any of the following log levels:
- debug
- info
- warning
- error
- critical
The default global log level is `info`. `all` sets the log level for all components.
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
### Llama Stack Build
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
```
git clone git@github.com:meta-llama/llama-stack.git
cd llama-stack
pip install -e .
```
Use the CLI to build your distribution.
The main points to consider are:
1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
2. **Template** - Do you want to use a template to build your distribution? or start from scratch ?
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
```
llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
[--run] [--providers PROVIDERS]
Build a Llama stack container
options:
-h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
enter information interactively (default: None)
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
None)
--distro DISTRIBUTION, --distribution DISTRIBUTION
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
--list-distros, --list-distributions
Show the available distributions for building a Llama Stack distribution (default: False)
--image-type {container,venv}
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
--image-name IMAGE_NAME
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
None)
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--providers PROVIDERS
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
API. (default: None)
```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
<Tabs> <Tabs>
<TabItem value="template" label="Building from a template"> <TabItem value="container" label="Building a container">
To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
The following command will allow you to see the available templates and their corresponding providers. Use the Containerfile at `containers/Containerfile`, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.
```
llama stack build --list-templates ```bash
docker build . \
-f containers/Containerfile \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter
``` ```
``` Handy build arguments:
------------------------------+-----------------------------------------------------------------------------+
| Template Name | Description |
+------------------------------+-----------------------------------------------------------------------------+
| watsonx | Use watsonx for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu | Use a built-in vLLM engine for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| together | Use Together.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| tgi | Use (an external) TGI server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| starter | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| sambanova | Use SambaNova for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
| remote-vllm | Use (an external) vLLM server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| postgres-demo | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| passthrough | Use Passthrough hosted llama-stack endpoint for LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| open-benchmark | Distribution for running open benchmarks |
+------------------------------+-----------------------------------------------------------------------------+
| ollama | Use (an external) Ollama server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| nvidia | Use NVIDIA NIM for running LLM inference, evaluation and safety |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu | Use Meta Reference for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| llama_api | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| groq | Use Groq for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| fireworks | Use Fireworks.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training | Experimental template for post training |
+------------------------------+-----------------------------------------------------------------------------+
| dell | Dell's distribution of Llama Stack. TGI inference via Dell's custom |
| | container |
+------------------------------+-----------------------------------------------------------------------------+
| ci-tests | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras | Use Cerebras for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| bedrock | Use AWS Bedrock for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
```
You may then pick a template to build your distribution with providers fitted to your liking. - `DISTRO_NAME`: distribution directory name (defaults to `starter`).
- `RUN_CONFIG_PATH`: absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
- `INSTALL_MODE=editable`: install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
- `LLAMA_STACK_CLIENT_DIR`: optional editable install of the Python client.
- `PYPI_VERSION` / `TEST_PYPI_VERSION`: pin specific releases when not using editable installs.
- `KEEP_WORKSPACE=1`: retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).
For example, to build a distribution with TGI as the inference provider, you can run: Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
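Putting several of these arguments together, an editable-install build that also bakes in a run config might look like this (it assumes your checkout and `run.yaml` are inside the Docker build context):

```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --tag llama-stack:starter-dev
```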
```
$ llama stack build --distro starter
...
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
```
```{tip}
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
</TabItem> </TabItem>
<TabItem value="scratch" label="Building from Scratch"> <TabItem value="external" label="Building with external providers">
If the provided templates do not fit your use case, you could start off by running `llama stack build`, which launches an interactive wizard where you will be prompted to enter build configurations. External providers live outside the main repository but can be bundled by pointing `external_providers_dir` to a directory that contains your provider packages.
It would be best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch. 1. Copy providers into the build context, for example `cp -R path/to/providers providers.d`.
``` 2. Update `build.yaml` with the directory and provider entries.
llama stack build 3. Adjust run configs to use the in-container path (usually `/.llama/providers.d`). Pass `--build-arg RUN_CONFIG_PATH=/workspace/run.yaml` if you want to bake the config.
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack Example `build.yaml` excerpt for a custom Ollama provider:
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs.
Tip: use <TAB> to see options for the providers.
> Enter provider for API inference: inline::meta-reference
> Enter provider for API safety: inline::llama-guard
> Enter provider for API agents: inline::meta-reference
> Enter provider for API memory: inline::faiss
> Enter provider for API datasetio: inline::meta-reference
> Enter provider for API scoring: inline::meta-reference
> Enter provider for API eval: inline::meta-reference
> Enter provider for API telemetry: inline::meta-reference
> (Optional) Enter a short description for your Llama Stack:
You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
```
</TabItem>
<TabItem value="config" label="Building from a pre-existing build config file">
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
- The config file will be of contents like the ones in `llama_stack/distributions/*build.yaml`.
```
llama stack build --config llama_stack/distributions/starter/build.yaml
```
</TabItem>
<TabItem value="external" label="Building with External Providers">
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
To build a distribution with external providers, you need to:
1. Configure the `external_providers_dir` in your build configuration file:
```yaml ```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec: distribution_spec:
description: Custom distro for CI tests
providers: providers:
inference: inference:
- remote::custom_ollama - remote::custom_ollama
# Add more providers as needed external_providers_dir: /workspace/providers.d
image_type: container ```
image_name: ci-test
# Path to external provider implementations Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
external_providers_dir: ~/.llama/providers.d
```python
from llama_stack.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:
return ProviderSpec(
provider_type="remote::custom_ollama",
module="llama_stack_ollama_provider",
config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
pip_packages=[
"ollama",
"aiohttp",
"llama-stack-provider-ollama",
],
)
``` ```
Here's an example for a custom Ollama provider: Here's an example for a custom Ollama provider:
@ -232,9 +87,9 @@ Here's an example for a custom Ollama provider:
adapter: adapter:
adapter_type: custom_ollama adapter_type: custom_ollama
pip_packages: pip_packages:
- ollama - ollama
- aiohttp - aiohttp
- llama-stack-provider-ollama # This is the provider package - llama-stack-provider-ollama # This is the provider package
config_class: llama_stack_ollama_provider.config.OllamaImplConfig config_class: llama_stack_ollama_provider.config.OllamaImplConfig
module: llama_stack_ollama_provider module: llama_stack_ollama_provider
api_dependencies: [] api_dependencies: []
@ -245,53 +100,22 @@ The `pip_packages` section lists the Python packages required by the provider, a
provider package itself. The package must be available on PyPI or can be provided from a local provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment). directory or a git repository (git must be installed on the build environment).
2. Build your distribution using the config file: For deeper guidance, see the [External Providers documentation](../providers/external/).
```
llama stack build --config my-external-stack.yaml
```
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
</TabItem> </TabItem>
<TabItem value="container" label="Building Container"> </Tabs>
:::tip Podman Alternative ### Run your stack server
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
:::
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type. After building the image, launch it directly with Docker or Podman—the entrypoint calls `llama stack run` using the baked distribution or the bundled run config:
```
llama stack build --distro starter --image-type container
```
```
$ llama stack build --distro starter --image-type container
...
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
...
```
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
```
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
```bash ```bash
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```
After this step is successful, you should be able to find the built container image and test it with the below Docker command:
```
docker run -d \ docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \ -v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \ -e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \ -e OLLAMA_URL=http://host.docker.internal:11434 \
localhost/distribution-ollama:dev \ llama-stack:starter \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
``` ```
@ -311,131 +135,14 @@ Here are the docker flags and their uses:
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on * `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
</TabItem>
</Tabs>
### Running your Stack server If you prepared a custom run config, mount it into the container and reference it explicitly:
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
```bash
docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $(pwd)/run.yaml:/app/run.yaml \
llama-stack:starter \
/app/run.yaml
``` ```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
[--image-type {venv}] [--enable-ui]
[config | distro]
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--image-type {venv}
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--enable-ui Start the UI server (default: False)
```
**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.
```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```
$ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
Serving API inspect
GET /health
GET /providers/list
GET /routes/list
Serving API inference
POST /inference/chat_completion
POST /inference/completion
POST /inference/embeddings
...
Serving API agents
POST /agents/create
POST /agents/session/create
POST /agents/turn/create
POST /agents/delete
POST /agents/session/delete
POST /agents/session/get
POST /agents/step/get
POST /agents/turn/get
Listening on ['::', '0.0.0.0']:8321
INFO: Started server process [2935911]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
```
llama stack list -h
usage: llama stack list [-h]
list the build stacks
options:
-h, --help show this help message and exit
```
Example Usage
```
llama stack list
```
```
------------------------------+-----------------------------------------------------------------+--------------+------------+
| Stack Name | Path | Build Config | Run Config |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| together | ~/.llama/distributions/together | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| bedrock | ~/.llama/distributions/bedrock | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| starter | ~/.llama/distributions/starter | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| remote-vllm | ~/.llama/distributions/remote-vllm | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
```
### Removing a Distribution
Use the remove command to delete a distribution you've previously built.
```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]
Remove the build stack
positional arguments:
name Name of the stack to delete (default: None)
options:
-h, --help show this help message and exit
--all, -a Delete all stacks (use with caution) (default: False)
```
Example
```
llama stack rm llamastack-test
```
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
### Troubleshooting
If you encounter any issues, ask questions on our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.

@ -44,18 +44,32 @@ providers:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
persistence_store: persistence:
type: sqlite agent_state:
namespace: null backend: kv_default
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db namespace: agents
responses:
backend: sql_default
table_name: responses
telemetry: telemetry:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: {} config: {}
metadata_store: storage:
namespace: null backends:
type: sqlite kv_default:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/kvstore.db
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}

@ -12,7 +12,7 @@ This avoids the overhead of setting up a server.
```bash ```bash
# setup # setup
uv pip install llama-stack uv pip install llama-stack
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
``` ```
```python ```python

@ -1,56 +1,155 @@
apiVersion: v1 apiVersion: v1
data: data:
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n- stack_run_config.yaml: |
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n version: '2'
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n image_name: kubernetes-demo
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens: apis:
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify: - agents
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type: - inference
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n - files
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n - safety
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n - telemetry
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n - tool_runtime
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n - vector_io
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n providers:
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n inference:
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n - provider_id: vllm-inference
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id: provider_type: remote::vllm
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir: config:
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n api_token: ${env.VLLM_API_TOKEN:=fake}
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n tls_verify: ${env.VLLM_TLS_VERIFY:=true}
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n - provider_id: vllm-safety
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port: provider_type: remote::vllm
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: config:
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n api_token: ${env.VLLM_API_TOKEN:=fake}
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n tls_verify: ${env.VLLM_TLS_VERIFY:=true}
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n - provider_id: sentence-transformers
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks: provider_type: inline::sentence-transformers
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n config: {}
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n vector_io:
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results: provider_type: remote::chromadb
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config: config:
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n url: ${env.CHROMADB_URL:=}
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n kvstore:
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: type: postgres
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n host: ${env.POSTGRES_HOST:=localhost}
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host: port: ${env.POSTGRES_PORT:=5432}
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n db: ${env.POSTGRES_DB:=llamastack}
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n- user: ${env.POSTGRES_USER:=llamastack}
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id: password: ${env.POSTGRES_PASSWORD:=llamastack}
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n files:
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id: - provider_id: meta-reference-files
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n provider_type: inline::localfs
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs: config:
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id: storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n metadata_store:
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n type: sqlite
\ type: github_token\n" db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null
name: llama-stack-config name: llama-stack-config

View file

@ -93,21 +93,30 @@ providers:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768

@ -59,7 +59,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
uv venv starter --python 3.12 uv venv starter --python 3.12
source starter/bin/activate # On Windows: starter\Scripts\activate source starter/bin/activate # On Windows: starter\Scripts\activate
pip install --no-cache llama-stack==0.2.2 pip install --no-cache llama-stack==0.2.2
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
export FIREWORKS_API_KEY=<SOME_KEY> export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run starter --port 5050 llama stack run starter --port 5050
``` ```

@ -166,10 +166,10 @@ docker run \
### Via venv ### Via venv
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Install the distribution dependencies before launching:
```bash ```bash
llama stack build --distro dell --image-type venv llama stack list-deps dell | xargs -L1 uv pip install
INFERENCE_MODEL=$INFERENCE_MODEL \ INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \ DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \ CHROMA_URL=$CHROMA_URL \

@ -21,7 +21,6 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
| inference | `inline::meta-reference` | | inference | `inline::meta-reference` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -82,10 +81,10 @@ docker run \
### Via venv ### Via venv
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. Make sure you have the Llama Stack CLI available.
```bash ```bash
llama stack build --distro meta-reference-gpu --image-type venv llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/meta-reference-gpu/run.yaml \ llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321 --port 8321

@ -16,7 +16,6 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| post_training | `remote::nvidia` | | post_training | `remote::nvidia` |
| safety | `remote::nvidia` | | safety | `remote::nvidia` |
| scoring | `inline::basic` | | scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` | | tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` | | vector_io | `inline::faiss` |
@ -137,11 +136,11 @@ docker run \
### Via venv
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
```bash ```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv llama stack list-deps nvidia | xargs -L1 uv pip install
NVIDIA_API_KEY=$NVIDIA_API_KEY \ NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \ INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \ llama stack run ./run.yaml \

View file

@ -119,7 +119,7 @@ The following environment variables can be configured:
### Telemetry Configuration
- `OTEL_SERVICE_NAME`: OpenTelemetry service name
- `TELEMETRY_SINKS`: Telemetry sinks (default: `[]`)
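For example, to export traces and metrics to a local OTLP collector (illustrative values; the sink names and the `OTEL_EXPORTER_OTLP_ENDPOINT` variable come from the telemetry provider documentation further below):

```bash
export OTEL_SERVICE_NAME=llama-stack
export TELEMETRY_SINKS=otel_trace,otel_metric
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
```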
## Enabling Providers
@ -169,7 +169,11 @@ docker run \
Ensure you have configured the starter distribution using the environment variables explained above.
```bash ```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv --run # Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
uv run --with llama-stack llama stack run starter
``` ```
## Example Usage ## Example Usage
@ -216,7 +220,6 @@ The starter distribution uses SQLite for local storage of various components:
- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db` - **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
- **Agents store**: `~/.llama/distributions/starter/agents_store.db` - **Agents store**: `~/.llama/distributions/starter/agents_store.db`
- **Responses store**: `~/.llama/distributions/starter/responses_store.db` - **Responses store**: `~/.llama/distributions/starter/responses_store.db`
- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db` - **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
- **Dataset I/O stores**: Various HuggingFace and local filesystem stores - **Dataset I/O stores**: Various HuggingFace and local filesystem stores

View file

@ -23,6 +23,17 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.
## Configure logging
Control log output via environment variables before starting the server.
- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug;core=info`.
- Supported categories: `all`, `core`, `server`, `router`, `inference`, `agents`, `safety`, `eval`, `tools`, `client`.
- Levels: `debug`, `info`, `warning`, `error`, `critical` (default is `info`). Use `all=<level>` to apply globally.
- `LLAMA_STACK_LOG_FILE=/path/to/log` mirrors logs to a file while still printing to stdout.
Export these variables prior to running `llama stack run`, launching a container, or starting the server through any other pathway.
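For example, a minimal setup that turns on debug logging for the server component and mirrors output to a file (the log file path is illustrative):

```bash
export LLAMA_STACK_LOGGING="server=debug;core=info"
export LLAMA_STACK_LOG_FILE=/tmp/llama-stack.log
llama stack run starter
```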
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 1
:hidden: :hidden:

View file

@ -4,65 +4,24 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db" import io, requests
client = LlamaStackClient(base_url="http://localhost:8321") from openai import OpenAI
models = client.models.list() url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Select the first LLM and first embedding models vs = client.vector_stores.create()
model_id = next(m for m in models if m.model_type == "llm").identifier response = requests.get(url)
embedding_model_id = ( pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
em := next(m for m in models if m.model_type == "embedding") uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
).identifier client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register( resp = client.responses.create(
vector_db_id=vector_db_id, model="openai/gpt-4o",
embedding_model=embedding_model_id, input="How do you do great work? Use the existing knowledge_search tool.",
embedding_dimension=embedding_dimension, tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
provider_id="faiss", include=["file_search_call.results"],
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
) )
prompt = "How do you do great work?" print(resp)
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)

View file

@ -58,15 +58,19 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
<Tabs>
<TabItem value="venv" label="Using venv">
You can use Python to install dependencies and run the Llama Stack server, which is useful for testing and development.
Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
Now let's install dependencies and run the Llama Stack config for Ollama.
We use `starter` as the template. By default all providers are disabled, so you need to enable Ollama by passing environment variables.
```bash ```bash
llama stack build --distro starter --image-type venv --run # Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
llama stack run starter
``` ```
</TabItem>
<TabItem value="container" label="Using a Container">
@ -304,7 +308,7 @@ stream = agent.create_turn(
for event in AgentEventLogger().log(stream): for event in AgentEventLogger().log(stream):
event.print() event.print()
``` ```
#### ii. Run the Script
Let's run the script using `uv`
```bash ```bash
uv run python agent.py uv run python agent.py

View file

@ -24,111 +24,62 @@ ollama run llama3.2:3b --keepalive 60m
#### Step 2: Run the Llama Stack server
We will use `uv` to install dependencies and run the Llama Stack server.
```bash ```bash
OLLAMA_URL=http://localhost:11434 \ # Install dependencies for the starter distribution
uv run --with llama-stack llama stack build --distro starter --image-type venv --run uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
``` ```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python title="demo_script.py" ```python
# Copyright (c) Meta Platforms, Inc. and affiliates. import io, requests
# All rights reserved. from openai import OpenAI
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
vector_db_id = "my_demo_vector_db" vs = client.vector_stores.create()
client = LlamaStackClient(base_url="http://localhost:8321") response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
models = client.models.list() resp = client.responses.create(
model="openai/gpt-4o",
# Select the first LLM and first embedding models input="How do you do great work? Use the existing knowledge_search tool.",
model_id = next(m for m in models if m.model_type == "llm").identifier tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
embedding_model_id = ( include=["file_search_call.results"],
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
) )
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)
```
We will use `uv` to run the script
``` ```
uv run --with llama-stack-client,fire,requests demo_script.py uv run --with llama-stack-client,fire,requests demo_script.py
``` ```
And you should see output like below.
```python
>print(resp.output[1].content[0].text)
To do great work, consider the following principles:
1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.
2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.
3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.
4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. Its essential to care for your physical health as well since it directly impacts your mental faculties and morale.
5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.
6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.
By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
``` ```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
:::tip HuggingFace access

View file

@ -14,16 +14,18 @@ Meta's reference implementation of an agent system that can use tools, access ve
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `persistence_store` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
| `responses_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
persistence_store: persistence:
type: sqlite agent_state:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/agents_store.db namespace: agents
responses_store: backend: kv_default
type: sqlite responses:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/responses_store.db table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
``` ```

View file

@ -14,7 +14,7 @@ Reference implementation of batches API with KVStore persistence.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. | | `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. | | `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
@ -22,6 +22,6 @@ Reference implementation of batches API with KVStore persistence.
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: batches
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db backend: kv_default
``` ```

View file

@ -14,12 +14,12 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: datasetio::localfs
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/localfs_datasetio.db backend: kv_default
``` ```

View file

@ -14,12 +14,12 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: datasetio::huggingface
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/huggingface_datasetio.db backend: kv_default
``` ```

View file

@ -1,5 +1,7 @@
--- ---
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates." description: "Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates."
sidebar_label: Eval sidebar_label: Eval
title: Eval title: Eval
--- ---
@ -8,6 +10,8 @@ title: Eval
## Overview ## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates. Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API. This section contains documentation for all available providers for the **eval** API.

View file

@ -14,12 +14,12 @@ Meta's reference implementation of evaluation tasks with support for multiple la
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: eval
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db backend: kv_default
``` ```

View file

@ -240,6 +240,6 @@ additional_pip_packages:
- sqlalchemy[asyncio] - sqlalchemy[asyncio]
``` ```
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc. No other steps are required beyond installing dependencies with `llama stack list-deps <distro> | xargs -L1 uv pip install` and then running `llama stack run`. The CLI will use `module` to install the provider dependencies, retrieve the spec, etc.
The provider will now be available in Llama Stack with the type `remote::ramalama`. The provider will now be available in Llama Stack with the type `remote::ramalama`.

View file

@ -15,7 +15,7 @@ Local filesystem-based file storage provider for managing files and documents lo
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files | | `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `ttl_secs` | `<class 'int'>` | No | 31536000 | | | `ttl_secs` | `<class 'int'>` | No | 31536000 | |
## Sample Configuration ## Sample Configuration
@ -23,6 +23,6 @@ Local filesystem-based file storage provider for managing files and documents lo
```yaml ```yaml
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files}
metadata_store: metadata_store:
type: sqlite table_name: files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/files_metadata.db backend: sql_default
``` ```

View file

@ -20,7 +20,7 @@ AWS S3-based file storage provider for scalable cloud file management with metad
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) | | `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) | | `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist | | `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
## Sample Configuration ## Sample Configuration
@ -32,6 +32,6 @@ aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=} endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false} auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store: metadata_store:
type: sqlite table_name: s3_files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db backend: sql_default
``` ```

View file

@ -22,7 +22,6 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories ## Provider Categories
- **[External Providers](external/index.mdx)** - Guide for building and using external providers - **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers - **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers - **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers - **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers
@ -31,3 +30,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Vector IO](vector_io/index.mdx)** - Vector database providers - **[Vector IO](vector_io/index.mdx)** - Vector database providers
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers - **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers - **[Files](files/index.mdx)** - File system and storage providers
## Other information about Providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack

View file

@ -1,3 +1,4 @@
---
title: OpenAI Compatibility title: OpenAI Compatibility
description: OpenAI API Compatibility description: OpenAI API Compatibility
sidebar_label: OpenAI Compatibility sidebar_label: OpenAI Compatibility
@ -47,7 +48,7 @@ models = client.models.list()
#### Responses #### Responses
> **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work. > **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. See [Known Limitations of the OpenAI-compatible Responses API in Llama Stack](./openai_responses_limitations.mdx) for more details.
##### Simple inference ##### Simple inference

View file

@ -0,0 +1,301 @@
---
title: Known Limitations of the OpenAI-compatible Responses API in Llama Stack
description: Limitations of Responses API
sidebar_label: Limitations of Responses API
sidebar_position: 1
---
## Unresolved Issues
This document outlines known limitations and inconsistencies between Llama Stack's Responses API and OpenAI's Responses API, based on the OpenAI APIs as of October 6, 2025 (OpenAI client version `openai==1.107`).
See the OpenAI [changelog](https://platform.openai.com/docs/changelog) for any functionality added since that date. Links to issues are included so readers can check status, post comments, and/or subscribe for updates on the limitations that interest them. We would also love feedback on any use-cases you try that do not work, to help prioritize the pieces left to implement.
Please open new issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work and does not already have an open issue.
### Instructions
**Status:** Partial Implementation + Work in Progress
**Issue:** [#3566](https://github.com/llamastack/llama-stack/issues/3566)
In Llama Stack, the instructions parameter is already implemented for creating a response, but it is not yet included in the output response object.
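A minimal sketch of the relevant request shape, using the OpenAI client pointed at Llama Stack as in the quick-start examples (the model name and prompt are placeholders). Per the status above, the `instructions` value shapes generation but is not yet echoed back on the response object:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    instructions="Answer in one short sentence.",
    input="What is the capital of France?",
)
# Works today: the instructions influence the generated text.
# Known gap: the returned response object does not include the instructions field yet.
print(resp)
```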
---
### Streaming
**Status:** Partial Implementation
**Issue:** [#2364](https://github.com/llamastack/llama-stack/issues/2364)
Streaming functionality for the Responses API is partially implemented and does work to some extent, but some streaming response objects that would be needed for full compatibility are still missing.
---
### Prompt Templates
**Status:** Partial Implementation
**Issue:** [#3321](https://github.com/llamastack/llama-stack/issues/3321)
OpenAI's platform supports [templated prompts using a structured language](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). These templates can be stored server-side for organizational sharing. This feature is under development for Llama Stack.
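For reference, OpenAI's reusable prompts are referenced by a stored prompt id rather than inlined text; a hedged sketch of the request shape (the prompt id and variables are placeholders, and this is not yet supported by Llama Stack):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    prompt={
        "id": "pmpt_example_123",               # placeholder: id of a server-side stored prompt
        "variables": {"customer_name": "Ada"},  # values substituted into the template
    },
)
```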
---
### Web-search tool compatibility
**Status:** Partial Implementation
Both OpenAI and Llama Stack support a web-search built-in tool. The [OpenAI documentation](https://platform.openai.com/docs/api-reference/responses/create) for web search tool in a Responses tool list says:
> The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`.
Is that correct? If so, what are the meanings of each of them? It might make sense for the allowed OpenAI values to map to corresponding Llama Stack values so that code written to the OpenAI specification also works with Llama Stack.
The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack. If feasible, it would be good to support these too.
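For context, an OpenAI-style request using the built-in web search tool looks like the sketch below (the model name is a placeholder; as noted above, the `type` values accepted by Llama Stack may differ):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="What is the latest Llama Stack release?",
    tools=[{"type": "web_search"}],  # OpenAI also accepts "web_search_2025_08_26"
)
print(resp)
```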
---
### Other built-in Tools
**Status:** Partial Implementation
OpenAI's Responses API includes an ecosystem of built-in tools (e.g., code interpreter) that lower the barrier to entry for agentic workflows. These tools are typically aligned with specific model training.
**Current Status in Llama Stack:**
- Some built-in tools exist (file search, web search)
- Missing tools include code interpreter, computer use, and image generation
- Some built-in tools may require additional APIs (e.g., [containers API](https://platform.openai.com/docs/api-reference/containers) for code interpreter)
It's unclear whether there is demand for additional built-in tools in Llama Stack. No upstream issues have been filed for adding more built-in tools.
---
### Response Branching
**Status:** Not Working
Response branching, as discussed in the [Agents vs OpenAI Responses API documentation](https://llamastack.github.io/docs/building_applications/responses_vs_agents), is not currently functional.
---
### Include
**Status:** Not Implemented
The `include` parameter allows you to provide a list of values that indicate additional information for the system to include in the model response. The [OpenAI API](https://platform.openai.com/docs/api-reference/responses/create) specifies the following allowed values for this parameter.
- `web_search_call.action.sources`
- `code_interpreter_call.outputs`
- `computer_call_output.output.image_url`
- `file_search_call.results`
- `message.input_image.image_url`
- `message.output_text.logprobs`
- `reasoning.encrypted_content`
Some of these are not relevant to Llama Stack in its current form. For example, code interpreter is not implemented (see "Other built-in Tools" above), so `code_interpreter_call.outputs` would not be a useful directive to Llama Stack.
However, others might be useful. For example, `message.output_text.logprobs` can be useful for assessing how confident a model is in each token of its output.
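As an illustration of one of the potentially useful values, this is how an OpenAI-style request would ask for per-token log probabilities via `include` (a sketch only; per the status above, Llama Stack does not honor this yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Summarize the benefits of vector databases in two sentences.",
    include=["message.output_text.logprobs"],  # not yet implemented in Llama Stack
)
```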
---
### Tool Choice
**Status:** Not Implemented
**Issue:** [#3548](https://github.com/llamastack/llama-stack/issues/3548)
In OpenAI's API, the `tool_choice` parameter allows you to set restrictions or requirements for which tools should be used when generating a response. This feature is not implemented in Llama Stack.
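For context, a hedged sketch of how `tool_choice` is used against OpenAI's API (the vector store id and model are placeholders; Llama Stack does not honor this parameter yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Find the relevant passage in my files.",
    tools=[{"type": "file_search", "vector_store_ids": ["vs_123"]}],  # placeholder id
    tool_choice="required",  # force tool use; "auto" and "none" are also accepted by OpenAI
)
```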
---
### Safety Identification and Tracking
**Status:** Not Implemented
OpenAI's platform allows users to track agentic users using a safety identifier passed with each response. When requests violate moderation or safety rules, account holders are alerted and automated actions can be taken. This capability is not currently available in Llama Stack.
---
### Connectors
**Status:** Not Implemented
Connectors are MCP servers maintained and managed by the Responses API provider. OpenAI has documented their connectors at [https://platform.openai.com/docs/guides/tools-connectors-mcp](https://platform.openai.com/docs/guides/tools-connectors-mcp).
**Open Questions:**
- Should Llama Stack include built-in support for some, all, or none of OpenAI's connectors?
- Should there be a mechanism for administrators to add custom connectors via `run.yaml` or an API?
---
### Reasoning
**Status:** Partially Implemented
The `reasoning` object in the output of Responses works for inference providers such as vLLM that output reasoning traces in chat completions requests. It does not work for other providers such as OpenAI's hosted service. See [#3551](https://github.com/llamastack/llama-stack/issues/3551) for more details.
---
### Service Tier
**Status:** Not Implemented
**Issue:** [#3550](https://github.com/llamastack/llama-stack/issues/3550)
Responses has a field `service_tier` that can be used to prioritize access to inference resources. Not all inference providers have such a concept, but Llama Stack should pass this value through to those providers that do. Currently it does not.
---
### Top Logprobs
**Status:** Not Implemented
**Issue:** [#3552](https://github.com/llamastack/llama-stack/issues/3552)
The `top_logprobs` parameter from OpenAI's Responses API extends the functionality obtained by including `message.output_text.logprobs` in the `include` parameter list (as discussed in the Include section above).
It enables users to also get logprobs for alternative tokens.
---
### Max Tool Calls
**Status:** Not Implemented
**Issue:** [#3563](https://github.com/llamastack/llama-stack/issues/3563)
The Responses API can accept a `max_tool_calls` parameter that limits the number of tool calls allowed to be executed for a given response. This feature needs full implementation and documentation.
---
### Max Output Tokens
**Status:** Not Implemented
**Issue:** [#3562](https://github.com/llamastack/llama-stack/issues/3562)
The `max_output_tokens` field limits how many tokens the model is allowed to generate (for both reasoning and output combined). It is not implemented in Llama Stack.
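A sketch of the OpenAI-side usage that is not yet enforced by Llama Stack (model and prompt are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Explain retrieval-augmented generation.",
    max_output_tokens=200,  # cap on reasoning + output tokens; ignored by Llama Stack today
)
```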
---
### Incomplete Details
**Status:** Not Implemented
**Issue:** [#3567](https://github.com/llamastack/llama-stack/issues/3567)
The return object from a call to Responses includes a field indicating why a response is incomplete, if it is. For example, if the model stops generating because it has reached the specified max output tokens (see above), this field should be set to `IncompleteDetails(reason='max_output_tokens')`. This is not implemented in Llama Stack.
---
### Metadata
**Status:** Not Implemented
**Issue:** [#3564](https://github.com/llamastack/llama-stack/issues/3564)
Metadata allows you to attach additional information to a response for your own reference and tracking. It is not implemented in Llama Stack.
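A sketch of how metadata is attached in the OpenAI API (the keys and values are arbitrary illustrative tags; Llama Stack does not store them yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Hello!",
    metadata={"session": "demo-42", "team": "docs"},  # free-form key/value tags
)
```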
---
### Background
**Status:** Not Implemented
**Issue:** [#3568](https://github.com/llamastack/llama-stack/issues/3568)
[Background mode](https://platform.openai.com/docs/guides/background) in OpenAI Responses lets you start a response generation job and then check back in on it later. This is useful if you might lose a connection during a generation and want to reconnect later and get the response back (for example if the client is running in a mobile app). It is not implemented in Llama Stack.
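A hedged sketch of the OpenAI-side workflow this section describes (start the job, then poll for it later); Llama Stack does not support it yet:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Write a long report on retrieval-augmented generation.",
    background=True,  # return immediately; generation continues server-side (OpenAI behavior)
)

# Later, possibly from a different process or after reconnecting:
final = client.responses.retrieve(resp.id)
print(final.status)  # e.g. "in_progress" or "completed"
```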
---
### Global Guardrails
**Status:** Feature Request
When calling the OpenAI Responses API, model outputs go through safety models configured by OpenAI administrators. Perhaps Llama Stack should provide a mechanism to configure safety models (or non-model logic) for all Responses requests, either through `run.yaml` or an administrative API.
---
### User-Controlled Guardrails
**Status:** Feature Request
**Issue:** [#3325](https://github.com/llamastack/llama-stack/issues/3325)
OpenAI has not released a way for users to configure their own guardrails. However, Llama Stack users may want this capability to complement or replace global guardrails. This could be implemented as a non-breaking, additive difference from the OpenAI API.
---
### MCP Elicitations
**Status:** Unknown
Elicitations allow MCP servers to request additional information from users through the client during interactions (e.g., a tool requesting a username before proceeding).
See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/elicitation) for details.
**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make that work within the API as is? Or would the API need to change?
- Does this work in Llama Stack?
---
### MCP Sampling
**Status:** Unknown
Sampling allows MCP tools to query the generative AI model. See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/sampling) for details.
**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make that work within the API as is? Or would the API need to change?
- Does this work in Llama Stack?
### Prompt Caching
**Status:** Unknown
OpenAI provides a [prompt caching](https://platform.openai.com/docs/guides/prompt-caching) mechanism in Responses that is enabled for its most recent models.
**Open Questions:**
- Does this work in Llama Stack?
- If not, is there a reasonable way to make that work for those inference providers that have this capability by passing through the provided `prompt_cache_key` to the inference provider?
- Is there a reasonable way to make that work for inference providers that don't build in this capability by doing some sort of caching at the Llama Stack layer?
---
### Parallel Tool Calls
**Status:** Rumored Issue
There are reports that `parallel_tool_calls` may not work correctly. This needs verification and a ticket should be opened if confirmed.
---
## Resolved Issues
The following limitations have been addressed in recent releases:
### MCP and Function Tools with No Arguments
**Status:** ✅ Resolved
MCP and function tools now work correctly even when they have no arguments.
---
### `require_approval` Parameter for MCP Tools
**Status:** ✅ Resolved
The `require_approval` parameter for MCP tools in the Responses API now works correctly.
---
### MCP Tools with Array-Type Arguments
**Status:** ✅ Resolved
**Fixed in:** [#3003](https://github.com/llamastack/llama-stack/pull/3003) (Agent API), [#3602](https://github.com/llamastack/llama-stack/pull/3602) (Responses API)
MCP tools now correctly handle array-type arguments in both the Agent API and Responses API.

View file

@ -16,14 +16,12 @@ Meta's reference implementation of telemetry and observability using OpenTelemet
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. | | `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `service_name` | `<class 'str'>` | No | | The service name to use for telemetry | | `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [&lt;TelemetrySink.SQLITE: 'sqlite'&gt;] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) | | `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console) |
| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=sqlite} sinks: ${env.TELEMETRY_SINKS:=}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
``` ```

View file

@ -79,13 +79,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.CHROMADB_PATH} db_path: ${env.CHROMADB_PATH}
kvstore: persistence:
type: sqlite namespace: vector_io::chroma
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_inline_registry.db backend: kv_default
``` ```

View file

@ -95,12 +95,12 @@ more details about Faiss in general.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: persistence:
type: sqlite namespace: vector_io::faiss
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db backend: kv_default
``` ```

View file

@ -14,14 +14,14 @@ Meta's reference implementation of a vector database.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: persistence:
type: sqlite namespace: vector_io::faiss
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db backend: kv_default
``` ```
## Deprecation Notice ## Deprecation Notice

View file

@ -17,14 +17,14 @@ Please refer to the remote provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db
kvstore: persistence:
type: sqlite namespace: vector_io::milvus
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_registry.db backend: kv_default
``` ```

View file

@ -98,13 +98,13 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | | | | `path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db
kvstore: persistence:
type: sqlite namespace: vector_io::qdrant
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db backend: kv_default
``` ```

View file

@ -408,13 +408,13 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore: persistence:
type: sqlite namespace: vector_io::sqlite_vec
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db backend: kv_default
``` ```

View file

@ -17,15 +17,15 @@ Please refer to the sqlite-vec provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore: persistence:
type: sqlite namespace: vector_io::sqlite_vec
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db backend: kv_default
``` ```
## Deprecation Notice ## Deprecation Notice

View file

@ -78,13 +78,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `url` | `str \| None` | No | | | | `url` | `str \| None` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
url: ${env.CHROMADB_URL} url: ${env.CHROMADB_URL}
kvstore: persistence:
type: sqlite namespace: vector_io::chroma_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_remote_registry.db backend: kv_default
``` ```

View file

@ -408,7 +408,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server | | `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server | | `token` | `str \| None` | No | | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. | | `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
:::note :::note
@ -420,7 +420,7 @@ This configuration class accepts additional fields beyond those listed above. Yo
```yaml ```yaml
uri: ${env.MILVUS_ENDPOINT} uri: ${env.MILVUS_ENDPOINT}
token: ${env.MILVUS_TOKEN} token: ${env.MILVUS_TOKEN}
kvstore: persistence:
type: sqlite namespace: vector_io::milvus_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db backend: kv_default
``` ```

View file

@ -218,7 +218,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
| `db` | `str \| None` | No | postgres | | | `db` | `str \| None` | No | postgres | |
| `user` | `str \| None` | No | postgres | | | `user` | `str \| None` | No | postgres | |
| `password` | `str \| None` | No | mysecretpassword | | | `password` | `str \| None` | No | mysecretpassword | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) | | `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
@ -228,7 +228,7 @@ port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB} db: ${env.PGVECTOR_DB}
user: ${env.PGVECTOR_USER} user: ${env.PGVECTOR_USER}
password: ${env.PGVECTOR_PASSWORD} password: ${env.PGVECTOR_PASSWORD}
kvstore: persistence:
type: sqlite namespace: vector_io::pgvector
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db backend: kv_default
``` ```

View file

@ -26,13 +26,13 @@ Please refer to the inline provider documentation.
| `prefix` | `str \| None` | No | | | | `prefix` | `str \| None` | No | | |
| `timeout` | `int \| None` | No | | | | `timeout` | `int \| None` | No | | |
| `host` | `str \| None` | No | | | | `host` | `str \| None` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
api_key: ${env.QDRANT_API_KEY:=} api_key: ${env.QDRANT_API_KEY:=}
kvstore: persistence:
type: sqlite namespace: vector_io::qdrant_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db backend: kv_default
``` ```

View file

@@ -75,14 +75,14 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
|-------|------|----------|---------|-------------|
| `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
| `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
+| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration
```yaml
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
-kvstore:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db
+persistence:
+  namespace: vector_io::weaviate
+  backend: kv_default
```
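Across the Milvus, PGVector, Qdrant, and Weaviate diffs above, the pattern is the same: the per-provider `kvstore` block is replaced by a `persistence` reference that names a namespace plus a shared KV backend (`kv_default`). As a rough illustration only, mirroring the sample YAML above (the `kv_default` backend itself is assumed to be declared elsewhere in the run config's storage section), the same settings expressed as a Python provider-config dict would look like:

```python
# Illustrative sketch, not a canonical config: field names are taken from the
# sample YAML above. Only the namespace differs per provider
# (e.g. vector_io::qdrant_remote, vector_io::pgvector).
weaviate_provider_config = {
    "weaviate_api_key": None,
    "weaviate_cluster_url": "localhost:8080",
    "persistence": {
        "namespace": "vector_io::weaviate",
        "backend": "kv_default",  # assumed to be defined centrally in the run config
    },
}
```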

View file

@@ -32,7 +32,6 @@ Commands:
scoring_functions Manage scoring functions.
shields Manage safety shield services.
toolgroups Manage available tool groups.
-vector_dbs Manage vector databases.
```
### `llama-stack-client configure`
@@ -211,53 +210,6 @@ Unregister a model from distribution endpoint
llama-stack-client models unregister <model_id>
```
## Vector DB Management
Manage vector databases.
### `llama-stack-client vector_dbs list`
Show available vector dbs on distribution endpoint
```bash
llama-stack-client vector_dbs list
```
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 768 │
│ │ │ │ │ embedding_model: nomic-embed-text-v1.5 │
│ │ │ │ │ type: vector_db │
│ │ │ │ │ │
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
```
### `llama-stack-client vector_dbs register`
Create a new vector db
```bash
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
Optional arguments:
- `--provider-id`: Provider ID for the vector db
- `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: `nomic-embed-text-v1.5`
- `--embedding-dimension`: Dimension of embeddings. Default: 768
### `llama-stack-client vector_dbs unregister`
Delete a vector db
```bash
llama-stack-client vector_dbs unregister <vector-db-id>
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
## Shield Management
Manage safety shield services.
### `llama-stack-client shields list`
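With the `vector_dbs` subcommands removed above, the equivalent operations go through the OpenAI-compatible vector stores API instead. A minimal sketch using the Python client follows; the method names mirror the OpenAI-compatible surface used later in this change, so verify them against your installed `llama-stack-client`:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# roughly replaces `llama-stack-client vector_dbs register`
store = client.vector_stores.create(name="my_demo_vector_db")

# roughly replaces `llama-stack-client vector_dbs list`
for vs in client.vector_stores.list():
    print(vs.id, vs.name)

# roughly replaces `llama-stack-client vector_dbs unregister`
client.vector_stores.delete(vector_store_id=store.id)
```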

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -2864,7 +2864,7 @@
}
],
"source": [
-"!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
+"!llama stack list-deps experimental-post-training | xargs -L1 uv pip install"
]
},
{

View file

@@ -38,7 +38,7 @@
"source": [
"# NBVAL_SKIP\n",
"!pip install -U llama-stack\n",
-"!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
+"llama stack list-deps fireworks | xargs -L1 uv pip install\n"
]
},
{
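The notebook edits above all follow the same pattern: `llama stack build` is replaced by installing the distribution's dependencies with `llama stack list-deps ... | xargs -L1 uv pip install`, with the server started separately by `llama stack run`. A minimal sketch of that two-step flow, using the starter distribution and the Ollama URL that appear in the other examples in this change:

```python
import subprocess

# Step 1: install the distribution's dependencies (replaces `llama stack build`).
subprocess.run(
    "uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install",
    shell=True,
    check=True,
)

# Step 2: start the server as its own step.
server = subprocess.Popen(
    "OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter",
    shell=True,
)
```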

File diff suppressed because it is too large

View file

@@ -136,7 +136,8 @@
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
-" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
+" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
+" \"uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
@@ -172,7 +173,7 @@
"\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes using pkill command\n",
-" os.system(\"pkill -f llama_stack.core.server.server\")"
+" os.system(\"pkill -f llama_stack.core.server.server\")\n"
]
},
{

View file

@@ -105,7 +105,8 @@
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
-" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
+" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
+" \"uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

View file

@@ -92,7 +92,7 @@
"metadata": {},
"source": [
"```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
"```"
]
},

View file

@@ -81,7 +81,7 @@
"metadata": {},
"source": [
"```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
"```"
]
},

View file

@@ -30,3 +30,5 @@ fi
stack_dir=$(dirname $(dirname $THIS_DIR))
PYTHONPATH=$PYTHONPATH:$stack_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/static
cp $stack_dir/docs/static/stainless-llama-stack-spec.yaml $stack_dir/client-sdks/stainless/openapi.yml

View file

@@ -1,366 +1,399 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c1e7571c",
"metadata": {
"id": "c1e7571c"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
"\n",
"# Llama Stack - Building AI Applications\n",
"\n",
"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
"\n",
"Get started with Llama Stack in minutes!\n",
"\n",
"[Llama Stack](https://github.com/meta-llama/llama-stack) is a stateful service with REST APIs to support the seamless transition of AI applications across different environments. You can build and test using a local server first and deploy to a hosted endpoint for production.\n",
"\n",
"In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)\n",
"as the inference [provider](docs/source/providers/index.md#inference) for a Llama Model.\n"
]
},
{
"cell_type": "markdown",
"id": "4CV1Q19BDMVw",
"metadata": {
"id": "4CV1Q19BDMVw"
},
"source": [
"## Step 1: Install and setup"
]
},
{
"cell_type": "markdown",
"id": "K4AvfUAJZOeS",
"metadata": {
"id": "K4AvfUAJZOeS"
},
"source": [
"### 1.1. Install uv and test inference with Ollama\n",
"\n",
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2d7b85",
"metadata": {},
"outputs": [],
"source": [
"%pip install uv llama_stack llama-stack-client\n",
"\n",
"## If running on Collab:\n",
"# !pip install colab-xterm\n",
"# %load_ext colabxterm\n",
"\n",
"!curl https://ollama.ai/install.sh | sh"
]
},
{
"cell_type": "markdown",
"id": "39fa584b",
"metadata": {},
"source": [
"### 1.2. Test inference with Ollama"
]
},
{
"cell_type": "markdown",
"id": "3bf81522",
"metadata": {},
"source": [
"Well now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e8e0f1",
"metadata": {},
"outputs": [],
"source": [
"## If running on Colab:\n",
"# %xterm\n",
"\n",
"## To be ran in the terminal:\n",
"# ollama serve &\n",
"# ollama run llama3.2:3b --keepalive 60m"
]
},
{
"cell_type": "markdown",
"id": "f3c5f243",
"metadata": {},
"source": [
"If successful, you should see the model respond to a prompt.\n",
"\n",
"...\n",
"```\n",
">>> hi\n",
"Hello! How can I assist you today?\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "oDUB7M_qe-Gs",
"metadata": {
"id": "oDUB7M_qe-Gs"
},
"source": [
"## Step 2: Run the Llama Stack server\n",
"\n",
"In this showcase, we will start a Llama Stack server that is running locally."
]
},
{
"cell_type": "markdown",
"id": "732eadc6",
"metadata": {},
"source": [
"### 2.1. Setup the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack build --distro starter\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True\n",
" )\n",
"\n",
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
" import time\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"# use this helper if needed to kill the server\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes\n",
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
]
},
{
"cell_type": "markdown",
"id": "c40e9efd",
"metadata": {},
"source": [
"### 2.2. Start the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f779283d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Llama Stack server with PID: 787100\n",
"Waiting for server to start\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "28477c03",
"metadata": {},
"source": [
"## Step 3: Run the demo"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7da71011",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html\n",
"prompt> How do you do great work?\n",
"\u001b[33minference> \u001b[0m\u001b[33m[k\u001b[0m\u001b[33mnowledge\u001b[0m\u001b[33m_search\u001b[0m\u001b[33m(query\u001b[0m\u001b[33m=\"\u001b[0m\u001b[33mWhat\u001b[0m\u001b[33m is\u001b[0m\u001b[33m the\u001b[0m\u001b[33m key\u001b[0m\u001b[33m to\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m\")]\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n', type='text'), TextContentItem(text=\"Result 1:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 2:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 3:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 4:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 5:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text='END of knowledge_search tool results.\\n', type='text'), TextContentItem(text='The above results were retrieved to help answer the user\\'s query: \"What is the key to doing great work\". Use them as supporting information only in answering this query.\\n', type='text')]\u001b[0m\n",
"\u001b[33minference> \u001b[0m\u001b[33mDoing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m means\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m something\u001b[0m\u001b[33m important\u001b[0m\u001b[33m so\u001b[0m\u001b[33m well\u001b[0m\u001b[33m that\u001b[0m\u001b[33m you\u001b[0m\u001b[33m expand\u001b[0m\u001b[33m people\u001b[0m\u001b[33m's\u001b[0m\u001b[33m ideas\u001b[0m\u001b[33m of\u001b[0m\u001b[33m what\u001b[0m\u001b[33m's\u001b[0m\u001b[33m possible\u001b[0m\u001b[33m.\u001b[0m\u001b[33m However\u001b[0m\u001b[33m,\u001b[0m\u001b[33m there\u001b[0m\u001b[33m's\u001b[0m\u001b[33m no\u001b[0m\u001b[33m threshold\u001b[0m\u001b[33m for\u001b[0m\u001b[33m importance\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m's\u001b[0m\u001b[33m often\u001b[0m\u001b[33m hard\u001b[0m\u001b[33m to\u001b[0m\u001b[33m judge\u001b[0m\u001b[33m at\u001b[0m\u001b[33m the\u001b[0m\u001b[33m time\u001b[0m\u001b[33m anyway\u001b[0m\u001b[33m.\u001b[0m\u001b[33m Great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m is\u001b[0m\u001b[33m a\u001b[0m\u001b[33m matter\u001b[0m\u001b[33m of\u001b[0m\u001b[33m degree\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m can\u001b[0m\u001b[33m be\u001b[0m\u001b[33m difficult\u001b[0m\u001b[33m to\u001b[0m\u001b[33m determine\u001b[0m\u001b[33m whether\u001b[0m\u001b[33m someone\u001b[0m\u001b[33m has\u001b[0m\u001b[33m done\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m until\u001b[0m\u001b[33m after\u001b[0m\u001b[33m the\u001b[0m\u001b[33m fact\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[30m\u001b[0m"
]
}
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"\n",
"vector_db_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
"embedding_model_id = embedding_model.identifier\n",
"embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
"\n",
"_ = client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",
" embedding_model=embedding_model_id,\n",
" embedding_dimension=embedding_dimension,\n",
" provider_id=\"faiss\",\n",
")\n",
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
"print(\"rag_tool> Ingesting document:\", source)\n",
"document = RAGDocument(\n",
" document_id=\"document_1\",\n",
" content=source,\n",
" mime_type=\"text/html\",\n",
" metadata={},\n",
")\n",
"client.tool_runtime.rag_tool.insert(\n",
" documents=[document],\n",
" vector_db_id=vector_db_id,\n",
" chunk_size_in_tokens=50,\n",
")\n",
"agent = Agent(\n",
" client,\n",
" model=model_id,\n",
" instructions=\"You are a helpful assistant\",\n",
" tools=[\n",
" {\n",
" \"name\": \"builtin::rag/knowledge_search\",\n",
" \"args\": {\"vector_db_ids\": [vector_db_id]},\n",
" }\n",
" ],\n",
")\n",
"\n",
"prompt = \"How do you do great work?\"\n",
"print(\"prompt>\", prompt)\n",
"\n",
"response = agent.create_turn(\n",
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" session_id=agent.create_session(\"rag_session\"),\n",
" stream=True,\n",
")\n",
"\n",
"for log in AgentEventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "markdown",
"id": "341aaadf",
"metadata": {},
"source": [
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
]
},
{
"cell_type": "markdown",
"id": "e88e1185",
"metadata": {},
"source": [
"## Next Steps"
]
},
{
"cell_type": "markdown",
"id": "bcb73600",
"metadata": {},
"source": [
"Now you're ready to dive deeper into Llama Stack!\n",
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
},
{
"cell_type": "markdown",
"id": "4CV1Q19BDMVw",
"metadata": {
"id": "4CV1Q19BDMVw"
},
"source": [
"## Step 1: Install and setup"
]
},
{
"cell_type": "markdown",
"id": "K4AvfUAJZOeS",
"metadata": {
"id": "K4AvfUAJZOeS"
},
"source": [
"### 1.1. Install uv and test inference with Ollama\n",
"\n",
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2d7b85",
"metadata": {},
"outputs": [],
"source": [
"%pip install uv llama_stack llama-stack-client\n",
"\n",
"## If running on Collab:\n",
"# !pip install colab-xterm\n",
"# %load_ext colabxterm\n",
"\n",
"!curl https://ollama.ai/install.sh | sh"
]
},
{
"cell_type": "markdown",
"id": "39fa584b",
"metadata": {},
"source": [
"### 1.2. Test inference with Ollama"
]
},
{
"cell_type": "markdown",
"id": "3bf81522",
"metadata": {},
"source": [
"Well now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e8e0f1",
"metadata": {},
"outputs": [],
"source": [
"## If running on Colab:\n",
"# %xterm\n",
"\n",
"## To be ran in the terminal:\n",
"# ollama serve &\n",
"# ollama run llama3.2:3b --keepalive 60m"
]
},
{
"cell_type": "markdown",
"id": "f3c5f243",
"metadata": {},
"source": [
"If successful, you should see the model respond to a prompt.\n",
"\n",
"...\n",
"```\n",
">>> hi\n",
"Hello! How can I assist you today?\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "oDUB7M_qe-Gs",
"metadata": {
"id": "oDUB7M_qe-Gs"
},
"source": [
"## Step 2: Run the Llama Stack server\n",
"\n",
"In this showcase, we will start a Llama Stack server that is running locally."
]
},
{
"cell_type": "markdown",
"id": "732eadc6",
"metadata": {},
"source": [
"### 2.1. Setup the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m52 packages\u001b[0m \u001b[2min 1.56s\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 122ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 197ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 11ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True\n",
" )\n",
"\n",
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
" import time\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"# use this helper if needed to kill the server\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes\n",
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
]
},
{
"cell_type": "markdown",
"id": "c40e9efd",
"metadata": {},
"source": [
"### 2.2. Start the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f779283d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Llama Stack server with PID: 20778\n",
"Waiting for server to start........\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "28477c03",
"metadata": {},
"source": [
"## Step 3: Run the demo"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7da71011",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/vector_stores \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/conversations \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/responses \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"prompt> How do you do great work?\n",
"🤔 Doing great work involves a combination of skills, habits, and mindsets. Here are some key principles:\n",
"\n",
"1. **Set Clear Goals**: Start with a clear vision of what you want to achieve. Define specific, measurable, achievable, relevant, and time-bound (SMART) goals.\n",
"\n",
"2. **Plan and Prioritize**: Break your goals into smaller, manageable tasks. Prioritize these tasks based on their importance and urgency.\n",
"\n",
"3. **Focus on Quality**: Aim for high-quality outcomes rather than just finishing tasks. Pay attention to detail, and ensure your work meets or exceeds standards.\n",
"\n",
"4. **Stay Organized**: Keep your workspace, both physical and digital, organized to help you stay focused and efficient.\n",
"\n",
"5. **Manage Your Time**: Use time management techniques such as the Pomodoro Technique, time blocking, or the Eisenhower Box to maximize productivity.\n",
"\n",
"6. **Seek Feedback and Learn**: Regularly seek feedback from peers, mentors, or supervisors. Use constructive criticism to improve continuously.\n",
"\n",
"7. **Innovate and Improve**: Look for ways to improve processes or introduce new ideas. Be open to change and willing to adapt.\n",
"\n",
"8. **Stay Motivated and Persistent**: Keep your end goals in mind to stay motivated. Overcome setbacks with resilience and persistence.\n",
"\n",
"9. **Balance and Rest**: Ensure you maintain a healthy work-life balance. Take breaks and manage stress to sustain long-term productivity.\n",
"\n",
"10. **Reflect and Adjust**: Regularly assess your progress and adjust your strategies as needed. Reflect on what works well and what doesn't.\n",
"\n",
"By incorporating these elements, you can consistently produce high-quality work and achieve excellence in your endeavors.\n"
]
}
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"import requests\n",
"\n",
"vector_store_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"\n",
"\n",
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
"response = requests.get(source)\n",
"file = client.files.create(\n",
" file=response.content,\n",
" purpose='assistants'\n",
")\n",
"vector_store = client.vector_stores.create(\n",
" name=vector_store_id,\n",
" file_ids=[file.id],\n",
")\n",
"\n",
"agent = Agent(\n",
" client,\n",
" model=model_id,\n",
" instructions=\"You are a helpful assistant\",\n",
" tools=[\n",
" {\n",
" \"type\": \"file_search\",\n",
" \"vector_store_ids\": [vector_store_id],\n",
" }\n",
" ],\n",
")\n",
"\n",
"prompt = \"How do you do great work?\"\n",
"print(\"prompt>\", prompt)\n",
"\n",
"response = agent.create_turn(\n",
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" session_id=agent.create_session(\"rag_session\"),\n",
" stream=True,\n",
")\n",
"\n",
"for log in AgentEventLogger().log(response):\n",
" print(log, end=\"\")"
]
},
{
"cell_type": "markdown",
"id": "341aaadf",
"metadata": {},
"source": [
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
]
},
{
"cell_type": "markdown",
"id": "e88e1185",
"metadata": {},
"source": [
"## Next Steps"
]
},
{
"cell_type": "markdown",
"id": "bcb73600",
"metadata": {},
"source": [
"Now you're ready to dive deeper into Llama Stack!\n",
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
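Compared with the old notebook, the new demo no longer registers a vector DB or calls the `builtin::rag/knowledge_search` tool; it uploads the document through the Files API, wraps it in a vector store, and points the agent's `file_search` tool at that store. Condensed from the cells above (the only tweak is passing the created store's id to `vector_store_ids`, where the notebook passes the store's name):

```python
import requests
from llama_stack_client import Agent, LlamaStackClient

client = LlamaStackClient(base_url="http://0.0.0.0:8321")
model_id = next(m for m in client.models.list() if m.model_type == "llm").identifier

# Upload the source document and index it in a vector store.
html = requests.get("https://www.paulgraham.com/greatwork.html")
file = client.files.create(file=html.content, purpose="assistants")
vector_store = client.vector_stores.create(name="my_demo_vector_db", file_ids=[file.id])

# The agent now uses the built-in file_search tool instead of builtin::rag.
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
)
```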

View file

@@ -47,11 +47,11 @@ function QuickStart() {
<pre><code>{`# Install uv and start Ollama
ollama run llama3.2:3b --keepalive 60m
+# Install server dependencies
+uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run Llama Stack server
-OLLAMA_URL=http://localhost:11434 \\
-  uv run --with llama-stack \\
-  llama stack build --distro starter \\
-  --image-type venv --run
+OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
# Try the Python SDK
from llama_stack_client import LlamaStackClient
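The hunk cuts off at the SDK import; for reference, the corresponding first steps from the getting-started notebook elsewhere in this change look roughly like this (base URL and model selection are illustrative):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Pick the first LLM the server exposes.
models = client.models.list()
model_id = next(m for m in models if m.model_type == "llm").identifier
print("Using model:", model_id)
```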

File diff suppressed because it is too large

View file

@@ -1569,16 +1569,16 @@ paths:
required: true
deprecated: true
x-llama-stack-extra-body-params:
-- name: shields
+- name: guardrails
schema:
type: array
items:
oneOf:
- type: string
-- $ref: '#/components/schemas/ResponseShieldSpec'
+- $ref: '#/components/schemas/ResponseGuardrailSpec'
description: >-
-List of shields to apply during response generation. Shields provide safety
-and content moderation.
+List of guardrails to apply during response generation. Guardrails provide
+safety and content moderation.
required: false
/v1/openai/v1/responses/{response_id}:
get:
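The renamed extra-body parameter accepts either plain strings or `ResponseGuardrailSpec` objects. A hypothetical call passing it through an OpenAI-style `extra_body` follows; the guardrail identifier is a placeholder, and whether your SDK forwards extra body fields this way should be verified against the client you use:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Summarize our content policy.",
    extra_body={
        "guardrails": [
            "llama-guard",            # string shorthand (placeholder identifier)
            {"type": "llama-guard"},  # ResponseGuardrailSpec form
        ]
    },
)
```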
@ -2600,238 +2600,6 @@ paths:
$ref: '#/components/schemas/SupervisedFineTuneRequest' $ref: '#/components/schemas/SupervisedFineTuneRequest'
required: true required: true
deprecated: true deprecated: true
/v1/telemetry/metrics/{metric_name}:
post:
responses:
'200':
description: A QueryMetricsResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query metrics.
description: Query metrics.
parameters:
- name: metric_name
in: path
description: The name of the metric to query.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsRequest'
required: true
deprecated: true
/v1/telemetry/spans:
post:
responses:
'200':
description: A QuerySpansResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query spans.
description: Query spans.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansRequest'
required: true
deprecated: true
/v1/telemetry/spans/export:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Save spans to a dataset.
description: Save spans to a dataset.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
deprecated: true
/v1/telemetry/spans/{span_id}/tree:
post:
responses:
'200':
description: A QuerySpanTreeResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpanTreeResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span tree by its ID.
description: Get a span tree by its ID.
parameters:
- name: span_id
in: path
description: The ID of the span to get the tree from.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/GetSpanTreeRequest'
required: true
deprecated: true
/v1/telemetry/traces:
post:
responses:
'200':
description: A QueryTracesResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query traces.
description: Query traces.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
deprecated: true
/v1/telemetry/traces/{trace_id}:
get:
responses:
'200':
description: A Trace.
content:
application/json:
schema:
$ref: '#/components/schemas/Trace'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a trace by its ID.
description: Get a trace by its ID.
parameters:
- name: trace_id
in: path
description: The ID of the trace to get.
required: true
schema:
type: string
deprecated: true
/v1/telemetry/traces/{trace_id}/spans/{span_id}:
get:
responses:
'200':
description: A Span.
content:
application/json:
schema:
$ref: '#/components/schemas/Span'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span by its ID.
description: Get a span by its ID.
parameters:
- name: trace_id
in: path
description: >-
The ID of the trace to get the span from.
required: true
schema:
type: string
- name: span_id
in: path
description: The ID of the span to get.
required: true
schema:
type: string
deprecated: true
jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema
components:
@@ -4346,7 +4114,7 @@ components:
enum:
- model
- shield
-- vector_db
+- vector_store
- dataset
- scoring_function
- benchmark
@@ -4535,7 +4303,7 @@ components:
enum:
- model
- shield
-- vector_db
+- vector_store
- dataset
- scoring_function
- benchmark
@@ -6564,6 +6332,25 @@ components:
url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation'
container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation'
file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath'
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
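For reference, an instance of the `OpenAIResponseContentPartRefusal` schema added above is just a two-field object (the refusal text here is made up for illustration):

```python
# Matches the schema above: both fields are required, type is always "refusal".
refusal_part = {
    "type": "refusal",
    "refusal": "I can't help with that request.",
}
```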
OpenAIResponseError:
type: object
properties:
@@ -6590,6 +6377,8 @@ components:
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage'
"OpenAIResponseInputFunctionToolCallOutput":
type: object
@@ -6945,6 +6734,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
+instructions:
+type: string
+description: >-
+(Optional) System message inserted into the model's context
input:
type: array
items:
@@ -6985,6 +6778,15 @@ components:
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
OpenAIResponseOutputMessageContent:
+oneOf:
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
+- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
+discriminator:
+propertyName: type
+mapping:
+output_text: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
+refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
+"OpenAIResponseOutputMessageContentOutputText":
type: object
properties:
text:
@@ -7379,18 +7181,18 @@ components:
- total_tokens
title: OpenAIResponseUsage
description: Usage information for OpenAI response.
-ResponseShieldSpec:
+ResponseGuardrailSpec:
type: object
properties:
type:
type: string
-description: The type/identifier of the shield.
+description: The type/identifier of the guardrail.
additionalProperties: false
required:
- type
-title: ResponseShieldSpec
+title: ResponseGuardrailSpec
description: >-
-Specification for a shield to apply during response generation.
+Specification for a guardrail to apply during response generation.
OpenAIResponseInputTool:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@@ -7605,6 +7407,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
+instructions:
+type: string
+description: >-
+(Optional) System message inserted into the model's context
additionalProperties: false
required:
- created_at
@@ -7696,25 +7502,6 @@ components:
title: OpenAIResponseContentPartReasoningText
description: >-
Reasoning text emitted as part of a streamed response.
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
OpenAIResponseObjectStream:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@@ -10341,434 +10128,6 @@ components:
- hyperparam_search_config
- logger_config
title: SupervisedFineTuneRequest
QueryMetricsRequest:
type: object
properties:
start_time:
type: integer
description: The start time of the metric to query.
end_time:
type: integer
description: The end time of the metric to query.
granularity:
type: string
description: The granularity of the metric to query.
query_type:
type: string
enum:
- range
- instant
description: The type of query to perform.
label_matchers:
type: array
items:
type: object
properties:
name:
type: string
description: The name of the label to match
value:
type: string
description: The value to match against
operator:
type: string
enum:
- '='
- '!='
- =~
- '!~'
description: >-
The comparison operator to use for matching
default: '='
additionalProperties: false
required:
- name
- value
- operator
title: MetricLabelMatcher
description: >-
A matcher for filtering metrics by label values.
description: >-
The label matchers to apply to the metric.
additionalProperties: false
required:
- start_time
- query_type
title: QueryMetricsRequest
MetricDataPoint:
type: object
properties:
timestamp:
type: integer
description: >-
Unix timestamp when the metric value was recorded
value:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
MetricLabel:
type: object
properties:
name:
type: string
description: The name of the label
value:
type: string
description: The value of the label
additionalProperties: false
required:
- name
- value
title: MetricLabel
description: A label associated with a metric.
MetricSeries:
type: object
properties:
metric:
type: string
description: The name of the metric
labels:
type: array
items:
$ref: '#/components/schemas/MetricLabel'
description: >-
List of labels associated with this metric series
values:
type: array
items:
$ref: '#/components/schemas/MetricDataPoint'
description: >-
List of data points in chronological order
additionalProperties: false
required:
- metric
- labels
- values
title: MetricSeries
description: A time series of metric data points.
QueryMetricsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/MetricSeries'
description: >-
List of metric series matching the query criteria
additionalProperties: false
required:
- data
title: QueryMetricsResponse
description: >-
Response containing metric time series data.
QueryCondition:
type: object
properties:
key:
type: string
description: The attribute key to filter on
op:
$ref: '#/components/schemas/QueryConditionOp'
description: The comparison operator to apply
value:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The value to compare against
additionalProperties: false
required:
- key
- op
- value
title: QueryCondition
description: A condition for filtering query results.
QueryConditionOp:
type: string
enum:
- eq
- ne
- gt
- lt
title: QueryConditionOp
description: >-
Comparison operators for query conditions.
QuerySpansRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the spans.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_return
title: QuerySpansRequest
Span:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: Span
description: >-
A span representing a single operation within a trace.
QuerySpansResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Span'
description: >-
List of spans matching the query criteria
additionalProperties: false
required:
- data
title: QuerySpansResponse
description: Response containing a list of spans.
SaveSpansToDatasetRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_save:
type: array
items:
type: string
description: The attributes to save to the dataset.
dataset_id:
type: string
description: >-
The ID of the dataset to save the spans to.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
GetSpanTreeRequest:
type: object
properties:
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the tree.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
title: GetSpanTreeRequest
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
SpanWithStatus:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
(Optional) The current status of the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: SpanWithStatus
description: A span that includes status information.
QuerySpanTreeResponse:
type: object
properties:
data:
type: object
additionalProperties:
$ref: '#/components/schemas/SpanWithStatus'
description: >-
Dictionary mapping span IDs to spans with status information
additionalProperties: false
required:
- data
title: QuerySpanTreeResponse
description: >-
Response containing a tree structure of spans.
QueryTracesRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the traces.
limit:
type: integer
description: The limit of traces to return.
offset:
type: integer
description: The offset of the traces to return.
order_by:
type: array
items:
type: string
description: The order by of the traces to return.
additionalProperties: false
title: QueryTracesRequest
Trace:
type: object
properties:
trace_id:
type: string
description: Unique identifier for the trace
root_span_id:
type: string
description: >-
Unique identifier for the root span that started this trace
start_time:
type: string
format: date-time
description: Timestamp when the trace began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the trace finished, if completed
additionalProperties: false
required:
- trace_id
- root_span_id
- start_time
title: Trace
description: >-
A trace representing the complete execution path of a request across multiple
operations.
QueryTracesResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Trace'
description: >-
List of traces matching the query criteria
additionalProperties: false
required:
- data
title: QueryTracesResponse
description: Response containing a list of traces.
responses:
BadRequest400:
description: The request was invalid or malformed
@@ -10845,9 +10204,9 @@ tags:
- name: Datasets
description: ''
- name: Eval
-description: ''
+description: >-
-x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
+x-displayName: Evaluations
- name: Files
description: >-
This API is used to upload documents that can be used with other Llama Stack
@@ -10874,8 +10233,6 @@ tags:
- name: Safety
description: OpenAI-compatible Moderations API.
x-displayName: Safety
-- name: Telemetry
-description: ''
- name: VectorIO
description: ''
x-tagGroups:
@@ -10891,5 +10248,4 @@ x-tagGroups:
- Models
- PostTraining (Coming Soon)
- Safety
-- Telemetry
- VectorIO

View file

@@ -1711,343 +1711,6 @@
},
"deprecated": false
}
},
"/v1alpha/telemetry/metrics/{metric_name}": {
"post": {
"responses": {
"200": {
"description": "A QueryMetricsResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryMetricsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query metrics.",
"description": "Query metrics.",
"parameters": [
{
"name": "metric_name",
"in": "path",
"description": "The name of the metric to query.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryMetricsRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans": {
"post": {
"responses": {
"200": {
"description": "A QuerySpansResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpansResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query spans.",
"description": "Query spans.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpansRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans/export": {
"post": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Save spans to a dataset.",
"description": "Save spans to a dataset.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SaveSpansToDatasetRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans/{span_id}/tree": {
"post": {
"responses": {
"200": {
"description": "A QuerySpanTreeResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpanTreeResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a span tree by its ID.",
"description": "Get a span tree by its ID.",
"parameters": [
{
"name": "span_id",
"in": "path",
"description": "The ID of the span to get the tree from.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GetSpanTreeRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/traces": {
"post": {
"responses": {
"200": {
"description": "A QueryTracesResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryTracesResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query traces.",
"description": "Query traces.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryTracesRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/traces/{trace_id}": {
"get": {
"responses": {
"200": {
"description": "A Trace.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Trace"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a trace by its ID.",
"description": "Get a trace by its ID.",
"parameters": [
{
"name": "trace_id",
"in": "path",
"description": "The ID of the trace to get.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
"/v1alpha/telemetry/traces/{trace_id}/spans/{span_id}": {
"get": {
"responses": {
"200": {
"description": "A Span.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Span"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a span by its ID.",
"description": "Get a span by its ID.",
"parameters": [
{
"name": "trace_id",
"in": "path",
"description": "The ID of the trace to get the span from.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "span_id",
"in": "path",
"description": "The ID of the span to get.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
} }
}, },
"jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
@ -2187,7 +1850,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -4320,7 +3983,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -5765,561 +5428,6 @@
"logger_config" "logger_config"
], ],
"title": "SupervisedFineTuneRequest" "title": "SupervisedFineTuneRequest"
},
"QueryMetricsRequest": {
"type": "object",
"properties": {
"start_time": {
"type": "integer",
"description": "The start time of the metric to query."
},
"end_time": {
"type": "integer",
"description": "The end time of the metric to query."
},
"granularity": {
"type": "string",
"description": "The granularity of the metric to query."
},
"query_type": {
"type": "string",
"enum": [
"range",
"instant"
],
"description": "The type of query to perform."
},
"label_matchers": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the label to match"
},
"value": {
"type": "string",
"description": "The value to match against"
},
"operator": {
"type": "string",
"enum": [
"=",
"!=",
"=~",
"!~"
],
"description": "The comparison operator to use for matching",
"default": "="
}
},
"additionalProperties": false,
"required": [
"name",
"value",
"operator"
],
"title": "MetricLabelMatcher",
"description": "A matcher for filtering metrics by label values."
},
"description": "The label matchers to apply to the metric."
}
},
"additionalProperties": false,
"required": [
"start_time",
"query_type"
],
"title": "QueryMetricsRequest"
},
"MetricDataPoint": {
"type": "object",
"properties": {
"timestamp": {
"type": "integer",
"description": "Unix timestamp when the metric value was recorded"
},
"value": {
"type": "number",
"description": "The numeric value of the metric at this timestamp"
},
"unit": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"timestamp",
"value",
"unit"
],
"title": "MetricDataPoint",
"description": "A single data point in a metric time series."
},
"MetricLabel": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the label"
},
"value": {
"type": "string",
"description": "The value of the label"
}
},
"additionalProperties": false,
"required": [
"name",
"value"
],
"title": "MetricLabel",
"description": "A label associated with a metric."
},
"MetricSeries": {
"type": "object",
"properties": {
"metric": {
"type": "string",
"description": "The name of the metric"
},
"labels": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricLabel"
},
"description": "List of labels associated with this metric series"
},
"values": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricDataPoint"
},
"description": "List of data points in chronological order"
}
},
"additionalProperties": false,
"required": [
"metric",
"labels",
"values"
],
"title": "MetricSeries",
"description": "A time series of metric data points."
},
"QueryMetricsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricSeries"
},
"description": "List of metric series matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QueryMetricsResponse",
"description": "Response containing metric time series data."
},
"QueryCondition": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "The attribute key to filter on"
},
"op": {
"$ref": "#/components/schemas/QueryConditionOp",
"description": "The comparison operator to apply"
},
"value": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
],
"description": "The value to compare against"
}
},
"additionalProperties": false,
"required": [
"key",
"op",
"value"
],
"title": "QueryCondition",
"description": "A condition for filtering query results."
},
"QueryConditionOp": {
"type": "string",
"enum": [
"eq",
"ne",
"gt",
"lt"
],
"title": "QueryConditionOp",
"description": "Comparison operators for query conditions."
},
"QuerySpansRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the spans."
},
"attributes_to_return": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to return in the spans."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"required": [
"attribute_filters",
"attributes_to_return"
],
"title": "QuerySpansRequest"
},
"Span": {
"type": "object",
"properties": {
"span_id": {
"type": "string",
"description": "Unique identifier for the span"
},
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this span belongs to"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the operation began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the operation finished, if completed"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the span"
}
},
"additionalProperties": false,
"required": [
"span_id",
"trace_id",
"name",
"start_time"
],
"title": "Span",
"description": "A span representing a single operation within a trace."
},
"QuerySpansResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Span"
},
"description": "List of spans matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QuerySpansResponse",
"description": "Response containing a list of spans."
},
"SaveSpansToDatasetRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the spans."
},
"attributes_to_save": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to save to the dataset."
},
"dataset_id": {
"type": "string",
"description": "The ID of the dataset to save the spans to."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"required": [
"attribute_filters",
"attributes_to_save",
"dataset_id"
],
"title": "SaveSpansToDatasetRequest"
},
"GetSpanTreeRequest": {
"type": "object",
"properties": {
"attributes_to_return": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to return in the tree."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"title": "GetSpanTreeRequest"
},
"SpanStatus": {
"type": "string",
"enum": [
"ok",
"error"
],
"title": "SpanStatus",
"description": "The status of a span indicating whether it completed successfully or with an error."
},
"SpanWithStatus": {
"type": "object",
"properties": {
"span_id": {
"type": "string",
"description": "Unique identifier for the span"
},
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this span belongs to"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the operation began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the operation finished, if completed"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the span"
},
"status": {
"$ref": "#/components/schemas/SpanStatus",
"description": "(Optional) The current status of the span"
}
},
"additionalProperties": false,
"required": [
"span_id",
"trace_id",
"name",
"start_time"
],
"title": "SpanWithStatus",
"description": "A span that includes status information."
},
"QuerySpanTreeResponse": {
"type": "object",
"properties": {
"data": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/SpanWithStatus"
},
"description": "Dictionary mapping span IDs to spans with status information"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QuerySpanTreeResponse",
"description": "Response containing a tree structure of spans."
},
"QueryTracesRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the traces."
},
"limit": {
"type": "integer",
"description": "The limit of traces to return."
},
"offset": {
"type": "integer",
"description": "The offset of the traces to return."
},
"order_by": {
"type": "array",
"items": {
"type": "string"
},
"description": "The order by of the traces to return."
}
},
"additionalProperties": false,
"title": "QueryTracesRequest"
},
"Trace": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace"
},
"root_span_id": {
"type": "string",
"description": "Unique identifier for the root span that started this trace"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the trace began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the trace finished, if completed"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"root_span_id",
"start_time"
],
"title": "Trace",
"description": "A trace representing the complete execution path of a request across multiple operations."
},
"QueryTracesResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Trace"
},
"description": "List of traces matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QueryTracesResponse",
"description": "Response containing a list of traces."
} }
}, },
"responses": { "responses": {
@ -6410,16 +5518,12 @@
}, },
{ {
"name": "Eval", "name": "Eval",
"description": "", "description": "Llama Stack Evaluation API for running evaluations on model and agent candidates.",
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." "x-displayName": "Evaluations"
}, },
{ {
"name": "PostTraining (Coming Soon)", "name": "PostTraining (Coming Soon)",
"description": "" "description": ""
},
{
"name": "Telemetry",
"description": ""
} }
], ],
"x-tagGroups": [ "x-tagGroups": [
@ -6431,8 +5535,7 @@
"DatasetIO", "DatasetIO",
"Datasets", "Datasets",
"Eval", "Eval",
"PostTraining (Coming Soon)", "PostTraining (Coming Soon)"
"Telemetry"
] ]
} }
] ]
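Note: this file removes the /v1alpha/telemetry/* paths together with their request/response schemas. As a reference for anyone still calling them, a hedged sketch of the body that POST /v1alpha/telemetry/metrics/{metric_name} accepted per the removed QueryMetricsRequest schema (the granularity and label name/value below are made-up examples; only start_time and query_type were required):

    {
      "start_time": 1760000000,
      "end_time": 1760003600,
      "granularity": "5m",
      "query_type": "range",
      "label_matchers": [
        { "name": "model_id", "value": "llama-3.1-8b", "operator": "=" }
      ]
    }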

@ -1224,238 +1224,6 @@ paths:
$ref: '#/components/schemas/SupervisedFineTuneRequest' $ref: '#/components/schemas/SupervisedFineTuneRequest'
required: true required: true
deprecated: false deprecated: false
/v1alpha/telemetry/metrics/{metric_name}:
post:
responses:
'200':
description: A QueryMetricsResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query metrics.
description: Query metrics.
parameters:
- name: metric_name
in: path
description: The name of the metric to query.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans:
post:
responses:
'200':
description: A QuerySpansResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query spans.
description: Query spans.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans/export:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Save spans to a dataset.
description: Save spans to a dataset.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans/{span_id}/tree:
post:
responses:
'200':
description: A QuerySpanTreeResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpanTreeResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span tree by its ID.
description: Get a span tree by its ID.
parameters:
- name: span_id
in: path
description: The ID of the span to get the tree from.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/GetSpanTreeRequest'
required: true
deprecated: false
/v1alpha/telemetry/traces:
post:
responses:
'200':
description: A QueryTracesResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query traces.
description: Query traces.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
deprecated: false
/v1alpha/telemetry/traces/{trace_id}:
get:
responses:
'200':
description: A Trace.
content:
application/json:
schema:
$ref: '#/components/schemas/Trace'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a trace by its ID.
description: Get a trace by its ID.
parameters:
- name: trace_id
in: path
description: The ID of the trace to get.
required: true
schema:
type: string
deprecated: false
/v1alpha/telemetry/traces/{trace_id}/spans/{span_id}:
get:
responses:
'200':
description: A Span.
content:
application/json:
schema:
$ref: '#/components/schemas/Span'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span by its ID.
description: Get a span by its ID.
parameters:
- name: trace_id
in: path
description: >-
The ID of the trace to get the span from.
required: true
schema:
type: string
- name: span_id
in: path
description: The ID of the span to get.
required: true
schema:
type: string
deprecated: false
jsonSchemaDialect: >- jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema https://json-schema.org/draft/2020-12/schema
components: components:
@ -1552,7 +1320,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -3159,7 +2927,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -4249,434 +4017,6 @@ components:
- hyperparam_search_config - hyperparam_search_config
- logger_config - logger_config
title: SupervisedFineTuneRequest title: SupervisedFineTuneRequest
QueryMetricsRequest:
type: object
properties:
start_time:
type: integer
description: The start time of the metric to query.
end_time:
type: integer
description: The end time of the metric to query.
granularity:
type: string
description: The granularity of the metric to query.
query_type:
type: string
enum:
- range
- instant
description: The type of query to perform.
label_matchers:
type: array
items:
type: object
properties:
name:
type: string
description: The name of the label to match
value:
type: string
description: The value to match against
operator:
type: string
enum:
- '='
- '!='
- =~
- '!~'
description: >-
The comparison operator to use for matching
default: '='
additionalProperties: false
required:
- name
- value
- operator
title: MetricLabelMatcher
description: >-
A matcher for filtering metrics by label values.
description: >-
The label matchers to apply to the metric.
additionalProperties: false
required:
- start_time
- query_type
title: QueryMetricsRequest
MetricDataPoint:
type: object
properties:
timestamp:
type: integer
description: >-
Unix timestamp when the metric value was recorded
value:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
MetricLabel:
type: object
properties:
name:
type: string
description: The name of the label
value:
type: string
description: The value of the label
additionalProperties: false
required:
- name
- value
title: MetricLabel
description: A label associated with a metric.
MetricSeries:
type: object
properties:
metric:
type: string
description: The name of the metric
labels:
type: array
items:
$ref: '#/components/schemas/MetricLabel'
description: >-
List of labels associated with this metric series
values:
type: array
items:
$ref: '#/components/schemas/MetricDataPoint'
description: >-
List of data points in chronological order
additionalProperties: false
required:
- metric
- labels
- values
title: MetricSeries
description: A time series of metric data points.
QueryMetricsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/MetricSeries'
description: >-
List of metric series matching the query criteria
additionalProperties: false
required:
- data
title: QueryMetricsResponse
description: >-
Response containing metric time series data.
QueryCondition:
type: object
properties:
key:
type: string
description: The attribute key to filter on
op:
$ref: '#/components/schemas/QueryConditionOp'
description: The comparison operator to apply
value:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The value to compare against
additionalProperties: false
required:
- key
- op
- value
title: QueryCondition
description: A condition for filtering query results.
QueryConditionOp:
type: string
enum:
- eq
- ne
- gt
- lt
title: QueryConditionOp
description: >-
Comparison operators for query conditions.
QuerySpansRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the spans.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_return
title: QuerySpansRequest
Span:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: Span
description: >-
A span representing a single operation within a trace.
QuerySpansResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Span'
description: >-
List of spans matching the query criteria
additionalProperties: false
required:
- data
title: QuerySpansResponse
description: Response containing a list of spans.
SaveSpansToDatasetRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_save:
type: array
items:
type: string
description: The attributes to save to the dataset.
dataset_id:
type: string
description: >-
The ID of the dataset to save the spans to.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
GetSpanTreeRequest:
type: object
properties:
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the tree.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
title: GetSpanTreeRequest
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
SpanWithStatus:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
(Optional) The current status of the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: SpanWithStatus
description: A span that includes status information.
QuerySpanTreeResponse:
type: object
properties:
data:
type: object
additionalProperties:
$ref: '#/components/schemas/SpanWithStatus'
description: >-
Dictionary mapping span IDs to spans with status information
additionalProperties: false
required:
- data
title: QuerySpanTreeResponse
description: >-
Response containing a tree structure of spans.
QueryTracesRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the traces.
limit:
type: integer
description: The limit of traces to return.
offset:
type: integer
description: The offset of the traces to return.
order_by:
type: array
items:
type: string
description: The order by of the traces to return.
additionalProperties: false
title: QueryTracesRequest
Trace:
type: object
properties:
trace_id:
type: string
description: Unique identifier for the trace
root_span_id:
type: string
description: >-
Unique identifier for the root span that started this trace
start_time:
type: string
format: date-time
description: Timestamp when the trace began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the trace finished, if completed
additionalProperties: false
required:
- trace_id
- root_span_id
- start_time
title: Trace
description: >-
A trace representing the complete execution path of a request across multiple
operations.
QueryTracesResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Trace'
description: >-
List of traces matching the query criteria
additionalProperties: false
required:
- data
title: QueryTracesResponse
description: Response containing a list of traces.
responses: responses:
BadRequest400: BadRequest400:
description: The request was invalid or malformed description: The request was invalid or malformed
@ -4779,13 +4119,11 @@ tags:
- name: Datasets - name: Datasets
description: '' description: ''
- name: Eval - name: Eval
description: '' description: >-
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates. Llama Stack Evaluation API for running evaluations on model and agent candidates.
x-displayName: Evaluations
- name: PostTraining (Coming Soon) - name: PostTraining (Coming Soon)
description: '' description: ''
- name: Telemetry
description: ''
x-tagGroups: x-tagGroups:
- name: Operations - name: Operations
tags: tags:
@ -4795,4 +4133,3 @@ x-tagGroups:
- Datasets - Datasets
- Eval - Eval
- PostTraining (Coming Soon) - PostTraining (Coming Soon)
- Telemetry
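Note: besides dropping the Telemetry definitions, these spec files rename the resource-type enum member vector_db to vector_store. Any client that matches on the old literal needs the new spelling; the updated enum fragment (members beyond those visible in these hunks are elided) looks like:

    "enum": [
      "model",
      "shield",
      "vector_store",
      "dataset",
      "scoring_function",
      "benchmark",
      ...
    ]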

@ -282,7 +282,7 @@
"Conversations" "Conversations"
], ],
"summary": "Create a conversation.", "summary": "Create a conversation.",
"description": "Create a conversation.", "description": "Create a conversation.\nCreate a conversation.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -326,8 +326,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Get a conversation with the given ID.", "summary": "Retrieve a conversation.",
"description": "Get a conversation with the given ID.", "description": "Retrieve a conversation.\nGet a conversation with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -369,8 +369,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Update a conversation's metadata with the given ID.", "summary": "Update a conversation.",
"description": "Update a conversation's metadata with the given ID.", "description": "Update a conversation.\nUpdate a conversation's metadata with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -422,8 +422,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Delete a conversation with the given ID.", "summary": "Delete a conversation.",
"description": "Delete a conversation with the given ID.", "description": "Delete a conversation.\nDelete a conversation with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -467,8 +467,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "List items in the conversation.", "summary": "List items.",
"description": "List items in the conversation.", "description": "List items.\nList items in the conversation.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -597,8 +597,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Create items in the conversation.", "summary": "Create items.",
"description": "Create items in the conversation.", "description": "Create items.\nCreate items in the conversation.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -652,8 +652,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Retrieve a conversation item.", "summary": "Retrieve an item.",
"description": "Retrieve a conversation item.", "description": "Retrieve an item.\nRetrieve a conversation item.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -704,8 +704,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Delete a conversation item.", "summary": "Delete an item.",
"description": "Delete a conversation item.", "description": "Delete an item.\nDelete a conversation item.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -1833,7 +1833,7 @@
"deprecated": false, "deprecated": false,
"x-llama-stack-extra-body-params": [ "x-llama-stack-extra-body-params": [
{ {
"name": "shields", "name": "guardrails",
"schema": { "schema": {
"type": "array", "type": "array",
"items": { "items": {
@ -1842,12 +1842,12 @@
"type": "string" "type": "string"
}, },
{ {
"$ref": "#/components/schemas/ResponseShieldSpec" "$ref": "#/components/schemas/ResponseGuardrailSpec"
} }
] ]
} }
}, },
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.", "description": "List of guardrails to apply during response generation. Guardrails provide safety and content moderation.",
"required": false "required": false
} }
] ]
@ -2525,44 +2525,6 @@
"deprecated": false "deprecated": false
} }
}, },
"/v1/telemetry/events": {
"post": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Log an event.",
"description": "Log an event.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/LogEventRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1/tool-runtime/invoke": { "/v1/tool-runtime/invoke": {
"post": { "post": {
"responses": { "responses": {
@ -5517,13 +5479,22 @@
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
},
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest"
},
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
@ -5536,9 +5507,12 @@
"propertyName": "type", "propertyName": "type",
"mapping": { "mapping": {
"message": "#/components/schemas/OpenAIResponseMessage", "message": "#/components/schemas/OpenAIResponseMessage",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"function_call_output": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput",
"mcp_approval_request": "#/components/schemas/OpenAIResponseMCPApprovalRequest",
"mcp_approval_response": "#/components/schemas/OpenAIResponseMCPApprovalResponse",
"mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
"mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
} }
@ -5696,6 +5670,58 @@
} }
} }
}, },
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal",
"description": "Content part type identifier, always \"refusal\""
},
"refusal": {
"type": "string",
"description": "Refusal text supplied by the model"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal",
"description": "Refusal content within a streamed response part."
},
"OpenAIResponseInputFunctionToolCallOutput": {
"type": "object",
"properties": {
"call_id": {
"type": "string"
},
"output": {
"type": "string"
},
"type": {
"type": "string",
"const": "function_call_output",
"default": "function_call_output"
},
"id": {
"type": "string"
},
"status": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"call_id",
"output",
"type"
],
"title": "OpenAIResponseInputFunctionToolCallOutput",
"description": "This represents the output of a function call that gets passed back to the model."
},
"OpenAIResponseInputMessageContent": { "OpenAIResponseInputMessageContent": {
"oneOf": [ "oneOf": [
{ {
@ -5775,6 +5801,68 @@
"title": "OpenAIResponseInputMessageContentText", "title": "OpenAIResponseInputMessageContentText",
"description": "Text content for input messages in OpenAI response format." "description": "Text content for input messages in OpenAI response format."
}, },
"OpenAIResponseMCPApprovalRequest": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"server_label": {
"type": "string"
},
"type": {
"type": "string",
"const": "mcp_approval_request",
"default": "mcp_approval_request"
}
},
"additionalProperties": false,
"required": [
"arguments",
"id",
"name",
"server_label",
"type"
],
"title": "OpenAIResponseMCPApprovalRequest",
"description": "A request for human approval of a tool invocation."
},
"OpenAIResponseMCPApprovalResponse": {
"type": "object",
"properties": {
"approval_request_id": {
"type": "string"
},
"approve": {
"type": "boolean"
},
"type": {
"type": "string",
"const": "mcp_approval_response",
"default": "mcp_approval_response"
},
"id": {
"type": "string"
},
"reason": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"approval_request_id",
"approve",
"type"
],
"title": "OpenAIResponseMCPApprovalResponse",
"description": "A response to an MCP approval request."
},
"OpenAIResponseMessage": { "OpenAIResponseMessage": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -5839,6 +5927,23 @@
"description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios." "description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios."
}, },
"OpenAIResponseOutputMessageContent": { "OpenAIResponseOutputMessageContent": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText"
},
{
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"output_text": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText",
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
}
},
"OpenAIResponseOutputMessageContentOutputText": {
"type": "object", "type": "object",
"properties": { "properties": {
"text": { "text": {
@ -6695,7 +6800,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -7250,41 +7355,17 @@
{ {
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
} }
] ]
}, },
"OpenAIResponseInputFunctionToolCallOutput": {
"type": "object",
"properties": {
"call_id": {
"type": "string"
},
"output": {
"type": "string"
},
"type": {
"type": "string",
"const": "function_call_output",
"default": "function_call_output"
},
"id": {
"type": "string"
},
"status": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"call_id",
"output",
"type"
],
"title": "OpenAIResponseInputFunctionToolCallOutput",
"description": "This represents the output of a function call that gets passed back to the model."
},
"OpenAIResponseInputToolFileSearch": { "OpenAIResponseInputToolFileSearch": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7447,68 +7528,6 @@
"title": "OpenAIResponseInputToolWebSearch", "title": "OpenAIResponseInputToolWebSearch",
"description": "Web search tool configuration for OpenAI response inputs." "description": "Web search tool configuration for OpenAI response inputs."
}, },
"OpenAIResponseMCPApprovalRequest": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"server_label": {
"type": "string"
},
"type": {
"type": "string",
"const": "mcp_approval_request",
"default": "mcp_approval_request"
}
},
"additionalProperties": false,
"required": [
"arguments",
"id",
"name",
"server_label",
"type"
],
"title": "OpenAIResponseMCPApprovalRequest",
"description": "A request for human approval of a tool invocation."
},
"OpenAIResponseMCPApprovalResponse": {
"type": "object",
"properties": {
"approval_request_id": {
"type": "string"
},
"approve": {
"type": "boolean"
},
"type": {
"type": "string",
"const": "mcp_approval_response",
"default": "mcp_approval_response"
},
"id": {
"type": "string"
},
"reason": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"approval_request_id",
"approve",
"type"
],
"title": "OpenAIResponseMCPApprovalResponse",
"description": "A response to an MCP approval request."
},
"OpenAIResponseObjectWithInput": { "OpenAIResponseObjectWithInput": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7581,6 +7600,10 @@
"$ref": "#/components/schemas/OpenAIResponseUsage", "$ref": "#/components/schemas/OpenAIResponseUsage",
"description": "(Optional) Token usage information for the response" "description": "(Optional) Token usage information for the response"
}, },
"instructions": {
"type": "string",
"description": "(Optional) System message inserted into the model's context"
},
"input": { "input": {
"type": "array", "type": "array",
"items": { "items": {
@ -7834,20 +7857,20 @@
"title": "OpenAIResponseUsage", "title": "OpenAIResponseUsage",
"description": "Usage information for OpenAI response." "description": "Usage information for OpenAI response."
}, },
"ResponseShieldSpec": { "ResponseGuardrailSpec": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"description": "The type/identifier of the shield." "description": "The type/identifier of the guardrail."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type" "type"
], ],
"title": "ResponseShieldSpec", "title": "ResponseGuardrailSpec",
"description": "Specification for a shield to apply during response generation." "description": "Specification for a guardrail to apply during response generation."
}, },
"OpenAIResponseInputTool": { "OpenAIResponseInputTool": {
"oneOf": [ "oneOf": [
@ -8129,6 +8152,10 @@
"usage": { "usage": {
"$ref": "#/components/schemas/OpenAIResponseUsage", "$ref": "#/components/schemas/OpenAIResponseUsage",
"description": "(Optional) Token usage information for the response" "description": "(Optional) Token usage information for the response"
},
"instructions": {
"type": "string",
"description": "(Optional) System message inserted into the model's context"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -8248,28 +8275,6 @@
"title": "OpenAIResponseContentPartReasoningText", "title": "OpenAIResponseContentPartReasoningText",
"description": "Reasoning text emitted as part of a streamed response." "description": "Reasoning text emitted as part of a streamed response."
}, },
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal",
"description": "Content part type identifier, always \"refusal\""
},
"refusal": {
"type": "string",
"description": "Refusal text supplied by the model"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal",
"description": "Refusal content within a streamed response part."
},
"OpenAIResponseObjectStream": { "OpenAIResponseObjectStream": {
"oneOf": [ "oneOf": [
{ {
@ -10200,7 +10205,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -10682,7 +10687,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -11172,354 +11177,6 @@
"title": "SyntheticDataGenerationResponse", "title": "SyntheticDataGenerationResponse",
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold." "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
}, },
"Event": {
"oneOf": [
{
"$ref": "#/components/schemas/UnstructuredLogEvent"
},
{
"$ref": "#/components/schemas/MetricEvent"
},
{
"$ref": "#/components/schemas/StructuredLogEvent"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"unstructured_log": "#/components/schemas/UnstructuredLogEvent",
"metric": "#/components/schemas/MetricEvent",
"structured_log": "#/components/schemas/StructuredLogEvent"
}
}
},
"EventType": {
"type": "string",
"enum": [
"unstructured_log",
"structured_log",
"metric"
],
"title": "EventType",
"description": "The type of telemetry event being logged."
},
"LogSeverity": {
"type": "string",
"enum": [
"verbose",
"debug",
"info",
"warn",
"error",
"critical"
],
"title": "LogSeverity",
"description": "The severity level of a log message."
},
"MetricEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "metric",
"default": "metric",
"description": "Event type identifier set to METRIC"
},
"metric": {
"type": "string",
"description": "The name of the metric being measured"
},
"value": {
"oneOf": [
{
"type": "integer"
},
{
"type": "number"
}
],
"description": "The numeric value of the metric measurement"
},
"unit": {
"type": "string",
"description": "The unit of measurement for the metric value"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"metric",
"value",
"unit"
],
"title": "MetricEvent",
"description": "A metric event containing a measured value."
},
"SpanEndPayload": {
"type": "object",
"properties": {
"type": {
"$ref": "#/components/schemas/StructuredLogType",
"const": "span_end",
"default": "span_end",
"description": "Payload type identifier set to SPAN_END"
},
"status": {
"$ref": "#/components/schemas/SpanStatus",
"description": "The final status of the span indicating success or failure"
}
},
"additionalProperties": false,
"required": [
"type",
"status"
],
"title": "SpanEndPayload",
"description": "Payload for a span end event."
},
"SpanStartPayload": {
"type": "object",
"properties": {
"type": {
"$ref": "#/components/schemas/StructuredLogType",
"const": "span_start",
"default": "span_start",
"description": "Payload type identifier set to SPAN_START"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
}
},
"additionalProperties": false,
"required": [
"type",
"name"
],
"title": "SpanStartPayload",
"description": "Payload for a span start event."
},
"SpanStatus": {
"type": "string",
"enum": [
"ok",
"error"
],
"title": "SpanStatus",
"description": "The status of a span indicating whether it completed successfully or with an error."
},
"StructuredLogEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "structured_log",
"default": "structured_log",
"description": "Event type identifier set to STRUCTURED_LOG"
},
"payload": {
"oneOf": [
{
"$ref": "#/components/schemas/SpanStartPayload"
},
{
"$ref": "#/components/schemas/SpanEndPayload"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"span_start": "#/components/schemas/SpanStartPayload",
"span_end": "#/components/schemas/SpanEndPayload"
}
},
"description": "The structured payload data for the log event"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"payload"
],
"title": "StructuredLogEvent",
"description": "A structured log event containing typed payload data."
},
"StructuredLogType": {
"type": "string",
"enum": [
"span_start",
"span_end"
],
"title": "StructuredLogType",
"description": "The type of structured log event payload."
},
"UnstructuredLogEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "unstructured_log",
"default": "unstructured_log",
"description": "Event type identifier set to UNSTRUCTURED_LOG"
},
"message": {
"type": "string",
"description": "The log message text"
},
"severity": {
"$ref": "#/components/schemas/LogSeverity",
"description": "The severity level of the log message"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"message",
"severity"
],
"title": "UnstructuredLogEvent",
"description": "An unstructured log event containing a simple text message."
},
"LogEventRequest": {
"type": "object",
"properties": {
"event": {
"$ref": "#/components/schemas/Event",
"description": "The event to log."
},
"ttl_seconds": {
"type": "integer",
"description": "The time to live of the event."
}
},
"additionalProperties": false,
"required": [
"event",
"ttl_seconds"
],
"title": "LogEventRequest"
},
"InvokeToolRequest": { "InvokeToolRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -12083,7 +11740,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -13602,8 +13259,8 @@
}, },
{ {
"name": "Conversations", "name": "Conversations",
"description": "", "description": "Protocol for conversation management operations.",
"x-displayName": "Protocol for conversation management operations." "x-displayName": "Conversations"
}, },
{ {
"name": "Files", "name": "Files",
@ -13655,10 +13312,6 @@
"name": "SyntheticDataGeneration (Coming Soon)", "name": "SyntheticDataGeneration (Coming Soon)",
"description": "" "description": ""
}, },
{
"name": "Telemetry",
"description": ""
},
{ {
"name": "ToolGroups", "name": "ToolGroups",
"description": "" "description": ""
@ -13689,7 +13342,6 @@
"ScoringFunctions", "ScoringFunctions",
"Shields", "Shields",
"SyntheticDataGeneration (Coming Soon)", "SyntheticDataGeneration (Coming Soon)",
"Telemetry",
"ToolGroups", "ToolGroups",
"ToolRuntime", "ToolRuntime",
"VectorIO" "VectorIO"

@ -192,7 +192,10 @@ paths:
tags: tags:
- Conversations - Conversations
summary: Create a conversation. summary: Create a conversation.
description: Create a conversation. description: >-
Create a conversation.
Create a conversation.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -222,8 +225,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Get a conversation with the given ID. summary: Retrieve a conversation.
description: Get a conversation with the given ID. description: >-
Retrieve a conversation.
Get a conversation with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -252,9 +258,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: >- summary: Update a conversation.
Update a conversation's metadata with the given ID.
description: >- description: >-
Update a conversation.
Update a conversation's metadata with the given ID. Update a conversation's metadata with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
@ -290,8 +297,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Delete a conversation with the given ID. summary: Delete a conversation.
description: Delete a conversation with the given ID. description: >-
Delete a conversation.
Delete a conversation with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -321,8 +331,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: List items in the conversation. summary: List items.
description: List items in the conversation. description: >-
List items.
List items in the conversation.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -495,8 +508,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Create items in the conversation. summary: Create items.
description: Create items in the conversation. description: >-
Create items.
Create items in the conversation.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -532,8 +548,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Retrieve a conversation item. summary: Retrieve an item.
description: Retrieve a conversation item. description: >-
Retrieve an item.
Retrieve a conversation item.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -568,8 +587,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Delete a conversation item. summary: Delete an item.
description: Delete a conversation item. description: >-
Delete an item.
Delete a conversation item.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -1448,16 +1470,16 @@ paths:
required: true required: true
deprecated: false deprecated: false
x-llama-stack-extra-body-params: x-llama-stack-extra-body-params:
- name: shields - name: guardrails
schema: schema:
type: array type: array
items: items:
oneOf: oneOf:
- type: string - type: string
- $ref: '#/components/schemas/ResponseShieldSpec' - $ref: '#/components/schemas/ResponseGuardrailSpec'
description: >- description: >-
List of shields to apply during response generation. Shields provide safety List of guardrails to apply during response generation. Guardrails provide
and content moderation. safety and content moderation.
required: false required: false
/v1/responses/{response_id}: /v1/responses/{response_id}:
get: get:
@ -1944,33 +1966,6 @@ paths:
$ref: '#/components/schemas/SyntheticDataGenerateRequest' $ref: '#/components/schemas/SyntheticDataGenerateRequest'
required: true required: true
deprecated: false deprecated: false
/v1/telemetry/events:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Log an event.
description: Log an event.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/LogEventRequest'
required: true
deprecated: false
/v1/tool-runtime/invoke: /v1/tool-runtime/invoke:
post: post:
responses: responses:
@ -4180,18 +4175,24 @@ components:
ConversationItem: ConversationItem:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
discriminator: discriminator:
propertyName: type propertyName: type
mapping: mapping:
message: '#/components/schemas/OpenAIResponseMessage' message: '#/components/schemas/OpenAIResponseMessage'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
function_call_output: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
mcp_approval_response: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
OpenAIResponseAnnotationCitation: OpenAIResponseAnnotationCitation:
@ -4312,6 +4313,50 @@ components:
url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation' url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation'
container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation' container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation'
file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath' file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath'
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
"OpenAIResponseInputFunctionToolCallOutput":
type: object
properties:
call_id:
type: string
output:
type: string
type:
type: string
const: function_call_output
default: function_call_output
id:
type: string
status:
type: string
additionalProperties: false
required:
- call_id
- output
- type
title: >-
OpenAIResponseInputFunctionToolCallOutput
description: >-
This represents the output of a function call that gets passed back to the
model.
OpenAIResponseInputMessageContent: OpenAIResponseInputMessageContent:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
@ -4370,6 +4415,53 @@ components:
title: OpenAIResponseInputMessageContentText title: OpenAIResponseInputMessageContentText
description: >- description: >-
Text content for input messages in OpenAI response format. Text content for input messages in OpenAI response format.
OpenAIResponseMCPApprovalRequest:
type: object
properties:
arguments:
type: string
id:
type: string
name:
type: string
server_label:
type: string
type:
type: string
const: mcp_approval_request
default: mcp_approval_request
additionalProperties: false
required:
- arguments
- id
- name
- server_label
- type
title: OpenAIResponseMCPApprovalRequest
description: >-
A request for human approval of a tool invocation.
OpenAIResponseMCPApprovalResponse:
type: object
properties:
approval_request_id:
type: string
approve:
type: boolean
type:
type: string
const: mcp_approval_response
default: mcp_approval_response
id:
type: string
reason:
type: string
additionalProperties: false
required:
- approval_request_id
- approve
- type
title: OpenAIResponseMCPApprovalResponse
description: A response to an MCP approval request.
OpenAIResponseMessage: OpenAIResponseMessage:
type: object type: object
properties: properties:
@ -4411,6 +4503,15 @@ components:
under one type because the Responses API gives them all the same "type" value, under one type because the Responses API gives them all the same "type" value,
and there is no way to tell them apart in certain scenarios. and there is no way to tell them apart in certain scenarios.
OpenAIResponseOutputMessageContent: OpenAIResponseOutputMessageContent:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
discriminator:
propertyName: type
mapping:
output_text: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
"OpenAIResponseOutputMessageContentOutputText":
type: object type: object
properties: properties:
text: text:
@ -5126,7 +5227,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -5527,32 +5628,9 @@ components:
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
"OpenAIResponseInputFunctionToolCallOutput":
type: object
properties:
call_id:
type: string
output:
type: string
type:
type: string
const: function_call_output
default: function_call_output
id:
type: string
status:
type: string
additionalProperties: false
required:
- call_id
- output
- type
title: >-
OpenAIResponseInputFunctionToolCallOutput
description: >-
This represents the output of a function call that gets passed back to the
model.
OpenAIResponseInputToolFileSearch: OpenAIResponseInputToolFileSearch:
type: object type: object
properties: properties:
@ -5669,53 +5747,6 @@ components:
title: OpenAIResponseInputToolWebSearch title: OpenAIResponseInputToolWebSearch
description: >- description: >-
Web search tool configuration for OpenAI response inputs. Web search tool configuration for OpenAI response inputs.
OpenAIResponseMCPApprovalRequest:
type: object
properties:
arguments:
type: string
id:
type: string
name:
type: string
server_label:
type: string
type:
type: string
const: mcp_approval_request
default: mcp_approval_request
additionalProperties: false
required:
- arguments
- id
- name
- server_label
- type
title: OpenAIResponseMCPApprovalRequest
description: >-
A request for human approval of a tool invocation.
OpenAIResponseMCPApprovalResponse:
type: object
properties:
approval_request_id:
type: string
approve:
type: boolean
type:
type: string
const: mcp_approval_response
default: mcp_approval_response
id:
type: string
reason:
type: string
additionalProperties: false
required:
- approval_request_id
- approve
- type
title: OpenAIResponseMCPApprovalResponse
description: A response to an MCP approval request.
OpenAIResponseObjectWithInput: OpenAIResponseObjectWithInput:
type: object type: object
properties: properties:
@ -5784,6 +5815,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage' $ref: '#/components/schemas/OpenAIResponseUsage'
description: >- description: >-
(Optional) Token usage information for the response (Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
input: input:
type: array type: array
items: items:
@ -5961,18 +5996,18 @@ components:
- total_tokens - total_tokens
title: OpenAIResponseUsage title: OpenAIResponseUsage
description: Usage information for OpenAI response. description: Usage information for OpenAI response.
ResponseShieldSpec: ResponseGuardrailSpec:
type: object type: object
properties: properties:
type: type:
type: string type: string
description: The type/identifier of the shield. description: The type/identifier of the guardrail.
additionalProperties: false additionalProperties: false
required: required:
- type - type
title: ResponseShieldSpec title: ResponseGuardrailSpec
description: >- description: >-
Specification for a shield to apply during response generation. Specification for a guardrail to apply during response generation.
OpenAIResponseInputTool: OpenAIResponseInputTool:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -6187,6 +6222,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage' $ref: '#/components/schemas/OpenAIResponseUsage'
description: >- description: >-
(Optional) Token usage information for the response (Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
additionalProperties: false additionalProperties: false
required: required:
- created_at - created_at
@ -6278,25 +6317,6 @@ components:
title: OpenAIResponseContentPartReasoningText title: OpenAIResponseContentPartReasoningText
description: >- description: >-
Reasoning text emitted as part of a streamed response. Reasoning text emitted as part of a streamed response.
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
OpenAIResponseObjectStream: OpenAIResponseObjectStream:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -7899,7 +7919,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -8207,7 +8227,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -8565,267 +8585,6 @@ components:
description: >- description: >-
Response from the synthetic data generation. Batch of (prompt, response, score) Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold. tuples that pass the threshold.
Event:
oneOf:
- $ref: '#/components/schemas/UnstructuredLogEvent'
- $ref: '#/components/schemas/MetricEvent'
- $ref: '#/components/schemas/StructuredLogEvent'
discriminator:
propertyName: type
mapping:
unstructured_log: '#/components/schemas/UnstructuredLogEvent'
metric: '#/components/schemas/MetricEvent'
structured_log: '#/components/schemas/StructuredLogEvent'
EventType:
type: string
enum:
- unstructured_log
- structured_log
- metric
title: EventType
description: >-
The type of telemetry event being logged.
LogSeverity:
type: string
enum:
- verbose
- debug
- info
- warn
- error
- critical
title: LogSeverity
description: The severity level of a log message.
MetricEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: metric
default: metric
description: Event type identifier set to METRIC
metric:
type: string
description: The name of the metric being measured
value:
oneOf:
- type: integer
- type: number
description: >-
The numeric value of the metric measurement
unit:
type: string
description: >-
The unit of measurement for the metric value
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- metric
- value
- unit
title: MetricEvent
description: >-
A metric event containing a measured value.
SpanEndPayload:
type: object
properties:
type:
$ref: '#/components/schemas/StructuredLogType'
const: span_end
default: span_end
description: Payload type identifier set to SPAN_END
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
The final status of the span indicating success or failure
additionalProperties: false
required:
- type
- status
title: SpanEndPayload
description: Payload for a span end event.
SpanStartPayload:
type: object
properties:
type:
$ref: '#/components/schemas/StructuredLogType'
const: span_start
default: span_start
description: >-
Payload type identifier set to SPAN_START
name:
type: string
description: >-
Human-readable name describing the operation this span represents
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
additionalProperties: false
required:
- type
- name
title: SpanStartPayload
description: Payload for a span start event.
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
StructuredLogEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: structured_log
default: structured_log
description: >-
Event type identifier set to STRUCTURED_LOG
payload:
oneOf:
- $ref: '#/components/schemas/SpanStartPayload'
- $ref: '#/components/schemas/SpanEndPayload'
discriminator:
propertyName: type
mapping:
span_start: '#/components/schemas/SpanStartPayload'
span_end: '#/components/schemas/SpanEndPayload'
description: >-
The structured payload data for the log event
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- payload
title: StructuredLogEvent
description: >-
A structured log event containing typed payload data.
StructuredLogType:
type: string
enum:
- span_start
- span_end
title: StructuredLogType
description: >-
The type of structured log event payload.
UnstructuredLogEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: unstructured_log
default: unstructured_log
description: >-
Event type identifier set to UNSTRUCTURED_LOG
message:
type: string
description: The log message text
severity:
$ref: '#/components/schemas/LogSeverity'
description: The severity level of the log message
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- message
- severity
title: UnstructuredLogEvent
description: >-
An unstructured log event containing a simple text message.
LogEventRequest:
type: object
properties:
event:
$ref: '#/components/schemas/Event'
description: The event to log.
ttl_seconds:
type: integer
description: The time to live of the event.
additionalProperties: false
required:
- event
- ttl_seconds
title: LogEventRequest
InvokeToolRequest: InvokeToolRequest:
type: object type: object
properties: properties:
@ -9231,7 +8990,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -10417,9 +10176,9 @@ tags:
- `background` - `background`
x-displayName: Agents x-displayName: Agents
- name: Conversations - name: Conversations
description: '' description: >-
x-displayName: >-
Protocol for conversation management operations. Protocol for conversation management operations.
x-displayName: Conversations
- name: Files - name: Files
description: >- description: >-
This API is used to upload documents that can be used with other Llama Stack This API is used to upload documents that can be used with other Llama Stack
@ -10465,8 +10224,6 @@ tags:
description: '' description: ''
- name: SyntheticDataGeneration (Coming Soon) - name: SyntheticDataGeneration (Coming Soon)
description: '' description: ''
- name: Telemetry
description: ''
- name: ToolGroups - name: ToolGroups
description: '' description: ''
- name: ToolRuntime - name: ToolRuntime
@ -10489,7 +10246,6 @@ x-tagGroups:
- ScoringFunctions - ScoringFunctions
- Shields - Shields
- SyntheticDataGeneration (Coming Soon) - SyntheticDataGeneration (Coming Soon)
- Telemetry
- ToolGroups - ToolGroups
- ToolRuntime - ToolRuntime
- VectorIO - VectorIO

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -78,17 +78,14 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
## Build, Configure, and Run Llama Stack ## Build, Configure, and Run Llama Stack
1. **Build the Llama Stack**: 1. **Install dependencies**:
Build the Llama Stack using the `starter` template:
```bash ```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
``` ```
**Expected Output:**
2. **Start the distribution**:
```bash ```bash
... llama stack run starter
Build Successful!
You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter
``` ```
3. **Set the ENV variables by exporting them to the terminal**: 3. **Set the ENV variables by exporting them to the terminal**:
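Once the quickstart steps above have the `starter` distribution running, a quick smoke test can confirm the server is reachable. The sketch below is not part of this commit: it assumes the `llama-stack-client` package is installed and that the server listens on the default local port 8321.

```python
# Minimal sketch (not from the diff): verify a locally running `starter`
# distribution responds. Port 8321 is the assumed default; adjust base_url
# if the server was started with a different --port.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Listing models is a cheap way to confirm the stack is up and providers loaded.
for model in client.models.list():
    print(model.identifier)
```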


@ -43,17 +43,17 @@ from .openai_responses import (
@json_schema_type @json_schema_type
class ResponseShieldSpec(BaseModel): class ResponseGuardrailSpec(BaseModel):
"""Specification for a shield to apply during response generation. """Specification for a guardrail to apply during response generation.
:param type: The type/identifier of the shield. :param type: The type/identifier of the guardrail.
""" """
type: str type: str
# TODO: more fields to be added for shield configuration # TODO: more fields to be added for guardrail configuration
ResponseShield = str | ResponseShieldSpec ResponseGuardrail = str | ResponseGuardrailSpec
class Attachment(BaseModel): class Attachment(BaseModel):
@ -820,10 +820,10 @@ class Agents(Protocol):
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None, include: list[str] | None = None,
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
shields: Annotated[ guardrails: Annotated[
list[ResponseShield] | None, list[ResponseGuardrail] | None,
ExtraBodyField( ExtraBodyField(
"List of shields to apply during response generation. Shields provide safety and content moderation." "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
), ),
] = None, ] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
@ -834,7 +834,7 @@ class Agents(Protocol):
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation. :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response. :param include: (Optional) Additional fields to include in the response.
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications. :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
""" """
... ...
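Because `guardrails` is exposed as an extra-body parameter on the responses endpoint (see the `x-llama-stack-extra-body-params` change earlier in this diff), an OpenAI-compatible client can pass it through `extra_body`. The snippet below is a hedged sketch, not code from this commit: the base URL, model id, and guardrail id are illustrative placeholders.

```python
# Sketch only: passing the new `guardrails` extra-body field to the responses
# API through the OpenAI Python client. Base URL, model, and guardrail id are
# placeholders, not values taken from this commit.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Summarize the weather policy for travelers.",
    extra_body={
        # `guardrails` accepts guardrail ids (strings) or
        # ResponseGuardrailSpec-shaped objects such as {"type": "llama-guard"}.
        "guardrails": ["llama-guard"],
    },
)
print(response.output_text)
```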


@ -131,8 +131,20 @@ class OpenAIResponseOutputMessageContentOutputText(BaseModel):
annotations: list[OpenAIResponseAnnotations] = Field(default_factory=list) annotations: list[OpenAIResponseAnnotations] = Field(default_factory=list)
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
"""Refusal content within a streamed response part.
:param type: Content part type identifier, always "refusal"
:param refusal: Refusal text supplied by the model
"""
type: Literal["refusal"] = "refusal"
refusal: str
OpenAIResponseOutputMessageContent = Annotated[ OpenAIResponseOutputMessageContent = Annotated[
OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageContentOutputText | OpenAIResponseContentPartRefusal,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent") register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")
@ -533,6 +545,7 @@ class OpenAIResponseObject(BaseModel):
:param tools: (Optional) An array of tools the model may call while generating a response. :param tools: (Optional) An array of tools the model may call while generating a response.
:param truncation: (Optional) Truncation strategy applied to the response :param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response :param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context
""" """
created_at: int created_at: int
@ -552,6 +565,7 @@ class OpenAIResponseObject(BaseModel):
tools: list[OpenAIResponseTool] | None = None tools: list[OpenAIResponseTool] | None = None
truncation: str | None = None truncation: str | None = None
usage: OpenAIResponseUsage | None = None usage: OpenAIResponseUsage | None = None
instructions: str | None = None
@json_schema_type @json_schema_type
@ -878,18 +892,6 @@ class OpenAIResponseContentPartOutputText(BaseModel):
logprobs: list[dict[str, Any]] | None = None logprobs: list[dict[str, Any]] | None = None
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
"""Refusal content within a streamed response part.
:param type: Content part type identifier, always "refusal"
:param refusal: Refusal text supplied by the model
"""
type: Literal["refusal"] = "refusal"
refusal: str
@json_schema_type @json_schema_type
class OpenAIResponseContentPartReasoningText(BaseModel): class OpenAIResponseContentPartReasoningText(BaseModel):
"""Reasoning text emitted as part of a streamed response. """Reasoning text emitted as part of a streamed response.
@ -1258,9 +1260,9 @@ OpenAIResponseInput = Annotated[
| OpenAIResponseInputFunctionToolCallOutput | OpenAIResponseInputFunctionToolCallOutput
| OpenAIResponseMCPApprovalRequest | OpenAIResponseMCPApprovalRequest
| OpenAIResponseMCPApprovalResponse | OpenAIResponseMCPApprovalResponse
| | OpenAIResponseOutputMessageMCPCall
# Fallback to the generic message type as a last resort | OpenAIResponseOutputMessageMCPListTools
OpenAIResponseMessage, | OpenAIResponseMessage,
Field(union_mode="left_to_right"), Field(union_mode="left_to_right"),
] ]
register_schema(OpenAIResponseInput, name="OpenAIResponseInput") register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
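To illustrate the effect of adding the refusal part to the output-content union above, here is a small validation sketch (not from the commit); it assumes these names remain importable from `llama_stack.apis.agents.openai_responses`, the module referenced by the imports elsewhere in this diff.

```python
# Sketch: with OpenAIResponseContentPartRefusal added to the
# OpenAIResponseOutputMessageContent union, a "refusal" payload now validates
# through the "type" discriminator instead of failing.
from pydantic import TypeAdapter

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseContentPartRefusal,
    OpenAIResponseOutputMessageContent,
)

part = TypeAdapter(OpenAIResponseOutputMessageContent).validate_python(
    {"type": "refusal", "refusal": "I can't help with that request."}
)
assert isinstance(part, OpenAIResponseContentPartRefusal)
print(part.refusal)
```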


@ -12,6 +12,9 @@ from openai.types.responses.response_includable import ResponseIncludable
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.agents.openai_responses import ( from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseMCPApprovalResponse,
OpenAIResponseMessage, OpenAIResponseMessage,
OpenAIResponseOutputMessageFileSearchToolCall, OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageFunctionToolCall,
@ -61,9 +64,14 @@ class ConversationMessage(BaseModel):
ConversationItem = Annotated[ ConversationItem = Annotated[
OpenAIResponseMessage OpenAIResponseMessage
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageWebSearchToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput
| OpenAIResponseMCPApprovalRequest
| OpenAIResponseMCPApprovalResponse
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools
| OpenAIResponseOutputMessageMCPCall | OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools, | OpenAIResponseOutputMessageMCPListTools,
Field(discriminator="type"), Field(discriminator="type"),
@ -165,7 +173,9 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Conversations(Protocol): class Conversations(Protocol):
"""Protocol for conversation management operations.""" """Conversations
Protocol for conversation management operations."""
@webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
async def create_conversation( async def create_conversation(
@ -173,6 +183,8 @@ class Conversations(Protocol):
) -> Conversation: ) -> Conversation:
"""Create a conversation. """Create a conversation.
Create a conversation.
:param items: Initial items to include in the conversation context. :param items: Initial items to include in the conversation context.
:param metadata: Set of key-value pairs that can be attached to an object. :param metadata: Set of key-value pairs that can be attached to an object.
:returns: The created conversation object. :returns: The created conversation object.
@ -181,7 +193,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_conversation(self, conversation_id: str) -> Conversation: async def get_conversation(self, conversation_id: str) -> Conversation:
"""Get a conversation with the given ID. """Retrieve a conversation.
Get a conversation with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:returns: The conversation object. :returns: The conversation object.
@ -190,7 +204,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation: async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
"""Update a conversation's metadata with the given ID. """Update a conversation.
Update a conversation's metadata with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param metadata: Set of key-value pairs that can be attached to an object. :param metadata: Set of key-value pairs that can be attached to an object.
@ -200,7 +216,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource: async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
"""Delete a conversation with the given ID. """Delete a conversation.
Delete a conversation with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:returns: The deleted conversation resource. :returns: The deleted conversation resource.
@ -209,7 +227,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList: async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
"""Create items in the conversation. """Create items.
Create items in the conversation.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param items: Items to include in the conversation context. :param items: Items to include in the conversation context.
@ -219,7 +239,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem: async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
"""Retrieve a conversation item. """Retrieve an item.
Retrieve a conversation item.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param item_id: The item identifier. :param item_id: The item identifier.
@ -236,7 +258,9 @@ class Conversations(Protocol):
limit: int | NotGiven = NOT_GIVEN, limit: int | NotGiven = NOT_GIVEN,
order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN, order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
) -> ConversationItemList: ) -> ConversationItemList:
"""List items in the conversation. """List items.
List items in the conversation.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param after: An item ID to list items after, used in pagination. :param after: An item ID to list items after, used in pagination.
@ -251,7 +275,9 @@ class Conversations(Protocol):
async def openai_delete_conversation_item( async def openai_delete_conversation_item(
self, conversation_id: str, item_id: str self, conversation_id: str, item_id: str
) -> ConversationItemDeletedResource: ) -> ConversationItemDeletedResource:
"""Delete a conversation item. """Delete an item.
Delete a conversation item.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param item_id: The item identifier. :param item_id: The item identifier.


@ -121,6 +121,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
models = "models" models = "models"
shields = "shields" shields = "shields"
vector_stores = "vector_stores" # only used for routing table
datasets = "datasets" datasets = "datasets"
scoring_functions = "scoring_functions" scoring_functions = "scoring_functions"
benchmarks = "benchmarks" benchmarks = "benchmarks"


@ -82,7 +82,9 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol): class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates.""" """Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True) @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA) @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)


@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
class ResourceType(StrEnum): class ResourceType(StrEnum):
model = "model" model = "model"
shield = "shield" shield = "shield"
vector_db = "vector_db" vector_store = "vector_store"
dataset = "dataset" dataset = "dataset"
scoring_function = "scoring_function" scoring_function = "scoring_function"
benchmark = "benchmark" benchmark = "benchmark"
@ -34,4 +34,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource") provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)") type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")

Some files were not shown because too many files have changed in this diff