mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-23 08:33:09 +00:00)

Compare commits: 1 commit

Commit: bb5bb0025d

350 changed files with 19099 additions and 96944 deletions
@@ -1,19 +0,0 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp
@@ -82,13 +82,11 @@ runs:
echo "No recording changes"
fi

- name: Write docker logs to file
- name: Write inference logs to file
if: ${{ always() }}
shell: bash
run: |
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true

- name: Upload logs
if: ${{ always() }}
@@ -57,7 +57,7 @@ runs:
echo "Building Llama Stack"

LLAMA_STACK_DIR=. \
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
uv run --no-sync llama stack build --template ci-tests --image-type venv

- name: Configure git for commits
shell: bash
.github/workflows/README.md (vendored): 1 line changed

@@ -14,7 +14,6 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
.github/workflows/install-script-ci.yml (vendored): 7 lines changed

@@ -30,11 +30,8 @@ jobs:

- name: Build a single provider
run: |
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter-ci
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
llama stack build --template starter --image-type container --image-name test

- name: Run installer end-to-end
run: |
.github/workflows/integration-auth-tests.yml (vendored): 18 lines changed

@@ -73,24 +73,6 @@ jobs:
image_name: kube
apis: []
providers: {}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: $run_dir/kvstore.db
sql_default:
type: sql_sqlite
db_path: $run_dir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
conversations:
table_name: openai_conversations
backend: sql_default
server:
port: 8321
EOF
.github/workflows/integration-tests.yml (vendored): 6 lines changed

@@ -47,7 +47,7 @@ jobs:
strategy:
fail-fast: false
matrix:
client-type: [library, docker]
client-type: [library, server]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}

@@ -61,7 +61,7 @@ jobs:
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}, {"setup": "gpt", "suite": "responses"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}

steps:

@@ -82,7 +82,7 @@ jobs:
env:
OPENAI_API_KEY: dummy
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
setup: ${{ matrix.config.setup }}
inference-mode: 'replay'
suite: ${{ matrix.config.suite }}
|
|
@@ -144,7 +144,7 @@ jobs:

- name: Build Llama Stack
run: |
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
uv run --no-sync llama stack build --template ci-tests --image-type venv

- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}

@@ -169,7 +169,9 @@ jobs:
run: |
uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io
tests/integration/vector_io \
--embedding-model inline::sentence-transformers/nomic-ai/nomic-embed-text-v1.5 \
--embedding-dimension 768

- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
.github/workflows/pre-commit.yml (vendored): 2 lines changed

@@ -37,7 +37,7 @@ jobs:
.pre-commit-config.yaml

- name: Set up Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
.github/workflows/precommit-trigger.yml (vendored): 4 lines changed

@@ -99,7 +99,7 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});

- name: Checkout PR branch (same-repo)

@@ -141,7 +141,7 @@ jobs:

- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
.github/workflows/providers-build.yml (vendored): 57 lines changed

@@ -14,8 +14,6 @@ on:
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'

pull_request:
paths:

@@ -26,8 +24,6 @@ on:
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'

concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}

@@ -64,19 +60,15 @@ jobs:
- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Install distribution into venv
if: matrix.image-type == 'venv'
- name: Print build dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

- name: Build container image
if: matrix.image-type == 'container'
- name: Run Llama Stack Build
run: |
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=${{ matrix.distro }} \
--tag llama-stack:${{ matrix.distro }}-ci
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test

- name: Print dependencies in the image
if: matrix.image-type == 'venv'

@@ -94,8 +86,8 @@ jobs:

- name: Build a single provider
run: |
uv pip install -e .
uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama

build-custom-container-distribution:
runs-on: ubuntu-latest
steps:

@@ -105,16 +97,11 @@ jobs:
- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Build container image
- name: Build a single provider
run: |
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests
yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

- name: Inspect the container image entrypoint
run: |

@@ -125,7 +112,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

@@ -142,19 +129,17 @@ jobs:
- name: Pin distribution to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/distributions/ci-tests/build.yaml

- name: Build UBI9 container image
- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests-ubi9
uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

- name: Inspect UBI9 image
run: |

@@ -165,7 +150,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
105
.github/workflows/providers-list-deps.yml
vendored
105
.github/workflows/providers-list-deps.yml
vendored
|
@ -1,105 +0,0 @@
|
|||
name: Test llama stack list-deps
|
||||
|
||||
run-name: Test llama stack list-deps
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'llama_stack/cli/stack/list_deps.py'
|
||||
- 'llama_stack/cli/stack/_list_deps.py'
|
||||
- 'llama_stack/core/build.*'
|
||||
- 'llama_stack/core/*.sh'
|
||||
- '.github/workflows/providers-list-deps.yml'
|
||||
- 'llama_stack/templates/**'
|
||||
- 'pyproject.toml'
|
||||
|
||||
pull_request:
|
||||
paths:
|
||||
- 'llama_stack/cli/stack/list_deps.py'
|
||||
- 'llama_stack/cli/stack/_list_deps.py'
|
||||
- 'llama_stack/core/build.*'
|
||||
- 'llama_stack/core/*.sh'
|
||||
- '.github/workflows/providers-list-deps.yml'
|
||||
- 'llama_stack/templates/**'
|
||||
- 'pyproject.toml'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
generate-matrix:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
distros: ${{ steps.set-matrix.outputs.distros }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Generate Distribution List
|
||||
id: set-matrix
|
||||
run: |
|
||||
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
|
||||
echo "distros=$distros" >> "$GITHUB_OUTPUT"
|
||||
|
||||
list-deps:
|
||||
needs: generate-matrix
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
|
||||
image-type: [venv, container]
|
||||
fail-fast: false # We want to run all jobs even if some fail
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Print dependencies
|
||||
run: |
|
||||
uv run llama stack list-deps ${{ matrix.distro }}
|
||||
|
||||
- name: Install Distro using llama stack list-deps
|
||||
run: |
|
||||
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
|
||||
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
|
||||
|
||||
- name: Print dependencies in the image
|
||||
if: matrix.image-type == 'venv'
|
||||
run: |
|
||||
uv pip list
|
||||
|
||||
show-single-provider:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Show a single provider
|
||||
run: |
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama
|
||||
|
||||
list-deps-from-config:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: list-des from Config
|
||||
env:
|
||||
USE_COPY_NOT_MOUNT: "true"
|
||||
LLAMA_STACK_DIR: "."
|
||||
run: |
|
||||
uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml
|
.github/workflows/python-build-test.yml (vendored): 2 lines changed

@@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Install uv
uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
uses: astral-sh/setup-uv@eb1897b8dc4b5d5bfe39a428a8f2304605e0983c # v7.0.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true
|
|
@@ -46,9 +46,9 @@ jobs:
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
cat tests/external/ramalama-stack/run.yaml

- name: Install distribution dependencies
- name: Build distro from config file
run: |
uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml

- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
|
|
.github/workflows/test-external.yml (vendored): 7 lines changed

@@ -44,14 +44,11 @@ jobs:

- name: Print distro dependencies
run: |
uv run --no-sync llama stack list-deps tests/external/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only

- name: Build distro from config file
run: |
uv venv ci-test
source ci-test/bin/activate
uv pip install -e .
LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml

- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
|
|
.github/workflows/ui-unit-tests.yml (vendored): 2 lines changed

@@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Setup Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
|
|
|
@@ -167,9 +167,9 @@ under the LICENSE file in the root directory of this source tree.

Some tips about common tasks you work on while contributing to Llama Stack:

### Installing dependencies of distributions
### Using `llama stack build`

When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.

Example:
```bash

@@ -177,12 +177,7 @@ cd work/
git clone https://github.com/llamastack/llama-stack.git
git clone https://github.com/llamastack/llama-stack-client-python.git
cd llama-stack

# Show dependencies for a distribution
llama stack list-deps <distro-name>

# Install dependencies
llama stack list-deps <distro-name> | xargs -L1 uv pip install
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
```

### Updating distribution configurations
|
|
|
@@ -27,11 +27,8 @@ MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL

# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install

# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu

# install client to interact with the server
pip install llama-stack-client
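The hunk stops at installing the client. Purely as an illustration (not part of this diff), a minimal Python sketch like the one below can exercise the server; it assumes the stack started above is listening on the default port 8321 and that the model is registered under the same name as `INFERENCE_MODEL`.

```python
# Minimal sketch, not from the diff: adjust base_url/model to your run configuration.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# OpenAI-compatible chat completion against the locally running stack
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```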
|
||||
|
@@ -92,7 +89,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as well
To try Llama Stack locally, run:

```bash
curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
```

### Overview
|
|
@ -98,30 +98,21 @@ data:
|
|||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
|
||||
sql_default:
|
||||
type: sql_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
references:
|
||||
metadata:
|
||||
backend: kv_default
|
||||
namespace: registry
|
||||
inference:
|
||||
backend: sql_default
|
||||
table_name: inference_store
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
|
@ -146,4 +137,5 @@ data:
|
|||
port: 8323
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: llama-stack-config
|
||||
|
|
|
@ -27,24 +27,28 @@ providers:
|
|||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
table_name: files_metadata
|
||||
backend: sql_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
table_name: files_metadata
|
||||
backend: sql_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
|
@ -54,15 +58,20 @@ providers:
|
|||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
|
@ -86,62 +95,40 @@ providers:
|
|||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
|
||||
sql_default:
|
||||
type: sql_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
stores:
|
||||
metadata:
|
||||
namespace: registry
|
||||
backend: kv_default
|
||||
inference:
|
||||
table_name: inference_store
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8323
|
||||
telemetry:
|
||||
enabled: true
|
||||
vector_stores:
|
||||
default_provider_id: chromadb
|
||||
default_embedding_model:
|
||||
provider_id: sentence-transformers
|
||||
model_id: nomic-ai/nomic-embed-text-v1.5
|
||||
|
|
|
@@ -1,8 +0,0 @@
These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless.

- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.

A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.

These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
|
|
@ -1,610 +0,0 @@
|
|||
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
|
||||
|
||||
organization:
|
||||
# Name of your organization or company, used to determine the name of the client
|
||||
# and headings.
|
||||
name: llama-stack-client
|
||||
docs: https://llama-stack.readthedocs.io/en/latest/
|
||||
contact: llamastack@meta.com
|
||||
security:
|
||||
- {}
|
||||
- BearerAuth: []
|
||||
security_schemes:
|
||||
BearerAuth:
|
||||
type: http
|
||||
scheme: bearer
|
||||
# `targets` define the output targets and their customization options, such as
|
||||
# whether to emit the Node SDK and what it's package name should be.
|
||||
targets:
|
||||
node:
|
||||
package_name: llama-stack-client
|
||||
production_repo: llamastack/llama-stack-client-typescript
|
||||
publish:
|
||||
npm: false
|
||||
python:
|
||||
package_name: llama_stack_client
|
||||
production_repo: llamastack/llama-stack-client-python
|
||||
options:
|
||||
use_uv: true
|
||||
publish:
|
||||
pypi: true
|
||||
project_name: llama_stack_client
|
||||
kotlin:
|
||||
reverse_domain: com.llama_stack_client.api
|
||||
production_repo: null
|
||||
publish:
|
||||
maven: false
|
||||
go:
|
||||
package_name: llama-stack-client
|
||||
production_repo: llamastack/llama-stack-client-go
|
||||
options:
|
||||
enable_v2: true
|
||||
back_compat_use_shared_package: false
|
||||
|
||||
# `client_settings` define settings for the API client, such as extra constructor
|
||||
# arguments (used for authentication), retry behavior, idempotency, etc.
|
||||
client_settings:
|
||||
default_env_prefix: LLAMA_STACK_CLIENT
|
||||
opts:
|
||||
api_key:
|
||||
type: string
|
||||
read_env: LLAMA_STACK_CLIENT_API_KEY
|
||||
auth: { security_scheme: BearerAuth }
|
||||
nullable: true
|
||||
|
||||
# `environments` are a map of the name of the environment (e.g. "sandbox",
|
||||
# "production") to the corresponding url to use.
|
||||
environments:
|
||||
production: http://any-hosted-llama-stack.com
|
||||
|
||||
# `pagination` defines [pagination schemes] which provides a template to match
|
||||
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
|
||||
pagination:
|
||||
- name: datasets_iterrows
|
||||
type: offset
|
||||
request:
|
||||
dataset_id:
|
||||
type: string
|
||||
start_index:
|
||||
type: integer
|
||||
x-stainless-pagination-property:
|
||||
purpose: offset_count_param
|
||||
limit:
|
||||
type: integer
|
||||
response:
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
next_index:
|
||||
type: integer
|
||||
x-stainless-pagination-property:
|
||||
purpose: offset_count_start_field
|
||||
- name: openai_cursor_page
|
||||
type: cursor
|
||||
request:
|
||||
limit:
|
||||
type: integer
|
||||
after:
|
||||
type: string
|
||||
x-stainless-pagination-property:
|
||||
purpose: next_cursor_param
|
||||
response:
|
||||
data:
|
||||
type: array
|
||||
items: {}
|
||||
has_more:
|
||||
type: boolean
|
||||
last_id:
|
||||
type: string
|
||||
x-stainless-pagination-property:
|
||||
purpose: next_cursor_field
|
||||
# `resources` define the structure and organziation for your API, such as how
|
||||
# methods and models are grouped together and accessed. See the [configuration
|
||||
# guide] for more information.
|
||||
#
|
||||
# [configuration guide]:
|
||||
# https://app.stainlessapi.com/docs/guides/configure#resources
|
||||
resources:
|
||||
$shared:
|
||||
models:
|
||||
agent_config: AgentConfig
|
||||
interleaved_content_item: InterleavedContentItem
|
||||
interleaved_content: InterleavedContent
|
||||
param_type: ParamType
|
||||
safety_violation: SafetyViolation
|
||||
sampling_params: SamplingParams
|
||||
scoring_result: ScoringResult
|
||||
message: Message
|
||||
user_message: UserMessage
|
||||
completion_message: CompletionMessage
|
||||
tool_response_message: ToolResponseMessage
|
||||
system_message: SystemMessage
|
||||
tool_call: ToolCall
|
||||
query_result: RAGQueryResult
|
||||
document: RAGDocument
|
||||
query_config: RAGQueryConfig
|
||||
response_format: ResponseFormat
|
||||
toolgroups:
|
||||
models:
|
||||
tool_group: ToolGroup
|
||||
list_tool_groups_response: ListToolGroupsResponse
|
||||
methods:
|
||||
register: post /v1/toolgroups
|
||||
get: get /v1/toolgroups/{toolgroup_id}
|
||||
list: get /v1/toolgroups
|
||||
unregister: delete /v1/toolgroups/{toolgroup_id}
|
||||
tools:
|
||||
methods:
|
||||
get: get /v1/tools/{tool_name}
|
||||
list:
|
||||
endpoint: get /v1/tools
|
||||
paginated: false
|
||||
|
||||
tool_runtime:
|
||||
models:
|
||||
tool_def: ToolDef
|
||||
tool_invocation_result: ToolInvocationResult
|
||||
methods:
|
||||
list_tools:
|
||||
endpoint: get /v1/tool-runtime/list-tools
|
||||
paginated: false
|
||||
invoke_tool: post /v1/tool-runtime/invoke
|
||||
subresources:
|
||||
rag_tool:
|
||||
methods:
|
||||
insert: post /v1/tool-runtime/rag-tool/insert
|
||||
query: post /v1/tool-runtime/rag-tool/query
|
||||
|
||||
responses:
|
||||
models:
|
||||
response_object_stream: OpenAIResponseObjectStream
|
||||
response_object: OpenAIResponseObject
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/responses
|
||||
streaming:
|
||||
stream_event_model: responses.response_object_stream
|
||||
param_discriminator: stream
|
||||
retrieve: get /v1/responses/{response_id}
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/responses
|
||||
delete:
|
||||
type: http
|
||||
endpoint: delete /v1/responses/{response_id}
|
||||
subresources:
|
||||
input_items:
|
||||
methods:
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/responses/{response_id}/input_items
|
||||
|
||||
conversations:
|
||||
models:
|
||||
conversation_object: Conversation
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/conversations
|
||||
retrieve: get /v1/conversations/{conversation_id}
|
||||
update:
|
||||
type: http
|
||||
endpoint: post /v1/conversations/{conversation_id}
|
||||
delete:
|
||||
type: http
|
||||
endpoint: delete /v1/conversations/{conversation_id}
|
||||
subresources:
|
||||
items:
|
||||
methods:
|
||||
get:
|
||||
type: http
|
||||
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/conversations/{conversation_id}/items
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/conversations/{conversation_id}/items
|
||||
|
||||
inspect:
|
||||
models:
|
||||
healthInfo: HealthInfo
|
||||
providerInfo: ProviderInfo
|
||||
routeInfo: RouteInfo
|
||||
versionInfo: VersionInfo
|
||||
methods:
|
||||
health: get /v1/health
|
||||
version: get /v1/version
|
||||
|
||||
embeddings:
|
||||
models:
|
||||
create_embeddings_response: OpenAIEmbeddingsResponse
|
||||
methods:
|
||||
create: post /v1/embeddings
|
||||
|
||||
chat:
|
||||
models:
|
||||
chat_completion_chunk: OpenAIChatCompletionChunk
|
||||
subresources:
|
||||
completions:
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/chat/completions
|
||||
streaming:
|
||||
stream_event_model: chat.chat_completion_chunk
|
||||
param_discriminator: stream
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/chat/completions
|
||||
retrieve:
|
||||
type: http
|
||||
endpoint: get /v1/chat/completions/{completion_id}
|
||||
completions:
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/completions
|
||||
streaming:
|
||||
param_discriminator: stream
|
||||
|
||||
vector_io:
|
||||
models:
|
||||
queryChunksResponse: QueryChunksResponse
|
||||
methods:
|
||||
insert: post /v1/vector-io/insert
|
||||
query: post /v1/vector-io/query
|
||||
|
||||
vector_stores:
|
||||
models:
|
||||
vector_store: VectorStoreObject
|
||||
list_vector_stores_response: VectorStoreListResponse
|
||||
vector_store_delete_response: VectorStoreDeleteResponse
|
||||
vector_store_search_response: VectorStoreSearchResponsePage
|
||||
methods:
|
||||
create: post /v1/vector_stores
|
||||
list:
|
||||
endpoint: get /v1/vector_stores
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}
|
||||
update: post /v1/vector_stores/{vector_store_id}
|
||||
delete: delete /v1/vector_stores/{vector_store_id}
|
||||
search: post /v1/vector_stores/{vector_store_id}/search
|
||||
subresources:
|
||||
files:
|
||||
models:
|
||||
vector_store_file: VectorStoreFileObject
|
||||
methods:
|
||||
list: get /v1/vector_stores/{vector_store_id}/files
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
create: post /v1/vector_stores/{vector_store_id}/files
|
||||
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
|
||||
file_batches:
|
||||
models:
|
||||
vector_store_file_batches: VectorStoreFileBatchObject
|
||||
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
|
||||
methods:
|
||||
create: post /v1/vector_stores/{vector_store_id}/file_batches
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
|
||||
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
|
||||
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
|
||||
|
||||
models:
|
||||
models:
|
||||
model: Model
|
||||
list_models_response: ListModelsResponse
|
||||
methods:
|
||||
retrieve: get /v1/models/{model_id}
|
||||
list:
|
||||
endpoint: get /v1/models
|
||||
paginated: false
|
||||
register: post /v1/models
|
||||
unregister: delete /v1/models/{model_id}
|
||||
subresources:
|
||||
openai:
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/models
|
||||
paginated: false
|
||||
|
||||
providers:
|
||||
models:
|
||||
list_providers_response: ListProvidersResponse
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/providers
|
||||
paginated: false
|
||||
retrieve: get /v1/providers/{provider_id}
|
||||
|
||||
routes:
|
||||
models:
|
||||
list_routes_response: ListRoutesResponse
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/inspect/routes
|
||||
paginated: false
|
||||
|
||||
|
||||
moderations:
|
||||
models:
|
||||
create_response: ModerationObject
|
||||
methods:
|
||||
create: post /v1/moderations
|
||||
|
||||
|
||||
safety:
|
||||
models:
|
||||
run_shield_response: RunShieldResponse
|
||||
methods:
|
||||
run_shield: post /v1/safety/run-shield
|
||||
|
||||
|
||||
shields:
|
||||
models:
|
||||
shield: Shield
|
||||
list_shields_response: ListShieldsResponse
|
||||
methods:
|
||||
retrieve: get /v1/shields/{identifier}
|
||||
list:
|
||||
endpoint: get /v1/shields
|
||||
paginated: false
|
||||
register: post /v1/shields
|
||||
delete: delete /v1/shields/{identifier}
|
||||
|
||||
synthetic_data_generation:
|
||||
models:
|
||||
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
|
||||
methods:
|
||||
generate: post /v1/synthetic-data-generation/generate
|
||||
|
||||
telemetry:
|
||||
models:
|
||||
span_with_status: SpanWithStatus
|
||||
trace: Trace
|
||||
query_spans_response: QuerySpansResponse
|
||||
event: Event
|
||||
query_condition: QueryCondition
|
||||
methods:
|
||||
query_traces:
|
||||
endpoint: post /v1alpha/telemetry/traces
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
|
||||
query_spans:
|
||||
endpoint: post /v1alpha/telemetry/spans
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
query_metrics:
|
||||
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
# log_event: post /v1alpha/telemetry/events
|
||||
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
|
||||
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
|
||||
get_trace: get /v1alpha/telemetry/traces/{trace_id}
|
||||
|
||||
scoring:
|
||||
methods:
|
||||
score: post /v1/scoring/score
|
||||
score_batch: post /v1/scoring/score-batch
|
||||
scoring_functions:
|
||||
methods:
|
||||
retrieve: get /v1/scoring-functions/{scoring_fn_id}
|
||||
list:
|
||||
endpoint: get /v1/scoring-functions
|
||||
paginated: false
|
||||
register: post /v1/scoring-functions
|
||||
models:
|
||||
scoring_fn: ScoringFn
|
||||
scoring_fn_params: ScoringFnParams
|
||||
list_scoring_functions_response: ListScoringFunctionsResponse
|
||||
|
||||
benchmarks:
|
||||
methods:
|
||||
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
|
||||
list:
|
||||
endpoint: get /v1alpha/eval/benchmarks
|
||||
paginated: false
|
||||
register: post /v1alpha/eval/benchmarks
|
||||
models:
|
||||
benchmark: Benchmark
|
||||
list_benchmarks_response: ListBenchmarksResponse
|
||||
|
||||
files:
|
||||
methods:
|
||||
create: post /v1/files
|
||||
list: get /v1/files
|
||||
retrieve: get /v1/files/{file_id}
|
||||
delete: delete /v1/files/{file_id}
|
||||
content: get /v1/files/{file_id}/content
|
||||
models:
|
||||
file: OpenAIFileObject
|
||||
list_files_response: ListOpenAIFileResponse
|
||||
delete_file_response: OpenAIFileDeleteResponse
|
||||
|
||||
alpha:
|
||||
subresources:
|
||||
inference:
|
||||
methods:
|
||||
rerank: post /v1alpha/inference/rerank
|
||||
|
||||
post_training:
|
||||
models:
|
||||
algorithm_config: AlgorithmConfig
|
||||
post_training_job: PostTrainingJob
|
||||
list_post_training_jobs_response: ListPostTrainingJobsResponse
|
||||
methods:
|
||||
preference_optimize: post /v1alpha/post-training/preference-optimize
|
||||
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
|
||||
subresources:
|
||||
job:
|
||||
methods:
|
||||
artifacts: get /v1alpha/post-training/job/artifacts
|
||||
cancel: post /v1alpha/post-training/job/cancel
|
||||
status: get /v1alpha/post-training/job/status
|
||||
list:
|
||||
endpoint: get /v1alpha/post-training/jobs
|
||||
paginated: false
|
||||
|
||||
eval:
|
||||
methods:
|
||||
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
|
||||
subresources:
|
||||
jobs:
|
||||
methods:
|
||||
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
|
||||
models:
|
||||
evaluate_response: EvaluateResponse
|
||||
benchmark_config: BenchmarkConfig
|
||||
job: Job
|
||||
|
||||
agents:
|
||||
methods:
|
||||
create: post /v1alpha/agents
|
||||
list: get /v1alpha/agents
|
||||
retrieve: get /v1alpha/agents/{agent_id}
|
||||
delete: delete /v1alpha/agents/{agent_id}
|
||||
models:
|
||||
inference_step: InferenceStep
|
||||
tool_execution_step: ToolExecutionStep
|
||||
tool_response: ToolResponse
|
||||
shield_call_step: ShieldCallStep
|
||||
memory_retrieval_step: MemoryRetrievalStep
|
||||
subresources:
|
||||
session:
|
||||
models:
|
||||
session: Session
|
||||
methods:
|
||||
list: get /v1alpha/agents/{agent_id}/sessions
|
||||
create: post /v1alpha/agents/{agent_id}/session
|
||||
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
steps:
|
||||
methods:
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
|
||||
turn:
|
||||
models:
|
||||
turn: Turn
|
||||
turn_response_event: AgentTurnResponseEvent
|
||||
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
|
||||
resume:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
|
||||
beta:
|
||||
subresources:
|
||||
datasets:
|
||||
models:
|
||||
list_datasets_response: ListDatasetsResponse
|
||||
methods:
|
||||
register: post /v1beta/datasets
|
||||
retrieve: get /v1beta/datasets/{dataset_id}
|
||||
list:
|
||||
endpoint: get /v1beta/datasets
|
||||
paginated: false
|
||||
unregister: delete /v1beta/datasets/{dataset_id}
|
||||
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
|
||||
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
|
||||
|
||||
|
||||
settings:
|
||||
license: MIT
|
||||
unwrap_response_fields: [ data ]
|
||||
|
||||
openapi:
|
||||
transformations:
|
||||
- command: renameValue
|
||||
reason: pydantic reserved name
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.InferenceStep.properties.model_response'
|
||||
rename:
|
||||
python:
|
||||
property_name: 'inference_model_response'
|
||||
|
||||
# - command: renameValue
|
||||
# reason: pydantic reserved name
|
||||
# args:
|
||||
# filter:
|
||||
# only:
|
||||
# - '$.components.schemas.Model.properties.model_type'
|
||||
# rename:
|
||||
# python:
|
||||
# property_name: 'type'
|
||||
- command: mergeObject
|
||||
reason: Better return_type using enum
|
||||
args:
|
||||
target:
|
||||
- '$.components.schemas'
|
||||
object:
|
||||
ReturnType:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
type:
|
||||
enum:
|
||||
- string
|
||||
- number
|
||||
- boolean
|
||||
- array
|
||||
- object
|
||||
- json
|
||||
- union
|
||||
- chat_completion_input
|
||||
- completion_input
|
||||
- agent_turn_input
|
||||
required:
|
||||
- type
|
||||
type: object
|
||||
- command: replaceProperties
|
||||
reason: Replace return type properties with better model (see above)
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.ScoringFn.properties.return_type'
|
||||
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
|
||||
value:
|
||||
$ref: '#/components/schemas/ReturnType'
|
||||
- command: oneOfToAnyOf
|
||||
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
|
||||
- reason: For better names
|
||||
command: extractToRefs
|
||||
args:
|
||||
ref:
|
||||
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
|
||||
name: '#/components/schemas/ToolCallOrString'
|
||||
|
||||
# `readme` is used to configure the code snippets that will be rendered in the
|
||||
# README.md of various SDKs. In particular, you can change the `headline`
|
||||
# snippet's endpoint and the arguments to call it with.
|
||||
readme:
|
||||
example_requests:
|
||||
default:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: &ref_0 {}
|
||||
headline:
|
||||
type: request
|
||||
endpoint: post /v1/models
|
||||
params: *ref_0
|
||||
pagination:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: {}
|
File diff suppressed because it is too large
@ -1,137 +0,0 @@
|
|||
# syntax=docker/dockerfile:1.6
|
||||
#
|
||||
# This Dockerfile is used to build the Llama Stack container image.
|
||||
# Example:
|
||||
# docker build \
|
||||
# -f containers/Containerfile \
|
||||
# --build-arg DISTRO_NAME=starter \
|
||||
# --tag llama-stack:starter .
|
||||
|
||||
ARG BASE_IMAGE=python:3.12-slim
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
ARG INSTALL_MODE="pypi"
|
||||
ARG LLAMA_STACK_DIR="/workspace"
|
||||
ARG LLAMA_STACK_CLIENT_DIR=""
|
||||
ARG PYPI_VERSION=""
|
||||
ARG TEST_PYPI_VERSION=""
|
||||
ARG KEEP_WORKSPACE=""
|
||||
ARG DISTRO_NAME="starter"
|
||||
ARG RUN_CONFIG_PATH=""
|
||||
ARG UV_HTTP_TIMEOUT=500
|
||||
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
WORKDIR /app
|
||||
|
||||
RUN set -eux; \
|
||||
if command -v dnf >/dev/null 2>&1; then \
|
||||
dnf -y update && \
|
||||
dnf install -y iputils git net-tools wget \
|
||||
vim-minimal python3.12 python3.12-pip python3.12-wheel \
|
||||
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
|
||||
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
|
||||
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
|
||||
dnf clean all; \
|
||||
elif command -v apt-get >/dev/null 2>&1; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
iputils-ping net-tools iproute2 dnsutils telnet \
|
||||
curl wget git procps psmisc lsof traceroute bubblewrap \
|
||||
gcc g++ && \
|
||||
rm -rf /var/lib/apt/lists/*; \
|
||||
else \
|
||||
echo "Unsupported base image: expected dnf or apt-get" >&2; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
RUN pip install --no-cache uv
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
ENV INSTALL_MODE=${INSTALL_MODE}
|
||||
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
|
||||
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
|
||||
ENV PYPI_VERSION=${PYPI_VERSION}
|
||||
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
|
||||
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
|
||||
ENV DISTRO_NAME=${DISTRO_NAME}
|
||||
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
|
||||
|
||||
# Copy the repository so editable installs and run configurations are available.
|
||||
COPY . /workspace
|
||||
|
||||
# Install the client package if it is provided
|
||||
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
|
||||
RUN set -eux; \
|
||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
|
||||
fi;
|
||||
|
||||
# Install llama-stack
|
||||
RUN set -eux; \
|
||||
if [ "$INSTALL_MODE" = "editable" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
|
||||
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
|
||||
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
|
||||
uv pip install --no-cache fastapi libcst; \
|
||||
if [ -n "$TEST_PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
|
||||
fi; \
|
||||
else \
|
||||
if [ -n "$PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache llama-stack; \
|
||||
fi; \
|
||||
fi;
|
||||
|
||||
# Install the dependencies for the distribution
|
||||
RUN set -eux; \
|
||||
if [ -z "$DISTRO_NAME" ]; then \
|
||||
echo "DISTRO_NAME must be provided" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
|
||||
if [ -n "$deps" ]; then \
|
||||
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN set -eux; \
|
||||
pip uninstall -y uv; \
|
||||
should_remove=1; \
|
||||
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
|
||||
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
|
||||
case "$RUN_CONFIG_PATH" in \
|
||||
/workspace*) should_remove=0 ;; \
|
||||
esac; \
|
||||
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
|
||||
|
||||
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
|
||||
exec llama stack run "$RUN_CONFIG_PATH" "$@"
|
||||
fi
|
||||
|
||||
if [ -n "$DISTRO_NAME" ]; then
|
||||
exec llama stack run "$DISTRO_NAME" "$@"
|
||||
fi
|
||||
|
||||
exec llama stack run "$@"
|
||||
EOF
|
||||
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
|
||||
|
||||
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]
|
|
@@ -51,8 +51,8 @@ device: cpu
You can access the HuggingFace trainer via the `starter` distribution:

```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
llama stack build --distro starter --image-type venv
llama stack run ~/.llama/distributions/starter/starter-run.yaml
```

### Usage Example
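The page's own usage example is outside this hunk. As a rough sketch of the client-side flow (illustrative only; it assumes a stack server on the default port 8321, and the call mirrors the `/v1alpha/post-training/jobs` route):

```python
# Sketch, not from the diff: list post-training jobs known to the server.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

jobs = client.alpha.post_training.job.list()  # GET /v1alpha/post-training/jobs
print(jobs)
```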
|
||||
|
|
|
@@ -175,7 +175,8 @@ llama-stack-client benchmarks register \
**1. Start the Llama Stack API Server**

```bash
llama stack list-deps together | xargs -L1 uv pip install
# Build and run a distribution (example: together)
llama stack build --distro together --image-type venv
llama stack run together
```

@@ -208,7 +209,7 @@ The playground works with any Llama Stack distribution. Popular options include:
<TabItem value="together" label="Together AI">

```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack build --distro together --image-type venv
llama stack run together
```

@@ -221,7 +222,7 @@ llama stack run together
<TabItem value="ollama" label="Ollama (Local)">

```bash
llama stack list-deps ollama | xargs -L1 uv pip install
llama stack build --distro ollama --image-type venv
llama stack run ollama
```

@@ -234,7 +235,7 @@ llama stack run ollama
<TabItem value="meta-reference" label="Meta Reference">

```bash
llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack build --distro meta-reference --image-type venv
llama stack run meta-reference
```
|
||||
|
|
|
@@ -20,8 +20,7 @@ RAG enables your applications to reference and recall information from external
In one terminal, start the Llama Stack server:

```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
uv run llama stack build --distro starter --image-type venv --run
```

### 2. Connect with OpenAI Client
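The client snippet for this step is not shown in the hunk. As a minimal sketch (assuming the server above listens on the default port 8321 and accepts a placeholder API key), any OpenAI-compatible client can be pointed at the stack's `/v1` routes:

```python
# Sketch: connect an OpenAI-compatible client to the local Llama Stack server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# List the models the stack exposes
for model in client.models.list():
    print(model.id)
```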
|
||||
|
@@ -88,19 +87,18 @@ Llama Stack provides OpenAI-compatible RAG capabilities through:
To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:

```yaml
vector_stores:
default_provider_id: faiss
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
models:
- model_id: nomic-ai/nomic-embed-text-v1.5
provider_id: inline::sentence-transformers
metadata:
embedding_dimension: 768
default_configured: true
```

With this configuration:
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
- The `default_provider_id` specifies which vector storage backend to use
- The `default_embedding_model` specifies both the inference provider and model for embeddings
- `client.vector_stores.create()` works without requiring embedding model parameters
- The system automatically uses the default model and its embedding dimension for any newly created vector store
- Only one model can be marked as `default_configured: true`

## Vector Store Operations
|
||||
|
@ -109,15 +107,14 @@ With this configuration:
|
|||
You can create vector stores with automatic or explicit embedding model selection:
|
||||
|
||||
```python
|
||||
# Automatic - uses default configured embedding model and vector store provider
|
||||
# Automatic - uses default configured embedding model
|
||||
vs = client.vector_stores.create()
|
||||
|
||||
# Explicit - specify embedding model and/or provider when you need specific ones
|
||||
# Explicit - specify embedding model when you need a specific one
|
||||
vs = client.vector_stores.create(
|
||||
extra_body={
|
||||
"provider_id": "faiss", # Optional: specify vector store provider
|
||||
"embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
|
||||
"embedding_dimension": 768 # Optional: will be auto-detected if not provided
|
||||
"embedding_model": "nomic-ai/nomic-embed-text-v1.5",
|
||||
"embedding_dimension": 768
|
||||
}
|
||||
)
|
||||
```
|
||||
|
|
|
@ -62,10 +62,6 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
|
|||
|
||||
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
|
||||
|
||||
### Deprecated APIs
|
||||
|
||||
Deprecated APIs are those that are no longer actively maintained or supported. Deprecated APIs are marked with the flag `deprecated = True` in the OpenAPI spec. These APIs will be removed in a future release.
|
||||
|
||||
### API Stability vs. Provider Stability
|
||||
|
||||
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.
|
||||
|
|
|
@ -158,16 +158,17 @@ under the LICENSE file in the root directory of this source tree.
|
|||
|
||||
Some tips about common tasks you work on while contributing to Llama Stack:
|
||||
|
||||
### Setup for development
|
||||
### Using `llama stack build`
|
||||
|
||||
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
cd work/
|
||||
git clone https://github.com/meta-llama/llama-stack.git
|
||||
cd llama-stack
|
||||
uv run llama stack list-deps <distro-name> | xargs -L1 uv pip install
|
||||
|
||||
# (Optional) If you are developing the llama-stack-client-python package, you can add it as an editable package.
|
||||
git clone https://github.com/meta-llama/llama-stack-client-python.git
|
||||
uv add --editable ../llama-stack-client-python
|
||||
cd llama-stack
|
||||
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
|
||||
```
|
||||
|
||||
### Updating distribution configurations
|
||||
|
|
|
@ -67,7 +67,7 @@ def get_base_url(self) -> str:
|
|||
|
||||
## Testing the Provider
|
||||
|
||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
|
||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
|
||||
|
||||
### 1. Integration Testing
|
||||
|
||||
|
|
|
@ -5,80 +5,225 @@ sidebar_label: Build your own Distribution
|
|||
sidebar_position: 3
|
||||
---
|
||||
|
||||
This guide walks you through inspecting existing distributions, customising their configuration, and building runnable artefacts for your own deployment.
|
||||
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
|
||||
|
||||
### Explore existing distributions
|
||||
|
||||
All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
|
||||
### Setting your log level
|
||||
|
||||
- `build.yaml` – the distribution specification (providers, additional dependencies, optional external provider directories).
|
||||
- `run.yaml` – sample run configuration (when provided).
|
||||
- Documentation fragments that power this site.
|
||||
To specify the desired logging level, set the `LLAMA_STACK_LOGGING` environment variable using the following format:
|
||||
|
||||
Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
|
||||
`LLAMA_STACK_LOGGING=server=debug;core=info`
|
||||
|
||||
Each category in the following list:
|
||||
|
||||
- all
|
||||
- core
|
||||
- server
|
||||
- router
|
||||
- inference
|
||||
- agents
|
||||
- safety
|
||||
- eval
|
||||
- tools
|
||||
- client
|
||||
|
||||
can be set to any of the following log levels:
|
||||
|
||||
- debug
|
||||
- info
|
||||
- warning
|
||||
- error
|
||||
- critical
|
||||
|
||||
The default global log level is `info`. `all` sets the log level for all components.
|
||||
|
||||
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
|
||||
|
||||
### Llama Stack Build
|
||||
|
||||
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
|
||||
|
||||
|
||||
```
|
||||
git clone git@github.com:meta-llama/llama-stack.git
|
||||
cd llama-stack
|
||||
pip install -e .
|
||||
```
|
||||
Use the CLI to build your distribution.
|
||||
The main points to consider are:
|
||||
1. **Image Type** - Do you want a venv environment or a Container (e.g. Docker)?
|
||||
2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
|
||||
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
||||
|
||||
```
|
||||
llama stack build -h
|
||||
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
|
||||
[--run] [--providers PROVIDERS]
|
||||
|
||||
Build a Llama stack container
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
|
||||
enter information interactively (default: None)
|
||||
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
|
||||
None)
|
||||
--distro DISTRIBUTION, --distribution DISTRIBUTION
|
||||
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
|
||||
--list-distros, --list-distributions
|
||||
Show the available distributions for building a Llama Stack distribution (default: False)
|
||||
--image-type {container,venv}
|
||||
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
|
||||
--image-name IMAGE_NAME
|
||||
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
|
||||
None)
|
||||
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
||||
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
||||
--providers PROVIDERS
|
||||
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
|
||||
API. (default: None)
|
||||
```
|
||||
|
||||
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="container" label="Building a container">
|
||||
<TabItem value="template" label="Building from a template">
|
||||
To build with alternative API providers, we provide distribution templates to get you started building a distribution backed by different providers.
|
||||
|
||||
Use the Containerfile at `containers/Containerfile`, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.
|
||||
|
||||
```bash
|
||||
docker build . \
|
||||
-f containers/Containerfile \
|
||||
--build-arg DISTRO_NAME=starter \
|
||||
--tag llama-stack:starter
|
||||
The following command will allow you to see the available templates and their corresponding providers.
|
||||
```
|
||||
llama stack build --list-templates
|
||||
```
|
||||
|
||||
Handy build arguments:
|
||||
```
|
||||
------------------------------+-----------------------------------------------------------------------------+
|
||||
| Template Name | Description |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| watsonx | Use watsonx for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| vllm-gpu | Use a built-in vLLM engine for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| together | Use Together.AI for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| tgi | Use (an external) TGI server for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| starter | Quick start template for running Llama Stack with several popular providers |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| sambanova | Use SambaNova for running LLM inference and safety |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| remote-vllm | Use (an external) vLLM server for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| postgres-demo | Quick start template for running Llama Stack with several popular providers |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| passthrough | Use Passthrough hosted llama-stack endpoint for LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| open-benchmark | Distribution for running open benchmarks |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| ollama | Use (an external) Ollama server for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| nvidia | Use NVIDIA NIM for running LLM inference, evaluation and safety |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| meta-reference-gpu | Use Meta Reference for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| llama_api | Distribution for running e2e tests in CI |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| groq | Use Groq for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| fireworks | Use Fireworks.AI for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| experimental-post-training | Experimental template for post training |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| dell | Dell's distribution of Llama Stack. TGI inference via Dell's custom |
|
||||
| | container |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| ci-tests | Distribution for running e2e tests in CI |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| cerebras | Use Cerebras for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| bedrock | Use AWS Bedrock for running LLM inference and safety |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
```
|
||||
|
||||
- `DISTRO_NAME` – distribution directory name (defaults to `starter`).
|
||||
- `RUN_CONFIG_PATH` – absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
|
||||
- `INSTALL_MODE=editable` – install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
|
||||
- `LLAMA_STACK_CLIENT_DIR` – optional editable install of the Python client.
|
||||
- `PYPI_VERSION` / `TEST_PYPI_VERSION` – pin specific releases when not using editable installs.
|
||||
- `KEEP_WORKSPACE=1` – retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).
|
||||
You may then pick a template to build your distribution with providers fitted to your liking.
|
||||
|
||||
Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
|
||||
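For example, the build arguments above can be combined to bake a local checkout and a run config into the image. This is a sketch, not a required invocation: it assumes the build context is your repository root, that a `run.yaml` sits at its top level, and the image tag is arbitrary.

```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --tag llama-stack:starter-dev
```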
For example, to build a distribution with TGI as the inference provider, you can run:
|
||||
```
|
||||
$ llama stack build --distro starter
|
||||
...
|
||||
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
||||
```
|
||||
|
||||
```{tip}
|
||||
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="external" label="Building with external providers">
|
||||
<TabItem value="scratch" label="Building from Scratch">
|
||||
|
||||
External providers live outside the main repository but can be bundled by pointing `external_providers_dir` to a directory that contains your provider packages.
|
||||
If the provided templates do not fit your use case, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for the build configuration.
|
||||
|
||||
1. Copy providers into the build context, for example `cp -R path/to/providers providers.d`.
|
||||
2. Update `build.yaml` with the directory and provider entries.
|
||||
3. Adjust run configs to use the in-container path (usually `/.llama/providers.d`). Pass `--build-arg RUN_CONFIG_PATH=/workspace/run.yaml` if you want to bake the config.
|
||||
It is best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.
|
||||
```
|
||||
llama stack build
|
||||
|
||||
Example `build.yaml` excerpt for a custom Ollama provider:
|
||||
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
|
||||
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
|
||||
|
||||
Llama Stack is composed of several APIs working together. Let's select
|
||||
the provider types (implementations) you want to use for these APIs.
|
||||
|
||||
Tip: use <TAB> to see options for the providers.
|
||||
|
||||
> Enter provider for API inference: inline::meta-reference
|
||||
> Enter provider for API safety: inline::llama-guard
|
||||
> Enter provider for API agents: inline::meta-reference
|
||||
> Enter provider for API memory: inline::faiss
|
||||
> Enter provider for API datasetio: inline::meta-reference
|
||||
> Enter provider for API scoring: inline::meta-reference
|
||||
> Enter provider for API eval: inline::meta-reference
|
||||
> Enter provider for API telemetry: inline::meta-reference
|
||||
|
||||
> (Optional) Enter a short description for your Llama Stack:
|
||||
|
||||
You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="config" label="Building from a pre-existing build config file">
|
||||
- In addition to templates, you may customize the build to your liking by editing config files and building from them with the following command.
|
||||
|
||||
- The config file has contents like those in `llama_stack/distributions/*build.yaml`.
|
||||
|
||||
```
|
||||
llama stack build --config llama_stack/distributions/starter/build.yaml
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="external" label="Building with External Providers">
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
|
||||
|
||||
To build a distribution with external providers, you need to:
|
||||
|
||||
1. Configure the `external_providers_dir` in your build configuration file:
|
||||
|
||||
```yaml
|
||||
# Example my-external-stack.yaml with external providers
|
||||
version: '2'
|
||||
distribution_spec:
|
||||
description: Custom distro for CI tests
|
||||
providers:
|
||||
inference:
|
||||
- remote::custom_ollama
|
||||
external_providers_dir: /workspace/providers.d
|
||||
```
|
||||
|
||||
Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
|
||||
|
||||
```python
|
||||
from llama_stack.providers.datatypes import ProviderSpec
|
||||
|
||||
|
||||
def get_provider_spec() -> ProviderSpec:
|
||||
return ProviderSpec(
|
||||
provider_type="remote::custom_ollama",
|
||||
module="llama_stack_ollama_provider",
|
||||
config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
|
||||
pip_packages=[
|
||||
"ollama",
|
||||
"aiohttp",
|
||||
"llama-stack-provider-ollama",
|
||||
],
|
||||
)
|
||||
- remote::custom_ollama
|
||||
# Add more providers as needed
|
||||
image_type: container
|
||||
image_name: ci-test
|
||||
# Path to external provider implementations
|
||||
external_providers_dir: ~/.llama/providers.d
|
||||
```
|
||||
|
||||
Here's an example for a custom Ollama provider:
|
||||
|
@ -87,9 +232,9 @@ Here's an example for a custom Ollama provider:
|
|||
adapter:
|
||||
adapter_type: custom_ollama
|
||||
pip_packages:
|
||||
- ollama
|
||||
- aiohttp
|
||||
- llama-stack-provider-ollama # This is the provider package
|
||||
- ollama
|
||||
- aiohttp
|
||||
- llama-stack-provider-ollama # This is the provider package
|
||||
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
|
||||
module: llama_stack_ollama_provider
|
||||
api_dependencies: []
|
||||
|
@ -100,22 +245,53 @@ The `pip_packages` section lists the Python packages required by the provider, a
|
|||
provider package itself. The package must be available on PyPI or can be provided from a local
|
||||
directory or a git repository (git must be installed on the build environment).
|
||||
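As a rough illustration, the entries are passed to pip, so standard requirement specifiers should work; the commented git URL and local path below are placeholders, not real locations:

```yaml
pip_packages:
  - ollama
  - aiohttp
  # published on PyPI
  - llama-stack-provider-ollama
  # or, assuming pip-style direct references are accepted, from a git repository (placeholder URL):
  # - llama-stack-provider-ollama @ git+https://github.com/example/llama-stack-provider-ollama.git
  # or from a local checkout (placeholder path):
  # - /path/to/llama-stack-provider-ollama
```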
|
||||
For deeper guidance, see the [External Providers documentation](../providers/external/).
|
||||
2. Build your distribution using the config file:
|
||||
|
||||
```
|
||||
llama stack build --config my-external-stack.yaml
|
||||
```
|
||||
|
||||
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
<TabItem value="container" label="Building Container">
|
||||
|
||||
### Run your stack server
|
||||
:::tip Podman Alternative
|
||||
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
|
||||
:::
|
||||
|
||||
After building the image, launch it directly with Docker or Podman—the entrypoint calls `llama stack run` using the baked distribution or the bundled run config:
|
||||
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
||||
|
||||
```
|
||||
llama stack build --distro starter --image-type container
|
||||
```
|
||||
|
||||
```
|
||||
$ llama stack build --distro starter --image-type container
|
||||
...
|
||||
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile
FROM python:3.10-slim
|
||||
...
|
||||
```
|
||||
|
||||
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
|
||||
```
|
||||
|
||||
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
|
||||
|
||||
```bash
|
||||
export INFERENCE_MODEL="llama3.2:3b"
|
||||
export LLAMA_STACK_PORT=8321
|
||||
mkdir -p ~/.llama
|
||||
```
|
||||
|
||||
After this step is successful, you should be able to find the built container image and test it with the below Docker command:
|
||||
|
||||
```
|
||||
docker run -d \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-e INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
-e OLLAMA_URL=http://host.docker.internal:11434 \
|
||||
llama-stack:starter \
|
||||
localhost/distribution-ollama:dev \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
|
@ -135,14 +311,131 @@ Here are the docker flags and their uses:
|
|||
|
||||
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
If you prepared a custom run config, mount it into the container and reference it explicitly:
|
||||
### Running your Stack server
|
||||
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v $(pwd)/run.yaml:/app/run.yaml \
|
||||
llama-stack:starter \
|
||||
/app/run.yaml
|
||||
```
|
||||
llama stack run -h
|
||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
|
||||
[--image-type {venv}] [--enable-ui]
|
||||
[config | distro]
|
||||
|
||||
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
||||
|
||||
positional arguments:
|
||||
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
|
||||
--image-name IMAGE_NAME
|
||||
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
|
||||
--image-type {venv}
|
||||
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
|
||||
--enable-ui Start the UI server (default: False)
|
||||
```
|
||||
|
||||
**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.
|
||||
|
||||
```
|
||||
# Start using template name
|
||||
llama stack run tgi
|
||||
|
||||
# Start using config file
|
||||
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
||||
```
|
||||
|
||||
```
|
||||
$ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
||||
|
||||
Serving API inspect
|
||||
GET /health
|
||||
GET /providers/list
|
||||
GET /routes/list
|
||||
Serving API inference
|
||||
POST /inference/chat_completion
|
||||
POST /inference/completion
|
||||
POST /inference/embeddings
|
||||
...
|
||||
Serving API agents
|
||||
POST /agents/create
|
||||
POST /agents/session/create
|
||||
POST /agents/turn/create
|
||||
POST /agents/delete
|
||||
POST /agents/session/delete
|
||||
POST /agents/session/get
|
||||
POST /agents/step/get
|
||||
POST /agents/turn/get
|
||||
|
||||
Listening on ['::', '0.0.0.0']:8321
|
||||
INFO: Started server process [2935911]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
|
||||
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
|
||||
```
|
||||
|
||||
### Listing Distributions
|
||||
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
|
||||
|
||||
```
|
||||
llama stack list -h
|
||||
usage: llama stack list [-h]
|
||||
|
||||
list the build stacks
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
```
|
||||
|
||||
Example Usage
|
||||
|
||||
```
|
||||
llama stack list
|
||||
```
|
||||
|
||||
```
|
||||
------------------------------+-----------------------------------------------------------------+--------------+------------+
|
||||
| Stack Name | Path | Build Config | Run Config |
|
||||
+------------------------------+-----------------------------------------------------------------------------+--------------+
|
||||
| together | ~/.llama/distributions/together | Yes | No |
|
||||
+------------------------------+-----------------------------------------------------------------------------+--------------+
|
||||
| bedrock | ~/.llama/distributions/bedrock | Yes | No |
|
||||
+------------------------------+-----------------------------------------------------------------------------+--------------+
|
||||
| starter | ~/.llama/distributions/starter | Yes | Yes |
|
||||
+------------------------------+-----------------------------------------------------------------------------+--------------+
|
||||
| remote-vllm | ~/.llama/distributions/remote-vllm | Yes | Yes |
|
||||
+------------------------------+-----------------------------------------------------------------------------+--------------+
|
||||
```
|
||||
|
||||
### Removing a Distribution
|
||||
Use the remove command to delete a distribution you've previously built.
|
||||
|
||||
```
|
||||
llama stack rm -h
|
||||
usage: llama stack rm [-h] [--all] [name]
|
||||
|
||||
Remove the build stack
|
||||
|
||||
positional arguments:
|
||||
name Name of the stack to delete (default: None)
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--all, -a Delete all stacks (use with caution) (default: False)
|
||||
```
|
||||
|
||||
Example
|
||||
```
|
||||
llama stack rm llamastack-test
|
||||
```
|
||||
|
||||
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
If you encounter any issues, ask questions in our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.
|
||||
|
|
|
@ -44,32 +44,18 @@ providers:
|
|||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
backend: kv_default
|
||||
namespace: agents
|
||||
responses:
|
||||
backend: sql_default
|
||||
table_name: responses
|
||||
persistence_store:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/kvstore.db
|
||||
sql_default:
|
||||
type: sql_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
|
||||
references:
|
||||
metadata:
|
||||
backend: kv_default
|
||||
namespace: registry
|
||||
inference:
|
||||
backend: sql_default
|
||||
table_name: inference_store
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
|
|
|
@ -12,7 +12,7 @@ This avoids the overhead of setting up a server.
|
|||
```bash
|
||||
# setup
|
||||
uv pip install llama-stack
|
||||
llama stack list-deps starter | xargs -L1 uv pip install
|
||||
llama stack build --distro starter --image-type venv
|
||||
```
|
||||
|
||||
```python
|
||||
|
|
|
@ -1,155 +1,56 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
stack_run_config.yaml: |
|
||||
version: '2'
|
||||
image_name: kubernetes-demo
|
||||
apis:
|
||||
- agents
|
||||
- inference
|
||||
- files
|
||||
- safety
|
||||
- telemetry
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: vllm-safety
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
config: {}
|
||||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
|
||||
sql_default:
|
||||
type: sql_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
references:
|
||||
metadata:
|
||||
backend: kv_default
|
||||
namespace: registry
|
||||
inference:
|
||||
backend: sql_default
|
||||
table_name: inference_store
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_config:
|
||||
type: github_token
|
||||
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
|
||||
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
|
||||
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
|
||||
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
|
||||
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
|
||||
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
|
||||
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
|
||||
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
|
||||
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
|
||||
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
|
||||
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
|
||||
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
|
||||
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
|
||||
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
|
||||
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
|
||||
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
|
||||
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
|
||||
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
|
||||
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
|
||||
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n
|
||||
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
|
||||
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
|
||||
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
|
||||
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
|
||||
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
|
||||
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
|
||||
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
|
||||
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
|
||||
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
|
||||
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
|
||||
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
|
||||
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
|
||||
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
|
||||
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
|
||||
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
|
||||
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
|
||||
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
|
||||
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
|
||||
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
|
||||
\ type: github_token\n"
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: llama-stack-config
|
||||
|
|
|
@ -32,17 +32,21 @@ providers:
|
|||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
table_name: files_metadata
|
||||
backend: sql_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
|
@ -52,15 +56,20 @@ providers:
|
|||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
|
@ -84,70 +93,48 @@ providers:
|
|||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
|
||||
sql_default:
|
||||
type: sql_postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
stores:
|
||||
metadata:
|
||||
namespace: registry
|
||||
backend: kv_default
|
||||
inference:
|
||||
table_name: inference_store
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_config:
|
||||
type: github_token
|
||||
telemetry:
|
||||
enabled: true
|
||||
vector_stores:
|
||||
default_provider_id: chromadb
|
||||
default_embedding_model:
|
||||
provider_id: sentence-transformers
|
||||
model_id: nomic-ai/nomic-embed-text-v1.5
|
||||
|
|
|
@ -59,7 +59,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
|
|||
uv venv starter --python 3.12
|
||||
source starter/bin/activate # On Windows: starter\Scripts\activate
|
||||
pip install --no-cache llama-stack==0.2.2
|
||||
llama stack list-deps starter | xargs -L1 uv pip install
|
||||
llama stack build --distro starter --image-type venv
|
||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||
llama stack run starter --port 5050
|
||||
```
|
||||
|
|
|
@ -166,10 +166,10 @@ docker run \
|
|||
|
||||
### Via venv
|
||||
|
||||
Install the distribution dependencies before launching:
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack list-deps dell | xargs -L1 uv pip install
|
||||
llama stack build --distro dell --image-type venv
|
||||
INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
DEH_URL=$DEH_URL \
|
||||
CHROMA_URL=$CHROMA_URL \
|
||||
|
|
|
@ -81,10 +81,10 @@ docker run \
|
|||
|
||||
### Via venv
|
||||
|
||||
Make sure you have the Llama Stack CLI available.
|
||||
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
|
||||
llama stack build --distro meta-reference-gpu --image-type venv
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
llama stack run distributions/meta-reference-gpu/run.yaml \
|
||||
--port 8321
|
||||
|
|
|
@ -136,11 +136,11 @@ docker run \
|
|||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
|
||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||
llama stack list-deps nvidia | xargs -L1 uv pip install
|
||||
llama stack build --distro nvidia --image-type venv
|
||||
NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
llama stack run ./run.yaml \
|
||||
|
|
|
@ -169,11 +169,7 @@ docker run \
|
|||
Ensure you have configured the starter distribution using the environment variables explained above.
|
||||
|
||||
```bash
|
||||
# Install dependencies for the starter distribution
|
||||
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
|
||||
|
||||
# Run the server
|
||||
uv run --with llama-stack llama stack run starter
|
||||
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
|
||||
```
|
||||
|
||||
## Example Usage
|
||||
|
|
|
@ -23,17 +23,6 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
|
|||
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.
|
||||
|
||||
|
||||
## Configure logging
|
||||
|
||||
Control log output via environment variables before starting the server.
|
||||
|
||||
- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug;core=info`.
|
||||
- Supported categories: `all`, `core`, `server`, `router`, `inference`, `agents`, `safety`, `eval`, `tools`, `client`.
|
||||
- Levels: `debug`, `info`, `warning`, `error`, `critical` (default is `info`). Use `all=<level>` to apply globally.
|
||||
- `LLAMA_STACK_LOG_FILE=/path/to/log` mirrors logs to a file while still printing to stdout.
|
||||
|
||||
Export these variables prior to running `llama stack run`, launching a container, or starting the server through any other pathway.
|
||||
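For example, a minimal sketch that enables debug logging for the server component and mirrors output to a file before starting the `starter` distribution (the log file path is illustrative):

```bash
export LLAMA_STACK_LOGGING="server=debug;core=info"
export LLAMA_STACK_LOG_FILE=/tmp/llama-stack-server.log
llama stack run starter
```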
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
|
|
@ -4,24 +4,65 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
import io, requests
|
||||
from openai import OpenAI
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
url="https://www.paulgraham.com/greatwork.html"
|
||||
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
|
||||
models = client.models.list()
|
||||
|
||||
vs = client.vector_stores.create()
|
||||
response = requests.get(url)
|
||||
pseudo_file = io.BytesIO(response.content)  # response.content is already bytes
|
||||
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
|
||||
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
resp = client.responses.create(
|
||||
model="openai/gpt-4o",
|
||||
input="How do you do great work? Use the existing knowledge_search tool.",
|
||||
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
|
||||
include=["file_search_call.results"],
|
||||
vector_db = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
vector_db_id = vector_db.identifier
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=100,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
print(resp)
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
use_stream = True
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=use_stream,
|
||||
)
|
||||
|
||||
# Only call `AgentEventLogger().log(response)` for streaming responses.
|
||||
if use_stream:
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
else:
|
||||
print(response)
|
||||
|
|
|
@ -58,19 +58,15 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
|
|||
|
||||
<Tabs>
|
||||
<TabItem value="venv" label="Using venv">
|
||||
You can use Python to install dependencies and run the Llama Stack server, which is useful for testing and development.
|
||||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
|
||||
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
|
||||
Now let's install dependencies and run the Llama Stack config for Ollama.
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
We use `starter` as the template. By default all providers are disabled, so you must enable Ollama by passing environment variables.
|
||||
|
||||
```bash
|
||||
# Install dependencies for the starter distribution
|
||||
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
|
||||
|
||||
# Run the server
|
||||
llama stack run starter
|
||||
llama stack build --distro starter --image-type venv --run
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="container" label="Using a Container">
|
||||
|
@ -308,7 +304,7 @@ stream = agent.create_turn(
|
|||
for event in AgentEventLogger().log(stream):
|
||||
event.print()
|
||||
```
|
||||
#### ii. Run the Script
|
||||
### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python agent.py
|
||||
|
|
|
@ -24,62 +24,111 @@ ollama run llama3.2:3b --keepalive 60m
|
|||
|
||||
#### Step 2: Run the Llama Stack server
|
||||
|
||||
We will use `uv` to install dependencies and run the Llama Stack server.
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
```bash
|
||||
# Install dependencies for the starter distribution
|
||||
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
|
||||
|
||||
# Run the server
|
||||
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
|
||||
OLLAMA_URL=http://localhost:11434 \
|
||||
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```python
|
||||
import io, requests
|
||||
from openai import OpenAI
|
||||
```python title="demo_script.py"
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
url="https://www.paulgraham.com/greatwork.html"
|
||||
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vs = client.vector_stores.create()
|
||||
response = requests.get(url)
|
||||
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
|
||||
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
|
||||
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
resp = client.responses.create(
|
||||
model="openai/gpt-4o",
|
||||
input="How do you do great work? Use the existing knowledge_search tool.",
|
||||
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
|
||||
include=["file_search_call.results"],
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
vector_db = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
vector_db_id = vector_db.identifier
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=100,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
use_stream = True
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=use_stream,
|
||||
)
|
||||
|
||||
# Only call `AgentEventLogger().log(response)` for streaming responses.
|
||||
if use_stream:
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
else:
|
||||
print(response)
|
||||
```
|
||||
We will use `uv` to run the script:
|
||||
```bash
|
||||
uv run --with llama-stack-client,fire,requests demo_script.py
|
||||
```
|
||||
You should see output like the following.
|
||||
```python
|
||||
>print(resp.output[1].content[0].text)
|
||||
To do great work, consider the following principles:
|
||||
|
||||
1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.
|
||||
|
||||
2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.
|
||||
|
||||
3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.
|
||||
|
||||
4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It’s essential to care for your physical health as well since it directly impacts your mental faculties and morale.
|
||||
|
||||
5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.
|
||||
|
||||
6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.
|
||||
|
||||
By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
:::tip HuggingFace access
|
||||
|
|
|
@ -14,18 +14,16 @@ Meta's reference implementation of an agent system that can use tools, access ve
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
|
||||
| `persistence_store` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
| `responses_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
persistence_store:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/agents_store.db
|
||||
responses_store:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/responses_store.db
|
||||
```
|
||||
|
|
|
@ -14,7 +14,7 @@ Reference implementation of batches API with KVStore persistence.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
|
||||
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
|
||||
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
|
||||
|
||||
|
@ -22,6 +22,6 @@ Reference implementation of batches API with KVStore persistence.
|
|||
|
||||
```yaml
|
||||
kvstore:
|
||||
namespace: batches
|
||||
backend: kv_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
|
||||
```
|
||||
|
|
|
@ -14,12 +14,12 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
namespace: datasetio::localfs
|
||||
backend: kv_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/localfs_datasetio.db
|
||||
```
|
||||
|
|
|
@ -14,12 +14,12 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
namespace: datasetio::huggingface
|
||||
backend: kv_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/huggingface_datasetio.db
|
||||
```
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
---
|
||||
description: "Evaluations
|
||||
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
sidebar_label: Eval
|
||||
title: Eval
|
||||
---
|
||||
|
@ -10,8 +8,6 @@ title: Eval
|
|||
|
||||
## Overview
|
||||
|
||||
Evaluations
|
||||
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates.
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates.
|
||||
|
||||
This section contains documentation for all available providers for the **eval** API.
|
||||
|
|
|
@ -14,12 +14,12 @@ Meta's reference implementation of evaluation tasks with support for multiple la
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
namespace: eval
|
||||
backend: kv_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
|
||||
```
|
||||
|
|
|
@ -240,6 +240,6 @@ additional_pip_packages:
|
|||
- sqlalchemy[asyncio]
|
||||
```
|
||||
|
||||
No other steps are required beyond installing dependencies with `llama stack list-deps <distro> | xargs -L1 uv pip install` and then running `llama stack run`. The CLI will use `module` to install the provider dependencies, retrieve the spec, etc.
|
||||
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc.
|
||||
|
||||
The provider will now be available in Llama Stack with the type `remote::ramalama`.
|
||||
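For example, with a hypothetical distro named `my-distro` whose config references the external provider, the workflow is just the two commands above (a sketch, not tied to a specific distro):

```bash
# Install the distro's dependencies, including the external provider's module
llama stack list-deps my-distro | xargs -L1 uv pip install

# Start the server; the provider is resolved via its `module` at runtime
llama stack run my-distro
```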
|
|
|
@ -15,7 +15,7 @@ Local filesystem-based file storage provider for managing files and documents lo
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
|
||||
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
|
||||
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
|
||||
| `ttl_secs` | `<class 'int'>` | No | 31536000 | |
|
||||
|
||||
## Sample Configuration
|
||||
|
@ -23,6 +23,6 @@ Local filesystem-based file storage provider for managing files and documents lo
|
|||
```yaml
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files}
|
||||
metadata_store:
|
||||
table_name: files_metadata
|
||||
backend: sql_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/files_metadata.db
|
||||
```
|
||||
|
|
|
@ -20,7 +20,7 @@ AWS S3-based file storage provider for scalable cloud file management with metad
|
|||
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
|
||||
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
|
||||
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
|
||||
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
|
||||
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
|
@ -32,6 +32,6 @@ aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
|
|||
endpoint_url: ${env.S3_ENDPOINT_URL:=}
|
||||
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
|
||||
metadata_store:
|
||||
table_name: s3_files_metadata
|
||||
backend: sql_default
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db
|
||||
```
|
||||
|
|
|
@ -3,10 +3,9 @@ description: "Inference
|
|||
|
||||
Llama Stack Inference API for generating completions, chat completions, and embeddings.
|
||||
|
||||
This API provides the raw interface to the underlying models. Three kinds of models are supported:
|
||||
This API provides the raw interface to the underlying models. Two kinds of models are supported:
|
||||
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
|
||||
- Embedding models: these models generate embeddings to be used for semantic search.
|
||||
- Rerank models: these models reorder the documents based on their relevance to a query."
|
||||
- Embedding models: these models generate embeddings to be used for semantic search."
|
||||
sidebar_label: Inference
|
||||
title: Inference
|
||||
---
|
||||
|
@ -19,9 +18,8 @@ Inference
|
|||
|
||||
Llama Stack Inference API for generating completions, chat completions, and embeddings.
|
||||
|
||||
This API provides the raw interface to the underlying models. Three kinds of models are supported:
|
||||
This API provides the raw interface to the underlying models. Two kinds of models are supported:
|
||||
- LLM models: these models generate "raw" and "chat" (conversational) completions.
|
||||
- Embedding models: these models generate embeddings to be used for semantic search.
|
||||
- Rerank models: these models reorder the documents based on their relevance to a query.
|
||||
|
||||
This section contains documentation for all available providers for the **inference** API.
|
||||
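As a quick illustration of the first two model kinds, here is a minimal client-side sketch against a locally running stack, using the OpenAI-compatible endpoint it exposes (the model identifiers below are assumptions; list your models first):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

# LLM model: a chat completion
chat = client.chat.completions.create(
    model="ollama/llama3.2:3b",  # assumed model id; check client.models.list()
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(chat.choices[0].message.content)

# Embedding model: embeddings for semantic search
emb = client.embeddings.create(
    model="ollama/nomic-embed-text-v1.5",  # assumed embedding model id
    input=["Llama Stack provides a unified inference API."],
)
print(len(emb.data[0].embedding))
```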
|
|
|
@ -79,13 +79,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `db_path` | `<class 'str'>` | No | | |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
db_path: ${env.CHROMADB_PATH}
|
||||
persistence:
|
||||
namespace: vector_io::chroma
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_inline_registry.db
|
||||
```
|
||||
|
|
|
@ -95,12 +95,12 @@ more details about Faiss in general.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
persistence:
|
||||
namespace: vector_io::faiss
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
|
||||
```
|
||||
|
|
|
@ -14,14 +14,14 @@ Meta's reference implementation of a vector database.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
persistence:
|
||||
namespace: vector_io::faiss
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
|
||||
```
|
||||
## Deprecation Notice
|
||||
|
||||
|
|
|
@ -17,14 +17,14 @@ Please refer to the remote provider documentation.
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `db_path` | `<class 'str'>` | No | | |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
|
||||
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db
|
||||
persistence:
|
||||
namespace: vector_io::milvus
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_registry.db
|
||||
```
|
||||
|
|
|
@ -98,13 +98,13 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `path` | `<class 'str'>` | No | | |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db
|
||||
persistence:
|
||||
namespace: vector_io::qdrant
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db
|
||||
```
|
||||
|
|
|
@ -408,13 +408,13 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
|
||||
persistence:
|
||||
namespace: vector_io::sqlite_vec
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
|
||||
```
|
||||
|
|
|
@ -17,15 +17,15 @@ Please refer to the sqlite-vec provider documentation.
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
|
||||
persistence:
|
||||
namespace: vector_io::sqlite_vec
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
|
||||
```
|
||||
## Deprecation Notice
|
||||
|
||||
|
|
|
@ -78,13 +78,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
|
|||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `url` | `str \| None` | No | | |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
url: ${env.CHROMADB_URL}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_remote_registry.db
|
||||
```
|
||||
|
|
|
@ -408,7 +408,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
|
|||
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
|
||||
| `token` | `str \| None` | No | | The token of the Milvus server |
|
||||
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
|
||||
| `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
|
||||
|
||||
:::note
|
||||
|
@ -420,7 +420,7 @@ This configuration class accepts additional fields beyond those listed above. Yo
|
|||
```yaml
|
||||
uri: ${env.MILVUS_ENDPOINT}
|
||||
token: ${env.MILVUS_TOKEN}
|
||||
persistence:
|
||||
namespace: vector_io::milvus_remote
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db
|
||||
```
|
||||
|
|
|
@ -218,7 +218,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
|
|||
| `db` | `str \| None` | No | postgres | |
|
||||
| `user` | `str \| None` | No | postgres | |
|
||||
| `password` | `str \| None` | No | mysecretpassword | |
|
||||
| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
|
@ -228,7 +228,7 @@ port: ${env.PGVECTOR_PORT:=5432}
|
|||
db: ${env.PGVECTOR_DB}
|
||||
user: ${env.PGVECTOR_USER}
|
||||
password: ${env.PGVECTOR_PASSWORD}
|
||||
persistence:
|
||||
namespace: vector_io::pgvector
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db
|
||||
```
|
||||
|
|
|
@ -26,13 +26,13 @@ Please refer to the inline provider documentation.
|
|||
| `prefix` | `str \| None` | No | | |
|
||||
| `timeout` | `int \| None` | No | | |
|
||||
| `host` | `str \| None` | No | | |
|
||||
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
api_key: ${env.QDRANT_API_KEY:=}
|
||||
persistence:
|
||||
namespace: vector_io::qdrant_remote
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db
|
||||
```
|
||||
|
|
|
@ -75,14 +75,14 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
|
|||
|-------|------|----------|---------|-------------|
|
||||
| `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
|
||||
| `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
|
||||
| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
weaviate_api_key: null
|
||||
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
|
||||
persistence:
|
||||
namespace: vector_io::weaviate
|
||||
backend: kv_default
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db
|
||||
```
|
||||
|
|
|
@ -32,6 +32,7 @@ Commands:
|
|||
scoring_functions Manage scoring functions.
|
||||
shields Manage safety shield services.
|
||||
toolgroups Manage available tool groups.
|
||||
vector_dbs Manage vector databases.
|
||||
```
|
||||
|
||||
### `llama-stack-client configure`
|
||||
|
@ -210,6 +211,53 @@ Unregister a model from distribution endpoint
|
|||
llama-stack-client models unregister <model_id>
|
||||
```
|
||||
|
||||
## Vector DB Management
|
||||
Manage vector databases.
|
||||
|
||||
|
||||
### `llama-stack-client vector_dbs list`
|
||||
Show available vector dbs on distribution endpoint
|
||||
```bash
|
||||
llama-stack-client vector_dbs list
|
||||
```
|
||||
```
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 768 │
|
||||
│ │ │ │ │ embedding_model: nomic-embed-text-v1.5 │
|
||||
│ │ │ │ │ type: vector_db │
|
||||
│ │ │ │ │ │
|
||||
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
|
||||
```
|
||||
|
||||
### `llama-stack-client vector_dbs register`
|
||||
Create a new vector db
|
||||
```bash
|
||||
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
|
||||
```
|
||||
|
||||
|
||||
Required arguments:
|
||||
- `VECTOR_DB_ID`: Vector DB ID
|
||||
|
||||
Optional arguments:
|
||||
- `--provider-id`: Provider ID for the vector db
|
||||
- `--provider-vector-db-id`: Provider's vector db ID
|
||||
- `--embedding-model`: Embedding model to use. Default: `nomic-embed-text-v1.5`
|
||||
- `--embedding-dimension`: Dimension of embeddings. Default: 768
|
||||
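For example, a registration using the documented defaults might look like this (the vector db id and provider here are illustrative):

```bash
llama-stack-client vector_dbs register my_demo_vector_db \
  --provider-id faiss \
  --embedding-model nomic-embed-text-v1.5 \
  --embedding-dimension 768
```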
|
||||
### `llama-stack-client vector_dbs unregister`
|
||||
Delete a vector db
|
||||
```bash
|
||||
llama-stack-client vector_dbs unregister <vector-db-id>
|
||||
```
|
||||
|
||||
|
||||
Required arguments:
|
||||
- `VECTOR_DB_ID`: Vector DB ID
|
||||
|
||||
|
||||
## Shield Management
|
||||
Manage safety shield services.
|
||||
### `llama-stack-client shields list`
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -2864,7 +2864,7 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"!llama stack list-deps experimental-post-training | xargs -L1 uv pip install"
|
||||
"!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
"source": [
|
||||
"# NBVAL_SKIP\n",
|
||||
"!pip install -U llama-stack\n",
|
||||
"llama stack list-deps fireworks | xargs -L1 uv pip install\n"
|
||||
"!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
|
@ -136,8 +136,7 @@
|
|||
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
|
||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||
" process = subprocess.Popen(\n",
|
||||
" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
|
||||
" \"uv run --with llama-stack llama stack run starter\",\n",
|
||||
" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
|
||||
" shell=True,\n",
|
||||
" stdout=log_file,\n",
|
||||
" stderr=log_file,\n",
|
||||
|
@ -173,7 +172,7 @@
|
|||
"\n",
|
||||
"def kill_llama_stack_server():\n",
|
||||
" # Kill any existing llama stack server processes using pkill command\n",
|
||||
" os.system(\"pkill -f llama_stack.core.server.server\")\n"
|
||||
" os.system(\"pkill -f llama_stack.core.server.server\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -105,8 +105,7 @@
|
|||
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
|
||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||
" process = subprocess.Popen(\n",
|
||||
" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
|
||||
" \"uv run --with llama-stack llama stack run starter\",\n",
|
||||
" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
|
||||
" shell=True,\n",
|
||||
" stdout=log_file,\n",
|
||||
" stderr=log_file,\n",
|
||||
|
|
|
@ -92,7 +92,7 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"```bash\n",
|
||||
"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
|
||||
"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -81,7 +81,7 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"```bash\n",
|
||||
"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
|
||||
"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -30,5 +30,3 @@ fi
|
|||
stack_dir=$(dirname $(dirname $THIS_DIR))
|
||||
PYTHONPATH=$PYTHONPATH:$stack_dir \
|
||||
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/static
|
||||
|
||||
cp $stack_dir/docs/static/stainless-llama-stack-spec.yaml $stack_dir/client-sdks/stainless/openapi.yml
|
||||
|
|
|
@ -1,399 +1,366 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c1e7571c",
|
||||
"metadata": {
|
||||
"id": "c1e7571c"
|
||||
},
|
||||
"source": [
|
||||
"[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
|
||||
"\n",
|
||||
"# Llama Stack - Building AI Applications\n",
|
||||
"\n",
|
||||
"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
|
||||
"\n",
|
||||
"Get started with Llama Stack in minutes!\n",
|
||||
"\n",
|
||||
"[Llama Stack](https://github.com/meta-llama/llama-stack) is a stateful service with REST APIs to support the seamless transition of AI applications across different environments. You can build and test using a local server first and deploy to a hosted endpoint for production.\n",
|
||||
"\n",
|
||||
"In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)\n",
|
||||
"as the inference [provider](docs/source/providers/index.md#inference) for a Llama Model.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4CV1Q19BDMVw",
|
||||
"metadata": {
|
||||
"id": "4CV1Q19BDMVw"
|
||||
},
|
||||
"source": [
|
||||
"## Step 1: Install and setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "K4AvfUAJZOeS",
|
||||
"metadata": {
|
||||
"id": "K4AvfUAJZOeS"
|
||||
},
|
||||
"source": [
|
||||
"### 1.1. Install uv and test inference with Ollama\n",
|
||||
"\n",
|
||||
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a2d7b85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install uv llama_stack llama-stack-client\n",
|
||||
"\n",
|
||||
"## If running on Collab:\n",
|
||||
"# !pip install colab-xterm\n",
|
||||
"# %load_ext colabxterm\n",
|
||||
"\n",
|
||||
"!curl https://ollama.ai/install.sh | sh"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39fa584b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.2. Test inference with Ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3bf81522",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We’ll now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a7e8e0f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## If running on Colab:\n",
|
||||
"# %xterm\n",
|
||||
"\n",
|
||||
"## To be ran in the terminal:\n",
|
||||
"# ollama serve &\n",
|
||||
"# ollama run llama3.2:3b --keepalive 60m"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f3c5f243",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If successful, you should see the model respond to a prompt.\n",
|
||||
"\n",
|
||||
"...\n",
|
||||
"```\n",
|
||||
">>> hi\n",
|
||||
"Hello! How can I assist you today?\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "oDUB7M_qe-Gs",
|
||||
"metadata": {
|
||||
"id": "oDUB7M_qe-Gs"
|
||||
},
|
||||
"source": [
|
||||
"## Step 2: Run the Llama Stack server\n",
|
||||
"\n",
|
||||
"In this showcase, we will start a Llama Stack server that is running locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "732eadc6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.1. Setup the Llama Stack Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "J2kGed0R5PSf",
|
||||
"metadata": {
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c1e7571c",
|
||||
"metadata": {
|
||||
"id": "c1e7571c"
|
||||
},
|
||||
"source": [
|
||||
"[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
|
||||
"\n",
|
||||
"# Llama Stack - Building AI Applications\n",
|
||||
"\n",
|
||||
"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
|
||||
"\n",
|
||||
"Get started with Llama Stack in minutes!\n",
|
||||
"\n",
|
||||
"[Llama Stack](https://github.com/meta-llama/llama-stack) is a stateful service with REST APIs to support the seamless transition of AI applications across different environments. You can build and test using a local server first and deploy to a hosted endpoint for production.\n",
|
||||
"\n",
|
||||
"In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)\n",
|
||||
"as the inference [provider](docs/source/providers/index.md#inference) for a Llama Model.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4CV1Q19BDMVw",
|
||||
"metadata": {
|
||||
"id": "4CV1Q19BDMVw"
|
||||
},
|
||||
"source": [
|
||||
"## Step 1: Install and setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "K4AvfUAJZOeS",
|
||||
"metadata": {
|
||||
"id": "K4AvfUAJZOeS"
|
||||
},
|
||||
"source": [
|
||||
"### 1.1. Install uv and test inference with Ollama\n",
|
||||
"\n",
|
||||
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a2d7b85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install uv llama_stack llama-stack-client\n",
|
||||
"\n",
|
||||
"## If running on Collab:\n",
|
||||
"# !pip install colab-xterm\n",
|
||||
"# %load_ext colabxterm\n",
|
||||
"\n",
|
||||
"!curl https://ollama.ai/install.sh | sh"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39fa584b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.2. Test inference with Ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3bf81522",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We’ll now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a7e8e0f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## If running on Colab:\n",
|
||||
"# %xterm\n",
|
||||
"\n",
|
||||
"## To be ran in the terminal:\n",
|
||||
"# ollama serve &\n",
|
||||
"# ollama run llama3.2:3b --keepalive 60m"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f3c5f243",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If successful, you should see the model respond to a prompt.\n",
|
||||
"\n",
|
||||
"...\n",
|
||||
"```\n",
|
||||
">>> hi\n",
|
||||
"Hello! How can I assist you today?\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "oDUB7M_qe-Gs",
|
||||
"metadata": {
|
||||
"id": "oDUB7M_qe-Gs"
|
||||
},
|
||||
"source": [
|
||||
"## Step 2: Run the Llama Stack server\n",
|
||||
"\n",
|
||||
"In this showcase, we will start a Llama Stack server that is running locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "732eadc6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.1. Setup the Llama Stack Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "J2kGed0R5PSf",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"collapsed": true,
|
||||
"id": "J2kGed0R5PSf",
|
||||
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import subprocess\n",
|
||||
"\n",
|
||||
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
|
||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||
"\n",
|
||||
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
|
||||
"!uv run --with llama-stack llama stack build --distro starter\n",
|
||||
"\n",
|
||||
"def run_llama_stack_server_background():\n",
|
||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||
" process = subprocess.Popen(\n",
|
||||
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
|
||||
" shell=True,\n",
|
||||
" stdout=log_file,\n",
|
||||
" stderr=log_file,\n",
|
||||
" text=True\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
|
||||
" return process\n",
|
||||
"\n",
|
||||
"def wait_for_server_to_start():\n",
|
||||
" import requests\n",
|
||||
" from requests.exceptions import ConnectionError\n",
|
||||
" import time\n",
|
||||
"\n",
|
||||
" url = \"http://0.0.0.0:8321/v1/health\"\n",
|
||||
" max_retries = 30\n",
|
||||
" retry_interval = 1\n",
|
||||
"\n",
|
||||
" print(\"Waiting for server to start\", end=\"\")\n",
|
||||
" for _ in range(max_retries):\n",
|
||||
" try:\n",
|
||||
" response = requests.get(url)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" print(\"\\nServer is ready!\")\n",
|
||||
" return True\n",
|
||||
" except ConnectionError:\n",
|
||||
" print(\".\", end=\"\", flush=True)\n",
|
||||
" time.sleep(retry_interval)\n",
|
||||
"\n",
|
||||
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# use this helper if needed to kill the server\n",
|
||||
"def kill_llama_stack_server():\n",
|
||||
" # Kill any existing llama stack server processes\n",
|
||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c40e9efd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2. Start the Llama Stack Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f779283d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Starting Llama Stack server with PID: 787100\n",
|
||||
"Waiting for server to start\n",
|
||||
"Server is ready!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"server_process = run_llama_stack_server_background()\n",
|
||||
"assert wait_for_server_to_start()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "28477c03",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Run the demo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "7da71011",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html\n",
|
||||
"prompt> How do you do great work?\n",
|
||||
"\u001b[33minference> \u001b[0m\u001b[33m[k\u001b[0m\u001b[33mnowledge\u001b[0m\u001b[33m_search\u001b[0m\u001b[33m(query\u001b[0m\u001b[33m=\"\u001b[0m\u001b[33mWhat\u001b[0m\u001b[33m is\u001b[0m\u001b[33m the\u001b[0m\u001b[33m key\u001b[0m\u001b[33m to\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m\")]\u001b[0m\u001b[97m\u001b[0m\n",
|
||||
"\u001b[32mtool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}\u001b[0m\n",
|
||||
"\u001b[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n', type='text'), TextContentItem(text=\"Result 1:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 2:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 3:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 4:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 5:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text='END of knowledge_search tool results.\\n', type='text'), TextContentItem(text='The above results were retrieved to help answer the user\\'s query: \"What is the key to doing great work\". Use them as supporting information only in answering this query.\\n', type='text')]\u001b[0m\n",
|
||||
"\u001b[33minference> \u001b[0m\u001b[33mDoing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m means\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m something\u001b[0m\u001b[33m important\u001b[0m\u001b[33m so\u001b[0m\u001b[33m well\u001b[0m\u001b[33m that\u001b[0m\u001b[33m you\u001b[0m\u001b[33m expand\u001b[0m\u001b[33m people\u001b[0m\u001b[33m's\u001b[0m\u001b[33m ideas\u001b[0m\u001b[33m of\u001b[0m\u001b[33m what\u001b[0m\u001b[33m's\u001b[0m\u001b[33m possible\u001b[0m\u001b[33m.\u001b[0m\u001b[33m However\u001b[0m\u001b[33m,\u001b[0m\u001b[33m there\u001b[0m\u001b[33m's\u001b[0m\u001b[33m no\u001b[0m\u001b[33m threshold\u001b[0m\u001b[33m for\u001b[0m\u001b[33m importance\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m's\u001b[0m\u001b[33m often\u001b[0m\u001b[33m hard\u001b[0m\u001b[33m to\u001b[0m\u001b[33m judge\u001b[0m\u001b[33m at\u001b[0m\u001b[33m the\u001b[0m\u001b[33m time\u001b[0m\u001b[33m anyway\u001b[0m\u001b[33m.\u001b[0m\u001b[33m Great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m is\u001b[0m\u001b[33m a\u001b[0m\u001b[33m matter\u001b[0m\u001b[33m of\u001b[0m\u001b[33m degree\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m can\u001b[0m\u001b[33m be\u001b[0m\u001b[33m difficult\u001b[0m\u001b[33m to\u001b[0m\u001b[33m determine\u001b[0m\u001b[33m whether\u001b[0m\u001b[33m someone\u001b[0m\u001b[33m has\u001b[0m\u001b[33m done\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m until\u001b[0m\u001b[33m after\u001b[0m\u001b[33m the\u001b[0m\u001b[33m fact\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
|
||||
"\u001b[30m\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
|
||||
"\n",
|
||||
"vector_db_id = \"my_demo_vector_db\"\n",
|
||||
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
|
||||
"\n",
|
||||
"models = client.models.list()\n",
|
||||
"\n",
|
||||
"# Select the first ollama and first ollama's embedding model\n",
|
||||
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
|
||||
"embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
|
||||
"embedding_model_id = embedding_model.identifier\n",
|
||||
"embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
|
||||
"\n",
|
||||
"_ = client.vector_dbs.register(\n",
|
||||
" vector_db_id=vector_db_id,\n",
|
||||
" embedding_model=embedding_model_id,\n",
|
||||
" embedding_dimension=embedding_dimension,\n",
|
||||
" provider_id=\"faiss\",\n",
|
||||
")\n",
|
||||
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
|
||||
"print(\"rag_tool> Ingesting document:\", source)\n",
|
||||
"document = RAGDocument(\n",
|
||||
" document_id=\"document_1\",\n",
|
||||
" content=source,\n",
|
||||
" mime_type=\"text/html\",\n",
|
||||
" metadata={},\n",
|
||||
")\n",
|
||||
"client.tool_runtime.rag_tool.insert(\n",
|
||||
" documents=[document],\n",
|
||||
" vector_db_id=vector_db_id,\n",
|
||||
" chunk_size_in_tokens=50,\n",
|
||||
")\n",
|
||||
"agent = Agent(\n",
|
||||
" client,\n",
|
||||
" model=model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" tools=[\n",
|
||||
" {\n",
|
||||
" \"name\": \"builtin::rag/knowledge_search\",\n",
|
||||
" \"args\": {\"vector_db_ids\": [vector_db_id]},\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"prompt = \"How do you do great work?\"\n",
|
||||
"print(\"prompt>\", prompt)\n",
|
||||
"\n",
|
||||
"response = agent.create_turn(\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
||||
" session_id=agent.create_session(\"rag_session\"),\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for log in AgentEventLogger().log(response):\n",
|
||||
" log.print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "341aaadf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e88e1185",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next Steps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bcb73600",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now you're ready to dive deeper into Llama Stack!\n",
|
||||
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
|
||||
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
|
||||
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
|
||||
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
|
||||
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
|
||||
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
|
||||
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
"gpuType": "T4",
|
||||
"provenance": []
|
||||
},
|
||||
"id": "J2kGed0R5PSf",
|
||||
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
|
||||
"\u001b[2mAudited \u001b[1m52 packages\u001b[0m \u001b[2min 1.56s\u001b[0m\u001b[0m\n",
|
||||
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
|
||||
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 122ms\u001b[0m\u001b[0m\n",
|
||||
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
|
||||
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 197ms\u001b[0m\u001b[0m\n",
|
||||
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
|
||||
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 11ms\u001b[0m\u001b[0m\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import subprocess\n",
|
||||
"\n",
|
||||
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
|
||||
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
|
||||
"\n",
|
||||
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
|
||||
"!uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\n",
|
||||
"\n",
|
||||
"def run_llama_stack_server_background():\n",
|
||||
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
|
||||
" process = subprocess.Popen(\n",
|
||||
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\",\n",
|
||||
" shell=True,\n",
|
||||
" stdout=log_file,\n",
|
||||
" stderr=log_file,\n",
|
||||
" text=True\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
|
||||
" return process\n",
|
||||
"\n",
|
||||
"def wait_for_server_to_start():\n",
|
||||
" import requests\n",
|
||||
" from requests.exceptions import ConnectionError\n",
|
||||
" import time\n",
|
||||
"\n",
|
||||
" url = \"http://0.0.0.0:8321/v1/health\"\n",
|
||||
" max_retries = 30\n",
|
||||
" retry_interval = 1\n",
|
||||
"\n",
|
||||
" print(\"Waiting for server to start\", end=\"\")\n",
|
||||
" for _ in range(max_retries):\n",
|
||||
" try:\n",
|
||||
" response = requests.get(url)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" print(\"\\nServer is ready!\")\n",
|
||||
" return True\n",
|
||||
" except ConnectionError:\n",
|
||||
" print(\".\", end=\"\", flush=True)\n",
|
||||
" time.sleep(retry_interval)\n",
|
||||
"\n",
|
||||
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# use this helper if needed to kill the server\n",
|
||||
"def kill_llama_stack_server():\n",
|
||||
" # Kill any existing llama stack server processes\n",
|
||||
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c40e9efd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2. Start the Llama Stack Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f779283d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Starting Llama Stack server with PID: 20778\n",
|
||||
"Waiting for server to start........\n",
|
||||
"Server is ready!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"server_process = run_llama_stack_server_background()\n",
|
||||
"assert wait_for_server_to_start()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "28477c03",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Run the demo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7da71011",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/files \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/vector_stores \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/conversations \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/responses \"HTTP/1.1 200 OK\"\n"
|
||||
]
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"prompt> How do you do great work?\n",
|
||||
"🤔 Doing great work involves a combination of skills, habits, and mindsets. Here are some key principles:\n",
|
||||
"\n",
|
||||
"1. **Set Clear Goals**: Start with a clear vision of what you want to achieve. Define specific, measurable, achievable, relevant, and time-bound (SMART) goals.\n",
|
||||
"\n",
|
||||
"2. **Plan and Prioritize**: Break your goals into smaller, manageable tasks. Prioritize these tasks based on their importance and urgency.\n",
|
||||
"\n",
|
||||
"3. **Focus on Quality**: Aim for high-quality outcomes rather than just finishing tasks. Pay attention to detail, and ensure your work meets or exceeds standards.\n",
|
||||
"\n",
|
||||
"4. **Stay Organized**: Keep your workspace, both physical and digital, organized to help you stay focused and efficient.\n",
|
||||
"\n",
|
||||
"5. **Manage Your Time**: Use time management techniques such as the Pomodoro Technique, time blocking, or the Eisenhower Box to maximize productivity.\n",
|
||||
"\n",
|
||||
"6. **Seek Feedback and Learn**: Regularly seek feedback from peers, mentors, or supervisors. Use constructive criticism to improve continuously.\n",
|
||||
"\n",
|
||||
"7. **Innovate and Improve**: Look for ways to improve processes or introduce new ideas. Be open to change and willing to adapt.\n",
|
||||
"\n",
|
||||
"8. **Stay Motivated and Persistent**: Keep your end goals in mind to stay motivated. Overcome setbacks with resilience and persistence.\n",
|
||||
"\n",
|
||||
"9. **Balance and Rest**: Ensure you maintain a healthy work-life balance. Take breaks and manage stress to sustain long-term productivity.\n",
|
||||
"\n",
|
||||
"10. **Reflect and Adjust**: Regularly assess your progress and adjust your strategies as needed. Reflect on what works well and what doesn't.\n",
|
||||
"\n",
|
||||
"By incorporating these elements, you can consistently produce high-quality work and achieve excellence in your endeavors.\n"
|
||||
]
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"vector_store_id = \"my_demo_vector_db\"\n",
|
||||
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
|
||||
"\n",
|
||||
"models = client.models.list()\n",
|
||||
"\n",
|
||||
"# Select the first ollama and first ollama's embedding model\n",
|
||||
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
|
||||
"response = requests.get(source)\n",
|
||||
"file = client.files.create(\n",
|
||||
" file=response.content,\n",
|
||||
" purpose='assistants'\n",
|
||||
")\n",
|
||||
"vector_store = client.vector_stores.create(\n",
|
||||
" name=vector_store_id,\n",
|
||||
" file_ids=[file.id],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"agent = Agent(\n",
|
||||
" client,\n",
|
||||
" model=model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" tools=[\n",
|
||||
" {\n",
|
||||
" \"type\": \"file_search\",\n",
|
||||
" \"vector_store_ids\": [vector_store_id],\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"prompt = \"How do you do great work?\"\n",
|
||||
"print(\"prompt>\", prompt)\n",
|
||||
"\n",
|
||||
"response = agent.create_turn(\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
||||
" session_id=agent.create_session(\"rag_session\"),\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for log in AgentEventLogger().log(response):\n",
|
||||
" print(log, end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "341aaadf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e88e1185",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next Steps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bcb73600",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now you're ready to dive deeper into Llama Stack!\n",
|
||||
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
|
||||
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
|
||||
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
|
||||
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
|
||||
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
|
||||
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
|
||||
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"gpuType": "T4",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.12"
|
||||
}
|
||||
},
"nbformat": 4,
"nbformat_minor": 5
}
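The notebook diff above replaces the RAGDocument / tool_runtime.rag_tool ingestion path with the Files + Vector Stores path consumed by the file_search tool. As a reading aid, here is a minimal Python sketch of the two ingestion styles, using only identifiers that appear in the cells above. It assumes a server at http://0.0.0.0:8321 and, for the old style, a previously registered vector DB named my_demo_vector_db; treat it as illustrative rather than a complete program.

import requests
from llama_stack_client import LlamaStackClient, RAGDocument

client = LlamaStackClient(base_url="http://0.0.0.0:8321")
source = "https://www.paulgraham.com/greatwork.html"

# Old style (removed by the diff): insert a document into a vector DB and
# give the agent the builtin RAG tool pointed at that DB.
document = RAGDocument(document_id="document_1", content=source, mime_type="text/html", metadata={})
client.tool_runtime.rag_tool.insert(documents=[document], vector_db_id="my_demo_vector_db", chunk_size_in_tokens=50)
old_tools = [{"name": "builtin::rag/knowledge_search", "args": {"vector_db_ids": ["my_demo_vector_db"]}}]

# New style (added by the diff): upload the page as a file, build a vector
# store from it, and give the agent the file_search tool.
file = client.files.create(file=requests.get(source).content, purpose="assistants")
vector_store = client.vector_stores.create(name="my_demo_vector_db", file_ids=[file.id])
# The cell above passes the store's name here; the store id is used instead in this sketch.
new_tools = [{"type": "file_search", "vector_store_ids": [vector_store.id]}]

Either tools list can then be passed to Agent(client, model=model_id, instructions=..., tools=...) exactly as the cells above do.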
@ -47,11 +47,11 @@ function QuickStart() {
<pre><code>{`# Install uv and start Ollama
ollama run llama3.2:3b --keepalive 60m

# Install server dependencies
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install

# Run Llama Stack server
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
OLLAMA_URL=http://localhost:11434 \\
uv run --with llama-stack \\
llama stack build --distro starter \\
--image-type venv --run

# Try the Python SDK
from llama_stack_client import LlamaStackClient
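The hunk above is cut off after the `# Try the Python SDK` import; the lines that follow on the page are not shown here. As a hedged sketch (not the actual continuation), the client can be exercised like this, using only calls that already appear elsewhere in this diff and assuming the server started above is listening on port 8321:

from llama_stack_client import LlamaStackClient

# Point the client at the locally running stack (default port 8321).
client = LlamaStackClient(base_url="http://localhost:8321")

# List the models the server exposes and pick the first ollama-served LLM,
# mirroring the selection logic used in the notebook above.
models = client.models.list()
model_id = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama").identifier
print("Using model:", model_id)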
18 docs/static/deprecated-llama-stack-spec.html vendored
|
@ -5547,7 +5547,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -5798,7 +5798,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -9024,10 +9024,6 @@
|
|||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
},
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@ -9905,10 +9901,6 @@
|
|||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -13457,8 +13449,8 @@
|
|||
},
|
||||
{
|
||||
"name": "Eval",
|
||||
"description": "Llama Stack Evaluation API for running evaluations on model and agent candidates.",
|
||||
"x-displayName": "Evaluations"
|
||||
"description": "",
|
||||
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
},
|
||||
{
|
||||
"name": "Files",
|
||||
|
@ -13467,7 +13459,7 @@
|
|||
},
|
||||
{
|
||||
"name": "Inference",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
|
||||
"x-displayName": "Inference"
|
||||
},
|
||||
{
|
||||
|
|
23 docs/static/deprecated-llama-stack-spec.yaml vendored
|
@ -4114,7 +4114,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -4303,7 +4303,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -6734,10 +6734,6 @@ components:
|
|||
$ref: '#/components/schemas/OpenAIResponseUsage'
|
||||
description: >-
|
||||
(Optional) Token usage information for the response
|
||||
instructions:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) System message inserted into the model's context
|
||||
input:
|
||||
type: array
|
||||
items:
|
||||
|
@ -7407,10 +7403,6 @@ components:
|
|||
$ref: '#/components/schemas/OpenAIResponseUsage'
|
||||
description: >-
|
||||
(Optional) Token usage information for the response
|
||||
instructions:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) System message inserted into the model's context
|
||||
additionalProperties: false
|
||||
required:
|
||||
- created_at
|
||||
|
@ -10204,9 +10196,9 @@ tags:
|
|||
- name: Datasets
|
||||
description: ''
|
||||
- name: Eval
|
||||
description: >-
|
||||
description: ''
|
||||
x-displayName: >-
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates.
|
||||
x-displayName: Evaluations
|
||||
- name: Files
|
||||
description: >-
|
||||
This API is used to upload documents that can be used with other Llama Stack
|
||||
|
@ -10218,16 +10210,13 @@ tags:
|
|||
embeddings.
|
||||
|
||||
|
||||
This API provides the raw interface to the underlying models. Three kinds of
|
||||
models are supported:
|
||||
This API provides the raw interface to the underlying models. Two kinds of models
|
||||
are supported:
|
||||
|
||||
- LLM models: these models generate "raw" and "chat" (conversational) completions.
|
||||
|
||||
- Embedding models: these models generate embeddings to be used for semantic
|
||||
search.
|
||||
|
||||
- Rerank models: these models reorder the documents based on their relevance
|
||||
to a query.
|
||||
x-displayName: Inference
|
||||
- name: Models
|
||||
description: ''
|
||||
|
|
|
@ -1850,7 +1850,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -3983,7 +3983,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -5518,8 +5518,8 @@
|
|||
},
|
||||
{
|
||||
"name": "Eval",
|
||||
"description": "Llama Stack Evaluation API for running evaluations on model and agent candidates.",
|
||||
"x-displayName": "Evaluations"
|
||||
"description": "",
|
||||
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
},
|
||||
{
|
||||
"name": "PostTraining (Coming Soon)",
|
||||
|
|
|
@ -1320,7 +1320,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -2927,7 +2927,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -4119,9 +4119,9 @@ tags:
|
|||
- name: Datasets
|
||||
description: ''
|
||||
- name: Eval
|
||||
description: >-
|
||||
description: ''
|
||||
x-displayName: >-
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates.
|
||||
x-displayName: Evaluations
|
||||
- name: PostTraining (Coming Soon)
|
||||
description: ''
|
||||
x-tagGroups:
|
||||
|
|
138 docs/static/llama-stack-spec.html vendored
|
@ -282,7 +282,7 @@
|
|||
"Conversations"
|
||||
],
|
||||
"summary": "Create a conversation.",
|
||||
"description": "Create a conversation.\nCreate a conversation.",
|
||||
"description": "Create a conversation.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -326,8 +326,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Retrieve a conversation.",
|
||||
"description": "Retrieve a conversation.\nGet a conversation with the given ID.",
|
||||
"summary": "Get a conversation with the given ID.",
|
||||
"description": "Get a conversation with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -369,8 +369,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Update a conversation.",
|
||||
"description": "Update a conversation.\nUpdate a conversation's metadata with the given ID.",
|
||||
"summary": "Update a conversation's metadata with the given ID.",
|
||||
"description": "Update a conversation's metadata with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -422,8 +422,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Delete a conversation.",
|
||||
"description": "Delete a conversation.\nDelete a conversation with the given ID.",
|
||||
"summary": "Delete a conversation with the given ID.",
|
||||
"description": "Delete a conversation with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -467,8 +467,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "List items.",
|
||||
"description": "List items.\nList items in the conversation.",
|
||||
"summary": "List items in the conversation.",
|
||||
"description": "List items in the conversation.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -483,53 +483,86 @@
|
|||
"name": "after",
|
||||
"in": "query",
|
||||
"description": "An item ID to list items after, used in pagination.",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "include",
|
||||
"in": "query",
|
||||
"description": "Specify additional output data to include in the response.",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"web_search_call.action.sources",
|
||||
"code_interpreter_call.outputs",
|
||||
"computer_call_output.output.image_url",
|
||||
"file_search_call.results",
|
||||
"message.input_image.image_url",
|
||||
"message.output_text.logprobs",
|
||||
"reasoning.encrypted_content"
|
||||
],
|
||||
"title": "ConversationItemInclude",
|
||||
"description": "Specify additional output data to include in the model response."
|
||||
}
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"code_interpreter_call.outputs",
|
||||
"computer_call_output.output.image_url",
|
||||
"file_search_call.results",
|
||||
"message.input_image.image_url",
|
||||
"message.output_text.logprobs",
|
||||
"reasoning.encrypted_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "limit",
|
||||
"in": "query",
|
||||
"description": "A limit on the number of objects to be returned (1-100, default 20).",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "integer"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "order",
|
||||
"in": "query",
|
||||
"description": "The order to return items in (asc or desc, default desc).",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"asc",
|
||||
"desc"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"asc",
|
||||
"desc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
@ -564,8 +597,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Create items.",
|
||||
"description": "Create items.\nCreate items in the conversation.",
|
||||
"summary": "Create items in the conversation.",
|
||||
"description": "Create items in the conversation.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -619,8 +652,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Retrieve an item.",
|
||||
"description": "Retrieve an item.\nRetrieve a conversation item.",
|
||||
"summary": "Retrieve a conversation item.",
|
||||
"description": "Retrieve a conversation item.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -671,8 +704,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Delete an item.",
|
||||
"description": "Delete an item.\nDelete a conversation item.",
|
||||
"summary": "Delete a conversation item.",
|
||||
"description": "Delete a conversation item.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -6767,7 +6800,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -6826,8 +6859,7 @@
|
|||
"type": "string",
|
||||
"enum": [
|
||||
"llm",
|
||||
"embedding",
|
||||
"rerank"
|
||||
"embedding"
|
||||
],
|
||||
"title": "ModelType",
|
||||
"description": "Enumeration of supported model types in Llama Stack."
|
||||
|
@ -7568,10 +7600,6 @@
|
|||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
},
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@ -8120,10 +8148,6 @@
|
|||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -10173,7 +10197,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -10655,7 +10679,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -11708,7 +11732,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -13227,8 +13251,8 @@
|
|||
},
|
||||
{
|
||||
"name": "Conversations",
|
||||
"description": "Protocol for conversation management operations.",
|
||||
"x-displayName": "Conversations"
|
||||
"description": "",
|
||||
"x-displayName": "Protocol for conversation management operations."
|
||||
},
|
||||
{
|
||||
"name": "Files",
|
||||
|
@ -13237,7 +13261,7 @@
|
|||
},
|
||||
{
|
||||
"name": "Inference",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
|
||||
"x-displayName": "Inference"
|
||||
},
|
||||
{
|
||||
|
|
228 docs/static/llama-stack-spec.yaml vendored
|
@ -192,10 +192,7 @@ paths:
|
|||
tags:
|
||||
- Conversations
|
||||
summary: Create a conversation.
|
||||
description: >-
|
||||
Create a conversation.
|
||||
|
||||
Create a conversation.
|
||||
description: Create a conversation.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
|
@ -225,11 +222,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Retrieve a conversation.
|
||||
description: >-
|
||||
Retrieve a conversation.
|
||||
|
||||
Get a conversation with the given ID.
|
||||
summary: Get a conversation with the given ID.
|
||||
description: Get a conversation with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -258,10 +252,9 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Update a conversation.
|
||||
summary: >-
|
||||
Update a conversation's metadata with the given ID.
|
||||
description: >-
|
||||
Update a conversation.
|
||||
|
||||
Update a conversation's metadata with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
|
@ -297,11 +290,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Delete a conversation.
|
||||
description: >-
|
||||
Delete a conversation.
|
||||
|
||||
Delete a conversation with the given ID.
|
||||
summary: Delete a conversation with the given ID.
|
||||
description: Delete a conversation with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -331,11 +321,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: List items.
|
||||
description: >-
|
||||
List items.
|
||||
|
||||
List items in the conversation.
|
||||
summary: List items in the conversation.
|
||||
description: List items in the conversation.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -347,46 +334,146 @@ paths:
|
|||
in: query
|
||||
description: >-
|
||||
An item ID to list items after, used in pagination.
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: include
|
||||
in: query
|
||||
description: >-
|
||||
Specify additional output data to include in the response.
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
enum:
|
||||
- web_search_call.action.sources
|
||||
- code_interpreter_call.outputs
|
||||
- computer_call_output.output.image_url
|
||||
- file_search_call.results
|
||||
- message.input_image.image_url
|
||||
- message.output_text.logprobs
|
||||
- reasoning.encrypted_content
|
||||
title: ConversationItemInclude
|
||||
description: >-
|
||||
Specify additional output data to include in the model response.
|
||||
oneOf:
|
||||
- type: array
|
||||
items:
|
||||
type: string
|
||||
enum:
|
||||
- code_interpreter_call.outputs
|
||||
- computer_call_output.output.image_url
|
||||
- file_search_call.results
|
||||
- message.input_image.image_url
|
||||
- message.output_text.logprobs
|
||||
- reasoning.encrypted_content
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: limit
|
||||
in: query
|
||||
description: >-
|
||||
A limit on the number of objects to be returned (1-100, default 20).
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
oneOf:
|
||||
- type: integer
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: order
|
||||
in: query
|
||||
description: >-
|
||||
The order to return items in (asc or desc, default desc).
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
enum:
|
||||
- asc
|
||||
- desc
|
||||
oneOf:
|
||||
- type: string
|
||||
enum:
|
||||
- asc
|
||||
- desc
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
deprecated: false
|
||||
post:
|
||||
responses:
|
||||
|
@ -408,11 +495,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Create items.
|
||||
description: >-
|
||||
Create items.
|
||||
|
||||
Create items in the conversation.
|
||||
summary: Create items in the conversation.
|
||||
description: Create items in the conversation.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -448,11 +532,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Retrieve an item.
|
||||
description: >-
|
||||
Retrieve an item.
|
||||
|
||||
Retrieve a conversation item.
|
||||
summary: Retrieve a conversation item.
|
||||
description: Retrieve a conversation item.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -487,11 +568,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Delete an item.
|
||||
description: >-
|
||||
Delete an item.
|
||||
|
||||
Delete a conversation item.
|
||||
summary: Delete a conversation item.
|
||||
description: Delete a conversation item.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -5127,7 +5205,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -5169,7 +5247,6 @@ components:
|
|||
enum:
|
||||
- llm
|
||||
- embedding
|
||||
- rerank
|
||||
title: ModelType
|
||||
description: >-
|
||||
Enumeration of supported model types in Llama Stack.
|
||||
|
@ -5716,10 +5793,6 @@ components:
|
|||
$ref: '#/components/schemas/OpenAIResponseUsage'
|
||||
description: >-
|
||||
(Optional) Token usage information for the response
|
||||
instructions:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) System message inserted into the model's context
|
||||
input:
|
||||
type: array
|
||||
items:
|
||||
|
@ -6123,10 +6196,6 @@ components:
|
|||
$ref: '#/components/schemas/OpenAIResponseUsage'
|
||||
description: >-
|
||||
(Optional) Token usage information for the response
|
||||
instructions:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) System message inserted into the model's context
|
||||
additionalProperties: false
|
||||
required:
|
||||
- created_at
|
||||
|
@ -7820,7 +7889,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -8128,7 +8197,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -8891,7 +8960,7 @@ components:
|
|||
enum:
|
||||
- model
|
||||
- shield
|
||||
- vector_store
|
||||
- vector_db
|
||||
- dataset
|
||||
- scoring_function
|
||||
- benchmark
|
||||
|
@ -10077,9 +10146,9 @@ tags:
|
|||
- `background`
|
||||
x-displayName: Agents
|
||||
- name: Conversations
|
||||
description: >-
|
||||
description: ''
|
||||
x-displayName: >-
|
||||
Protocol for conversation management operations.
|
||||
x-displayName: Conversations
|
||||
- name: Files
|
||||
description: >-
|
||||
This API is used to upload documents that can be used with other Llama Stack
|
||||
|
@ -10091,16 +10160,13 @@ tags:
|
|||
embeddings.
|
||||
|
||||
|
||||
This API provides the raw interface to the underlying models. Three kinds of
|
||||
models are supported:
|
||||
This API provides the raw interface to the underlying models. Two kinds of models
|
||||
are supported:
|
||||
|
||||
- LLM models: these models generate "raw" and "chat" (conversational) completions.
|
||||
|
||||
- Embedding models: these models generate embeddings to be used for semantic
|
||||
search.
|
||||
|
||||
- Rerank models: these models reorder the documents based on their relevance
|
||||
to a query.
|
||||
x-displayName: Inference
|
||||
- name: Inspect
|
||||
description: >-
|
||||
|
|
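The llama-stack-spec hunks above also reword the Conversations endpoints (create, retrieve, update, delete, list items) and change their list-items query parameters (after, include, limit, order). A rough sketch of exercising them over plain HTTP follows; only POST /v1/conversations is confirmed by the notebook's httpx log earlier in this diff, so the item-listing path, parameter handling, and response fields are assumptions rather than documented behavior.

import requests

BASE = "http://localhost:8321/v1"

# Create a conversation; this POST appears in the notebook's request log above.
conv = requests.post(f"{BASE}/conversations", json={"metadata": {"topic": "demo"}}).json()
conv_id = conv.get("id")  # field name assumed from the OpenAI-style shape of the API

# List items in the conversation; the path and parameters below are assumptions
# based on the summaries and query parameters shown in the spec diff.
items = requests.get(
    f"{BASE}/conversations/{conv_id}/items",
    params={"limit": 20, "order": "desc"},
).json()
print(items)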
146 docs/static/stainless-llama-stack-spec.html vendored
|
@ -282,7 +282,7 @@
|
|||
"Conversations"
|
||||
],
|
||||
"summary": "Create a conversation.",
|
||||
"description": "Create a conversation.\nCreate a conversation.",
|
||||
"description": "Create a conversation.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -326,8 +326,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Retrieve a conversation.",
|
||||
"description": "Retrieve a conversation.\nGet a conversation with the given ID.",
|
||||
"summary": "Get a conversation with the given ID.",
|
||||
"description": "Get a conversation with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -369,8 +369,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Update a conversation.",
|
||||
"description": "Update a conversation.\nUpdate a conversation's metadata with the given ID.",
|
||||
"summary": "Update a conversation's metadata with the given ID.",
|
||||
"description": "Update a conversation's metadata with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -422,8 +422,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Delete a conversation.",
|
||||
"description": "Delete a conversation.\nDelete a conversation with the given ID.",
|
||||
"summary": "Delete a conversation with the given ID.",
|
||||
"description": "Delete a conversation with the given ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -467,8 +467,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "List items.",
|
||||
"description": "List items.\nList items in the conversation.",
|
||||
"summary": "List items in the conversation.",
|
||||
"description": "List items in the conversation.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -483,53 +483,86 @@
|
|||
"name": "after",
|
||||
"in": "query",
|
||||
"description": "An item ID to list items after, used in pagination.",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "include",
|
||||
"in": "query",
|
||||
"description": "Specify additional output data to include in the response.",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"web_search_call.action.sources",
|
||||
"code_interpreter_call.outputs",
|
||||
"computer_call_output.output.image_url",
|
||||
"file_search_call.results",
|
||||
"message.input_image.image_url",
|
||||
"message.output_text.logprobs",
|
||||
"reasoning.encrypted_content"
|
||||
],
|
||||
"title": "ConversationItemInclude",
|
||||
"description": "Specify additional output data to include in the model response."
|
||||
}
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"code_interpreter_call.outputs",
|
||||
"computer_call_output.output.image_url",
|
||||
"file_search_call.results",
|
||||
"message.input_image.image_url",
|
||||
"message.output_text.logprobs",
|
||||
"reasoning.encrypted_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "limit",
|
||||
"in": "query",
|
||||
"description": "A limit on the number of objects to be returned (1-100, default 20).",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "integer"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "order",
|
||||
"in": "query",
|
||||
"description": "The order to return items in (asc or desc, default desc).",
|
||||
"required": false,
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"asc",
|
||||
"desc"
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"asc",
|
||||
"desc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "NotGiven",
|
||||
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
@ -564,8 +597,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Create items.",
|
||||
"description": "Create items.\nCreate items in the conversation.",
|
||||
"summary": "Create items in the conversation.",
|
||||
"description": "Create items in the conversation.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -619,8 +652,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Retrieve an item.",
|
||||
"description": "Retrieve an item.\nRetrieve a conversation item.",
|
||||
"summary": "Retrieve a conversation item.",
|
||||
"description": "Retrieve a conversation item.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -671,8 +704,8 @@
|
|||
"tags": [
|
||||
"Conversations"
|
||||
],
|
||||
"summary": "Delete an item.",
|
||||
"description": "Delete an item.\nDelete a conversation item.",
|
||||
"summary": "Delete a conversation item.",
|
||||
"description": "Delete a conversation item.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "conversation_id",
|
||||
|
@ -8439,7 +8472,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -8498,8 +8531,7 @@
|
|||
"type": "string",
|
||||
"enum": [
|
||||
"llm",
|
||||
"embedding",
|
||||
"rerank"
|
||||
"embedding"
|
||||
],
|
||||
"title": "ModelType",
|
||||
"description": "Enumeration of supported model types in Llama Stack."
|
||||
|
@ -9240,10 +9272,6 @@
|
|||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
},
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@ -9792,10 +9820,6 @@
|
|||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string",
|
||||
"description": "(Optional) System message inserted into the model's context"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -11845,7 +11869,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -12327,7 +12351,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -13380,7 +13404,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -14927,7 +14951,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -16672,7 +16696,7 @@
|
|||
"enum": [
|
||||
"model",
|
||||
"shield",
|
||||
"vector_store",
|
||||
"vector_db",
|
||||
"dataset",
|
||||
"scoring_function",
|
||||
"benchmark",
|
||||
|
@ -17904,8 +17928,8 @@
|
|||
},
|
||||
{
|
||||
"name": "Conversations",
|
||||
"description": "Protocol for conversation management operations.",
|
||||
"x-displayName": "Conversations"
|
||||
"description": "",
|
||||
"x-displayName": "Protocol for conversation management operations."
|
||||
},
|
||||
{
|
||||
"name": "DatasetIO",
|
||||
|
@ -17917,8 +17941,8 @@
|
|||
},
|
||||
{
|
||||
"name": "Eval",
|
||||
"description": "Llama Stack Evaluation API for running evaluations on model and agent candidates.",
|
||||
"x-displayName": "Evaluations"
|
||||
"description": "",
|
||||
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
},
|
||||
{
|
||||
"name": "Files",
|
||||
|
@ -17927,7 +17951,7 @@
|
|||
},
|
||||
{
|
||||
"name": "Inference",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
|
||||
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
|
||||
"x-displayName": "Inference"
|
||||
},
|
||||
{
|
||||
|
|
236 docs/static/stainless-llama-stack-spec.yaml vendored
|
@ -195,10 +195,7 @@ paths:
|
|||
tags:
|
||||
- Conversations
|
||||
summary: Create a conversation.
|
||||
description: >-
|
||||
Create a conversation.
|
||||
|
||||
Create a conversation.
|
||||
description: Create a conversation.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
|
@ -228,11 +225,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Retrieve a conversation.
|
||||
description: >-
|
||||
Retrieve a conversation.
|
||||
|
||||
Get a conversation with the given ID.
|
||||
summary: Get a conversation with the given ID.
|
||||
description: Get a conversation with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -261,10 +255,9 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Update a conversation.
|
||||
summary: >-
|
||||
Update a conversation's metadata with the given ID.
|
||||
description: >-
|
||||
Update a conversation.
|
||||
|
||||
Update a conversation's metadata with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
|
@ -300,11 +293,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: Delete a conversation.
|
||||
description: >-
|
||||
Delete a conversation.
|
||||
|
||||
Delete a conversation with the given ID.
|
||||
summary: Delete a conversation with the given ID.
|
||||
description: Delete a conversation with the given ID.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -334,11 +324,8 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Conversations
|
||||
summary: List items.
|
||||
description: >-
|
||||
List items.
|
||||
|
||||
List items in the conversation.
|
||||
summary: List items in the conversation.
|
||||
description: List items in the conversation.
|
||||
parameters:
|
||||
- name: conversation_id
|
||||
in: path
|
||||
|
@ -350,46 +337,146 @@ paths:
|
|||
in: query
|
||||
description: >-
|
||||
An item ID to list items after, used in pagination.
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: include
|
||||
in: query
|
||||
description: >-
|
||||
Specify additional output data to include in the response.
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
enum:
|
||||
- web_search_call.action.sources
|
||||
- code_interpreter_call.outputs
|
||||
- computer_call_output.output.image_url
|
||||
- file_search_call.results
|
||||
- message.input_image.image_url
|
||||
- message.output_text.logprobs
|
||||
- reasoning.encrypted_content
|
||||
title: ConversationItemInclude
|
||||
description: >-
|
||||
Specify additional output data to include in the model response.
|
||||
oneOf:
|
||||
- type: array
|
||||
items:
|
||||
type: string
|
||||
enum:
|
||||
- code_interpreter_call.outputs
|
||||
- computer_call_output.output.image_url
|
||||
- file_search_call.results
|
||||
- message.input_image.image_url
|
||||
- message.output_text.logprobs
|
||||
- reasoning.encrypted_content
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: limit
|
||||
in: query
|
||||
description: >-
|
||||
A limit on the number of objects to be returned (1-100, default 20).
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
oneOf:
|
||||
- type: integer
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
- name: order
|
||||
in: query
|
||||
description: >-
|
||||
The order to return items in (asc or desc, default desc).
|
||||
required: false
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
enum:
|
||||
- asc
|
||||
- desc
|
||||
oneOf:
|
||||
- type: string
|
||||
enum:
|
||||
- asc
|
||||
- desc
|
||||
- type: object
|
||||
title: NotGiven
|
||||
description: >-
|
||||
A sentinel singleton class used to distinguish omitted keyword arguments
|
||||
from those passed in with the value None (which may have different
|
||||
behavior).
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
```py
|
||||
|
||||
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
|
||||
...
|
||||
|
||||
|
||||
|
||||
get(timeout=1) # 1s timeout
|
||||
|
||||
get(timeout=None) # No timeout
|
||||
|
||||
get() # Default timeout behavior, which may not be statically known
|
||||
at the method definition.
|
||||
|
||||
```
|
||||
deprecated: false
|
||||
post:
|
||||
responses:
|
||||
|
@@ -411,11 +498,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Conversations
summary: Create items.
description: >-
Create items.

Create items in the conversation.
summary: Create items in the conversation.
description: Create items in the conversation.
parameters:
- name: conversation_id
in: path

@@ -451,11 +535,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Conversations
summary: Retrieve an item.
description: >-
Retrieve an item.

Retrieve a conversation item.
summary: Retrieve a conversation item.
description: Retrieve a conversation item.
parameters:
- name: conversation_id
in: path

@@ -490,11 +571,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Conversations
summary: Delete an item.
description: >-
Delete an item.

Delete a conversation item.
summary: Delete a conversation item.
description: Delete a conversation item.
parameters:
- name: conversation_id
in: path

@@ -6340,7 +6418,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -6382,7 +6460,6 @@ components:
enum:
- llm
- embedding
- rerank
title: ModelType
description: >-
Enumeration of supported model types in Llama Stack.

@@ -6929,10 +7006,6 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
input:
type: array
items:

@@ -7336,10 +7409,6 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
additionalProperties: false
required:
- created_at

@@ -9033,7 +9102,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -9341,7 +9410,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -10104,7 +10173,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -11226,7 +11295,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -12553,7 +12622,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark

@@ -13464,17 +13533,17 @@ tags:
- name: Benchmarks
description: ''
- name: Conversations
description: >-
description: ''
x-displayName: >-
Protocol for conversation management operations.
x-displayName: Conversations
- name: DatasetIO
description: ''
- name: Datasets
description: ''
- name: Eval
description: >-
description: ''
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
x-displayName: Evaluations
- name: Files
description: >-
This API is used to upload documents that can be used with other Llama Stack

@@ -13486,16 +13555,13 @@ tags:
embeddings.

This API provides the raw interface to the underlying models. Three kinds of
models are supported:
This API provides the raw interface to the underlying models. Two kinds of models
are supported:

- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic
search.
- Rerank models: these models reorder the documents based on their relevance
to a query.
x-displayName: Inference
- name: Inspect
description: >-
@@ -78,14 +78,17 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next

## Build, Configure, and Run Llama Stack

1. **Install dependencies**:
1. **Build the Llama Stack**:
Build the Llama Stack using the `starter` template:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
uv run --with llama-stack llama stack build --distro starter --image-type venv
```

2. **Start the distribution**:
**Expected Output:**
```bash
llama stack run starter
...
Build Successful!
You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter
```

3. **Set the ENV variables by exporting them to the terminal**:
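A hedged aside on the build-and-run steps above (not part of the diff): once `llama stack run starter` is serving, a quick smoke test can confirm the distribution is reachable. The base URL, port, and `/v1/models` path below are assumptions based on common Llama Stack defaults; adjust them to your configuration.

```python
# Minimal smoke test: list the models exposed by a locally running distribution.
# Assumes the server listens on http://localhost:8321 (adjust if yours differs).
import json
import urllib.request

BASE_URL = "http://localhost:8321"

with urllib.request.urlopen(f"{BASE_URL}/v1/models") as resp:
    payload = json.loads(resp.read())

for model in payload.get("data", []):
    print(model.get("identifier"), "-", model.get("model_type"))
```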
@@ -545,7 +545,6 @@ class OpenAIResponseObject(BaseModel):
:param tools: (Optional) An array of tools the model may call while generating a response.
:param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context
"""

created_at: int

@@ -565,7 +564,6 @@ class OpenAIResponseObject(BaseModel):
tools: list[OpenAIResponseTool] | None = None
truncation: str | None = None
usage: OpenAIResponseUsage | None = None
instructions: str | None = None

@json_schema_type

@@ -4,9 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from enum import StrEnum
from typing import Annotated, Literal, Protocol, runtime_checkable

from openai import NOT_GIVEN
from openai._types import NotGiven
from openai.types.responses.response_includable import ResponseIncludable
from pydantic import BaseModel, Field

from llama_stack.apis.agents.openai_responses import (

@@ -148,20 +150,6 @@ class ConversationItemCreateRequest(BaseModel):
)

class ConversationItemInclude(StrEnum):
"""
Specify additional output data to include in the model response.
"""

web_search_call_action_sources = "web_search_call.action.sources"
code_interpreter_call_outputs = "code_interpreter_call.outputs"
computer_call_output_output_image_url = "computer_call_output.output.image_url"
file_search_call_results = "file_search_call.results"
message_input_image_image_url = "message.input_image.image_url"
message_output_text_logprobs = "message.output_text.logprobs"
reasoning_encrypted_content = "reasoning.encrypted_content"

@json_schema_type
class ConversationItemList(BaseModel):
"""List of conversation items with pagination."""

@@ -185,9 +173,7 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable
@trace_protocol
class Conversations(Protocol):
"""Conversations

Protocol for conversation management operations."""
"""Protocol for conversation management operations."""

@webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
async def create_conversation(

@@ -195,8 +181,6 @@ class Conversations(Protocol):
) -> Conversation:
"""Create a conversation.

Create a conversation.

:param items: Initial items to include in the conversation context.
:param metadata: Set of key-value pairs that can be attached to an object.
:returns: The created conversation object.
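A hedged aside: per the `@webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)` decorator above, creating a conversation maps to an HTTP POST. The sketch below is illustrative only; the base URL, the `/v1` prefix, and the exact request body shape are assumptions inferred from the `create_conversation(items, metadata)` signature, not taken from the diff.

```python
# Hypothetical request shape inferred from create_conversation(items, metadata).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8321/v1/conversations",  # assumed host, port, and prefix
    data=json.dumps({"items": [], "metadata": {"topic": "demo"}}).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    conversation = json.loads(resp.read())
print(conversation.get("id"))
```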
@@ -205,9 +189,7 @@ class Conversations(Protocol):

@webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_conversation(self, conversation_id: str) -> Conversation:
"""Retrieve a conversation.

Get a conversation with the given ID.
"""Get a conversation with the given ID.

:param conversation_id: The conversation identifier.
:returns: The conversation object.

@@ -216,9 +198,7 @@ class Conversations(Protocol):

@webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
"""Update a conversation.

Update a conversation's metadata with the given ID.
"""Update a conversation's metadata with the given ID.

:param conversation_id: The conversation identifier.
:param metadata: Set of key-value pairs that can be attached to an object.

@@ -228,9 +208,7 @@ class Conversations(Protocol):

@webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
"""Delete a conversation.

Delete a conversation with the given ID.
"""Delete a conversation with the given ID.

:param conversation_id: The conversation identifier.
:returns: The deleted conversation resource.

@@ -239,9 +217,7 @@ class Conversations(Protocol):

@webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
"""Create items.

Create items in the conversation.
"""Create items in the conversation.

:param conversation_id: The conversation identifier.
:param items: Items to include in the conversation context.

@@ -251,9 +227,7 @@ class Conversations(Protocol):

@webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
"""Retrieve an item.

Retrieve a conversation item.
"""Retrieve a conversation item.

:param conversation_id: The conversation identifier.
:param item_id: The item identifier.

@@ -262,17 +236,15 @@ class Conversations(Protocol):
...

@webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1)
async def list_items(
async def list(
self,
conversation_id: str,
after: str | None = None,
include: list[ConversationItemInclude] | None = None,
limit: int | None = None,
order: Literal["asc", "desc"] | None = None,
after: str | NotGiven = NOT_GIVEN,
include: list[ResponseIncludable] | NotGiven = NOT_GIVEN,
limit: int | NotGiven = NOT_GIVEN,
order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
) -> ConversationItemList:
"""List items.

List items in the conversation.
"""List items in the conversation.

:param conversation_id: The conversation identifier.
:param after: An item ID to list items after, used in pagination.
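The hunk above swaps the list signature between plain optional defaults and the openai `NotGiven` sentinel. A hedged sketch of what that difference means for whoever resolves the arguments; the resolver functions and the default of 20 (quoted from the spec text earlier in this diff) are illustrative, not the actual routing-layer code.

```python
from openai import NOT_GIVEN
from openai._types import NotGiven

DEFAULT_LIMIT = 20  # "1-100, default 20" per the spec text earlier in the diff


def resolve_limit_optional(limit: int | None = None) -> int:
    # With plain Optional defaults, an omitted argument and an explicit None
    # are indistinguishable.
    return DEFAULT_LIMIT if limit is None else limit


def resolve_limit_sentinel(limit: int | NotGiven = NOT_GIVEN) -> int:
    # With the sentinel, omission is a distinct case from any real value.
    return DEFAULT_LIMIT if isinstance(limit, NotGiven) else limit


assert resolve_limit_optional() == 20
assert resolve_limit_sentinel(limit=5) == 5
```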
@@ -287,9 +259,7 @@ class Conversations(Protocol):
async def openai_delete_conversation_item(
self, conversation_id: str, item_id: str
) -> ConversationItemDeletedResource:
"""Delete an item.

Delete a conversation item.
"""Delete a conversation item.

:param conversation_id: The conversation identifier.
:param item_id: The item identifier.

@@ -121,7 +121,6 @@ class Api(Enum, metaclass=DynamicApiMeta):

models = "models"
shields = "shields"
vector_stores = "vector_stores" # only used for routing table
datasets = "datasets"
scoring_functions = "scoring_functions"
benchmarks = "benchmarks"

@@ -82,9 +82,7 @@ class EvaluateResponse(BaseModel):

class Eval(Protocol):
"""Evaluations

Llama Stack Evaluation API for running evaluations on model and agent candidates."""
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""

@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)

@@ -1234,10 +1234,9 @@ class Inference(InferenceProvider):

Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Three kinds of models are supported:
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query.
"""

@webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)

@@ -27,12 +27,10 @@ class ModelType(StrEnum):
"""Enumeration of supported model types in Llama Stack.
:cvar llm: Large language model for text generation and completion
:cvar embedding: Embedding model for converting text to vector representations
:cvar rerank: Reranking model for reordering documents based on their relevance to a query
"""

llm = "llm"
embedding = "embedding"
rerank = "rerank"

@json_schema_type
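A small grounding note on the `ModelType` hunk above: `StrEnum` members (Python 3.11+) are themselves strings, which is what lets registries and request payloads compare them directly against plain string values. A minimal sketch:

```python
from enum import StrEnum  # requires Python 3.11+


class ModelType(StrEnum):
    llm = "llm"
    embedding = "embedding"
    rerank = "rerank"


assert ModelType.rerank == "rerank"                    # members compare equal to their values
assert ModelType("embedding") is ModelType.embedding   # lookup by value returns the member
print([m.value for m in ModelType])                    # ['llm', 'embedding', 'rerank']
```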
@@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
class ResourceType(StrEnum):
model = "model"
shield = "shield"
vector_store = "vector_store"
vector_db = "vector_db"
dataset = "dataset"
scoring_function = "scoring_function"
benchmark = "benchmark"

@@ -34,4 +34,4 @@ class Resource(BaseModel):

provider_id: str = Field(description="ID of the provider that owns this resource")

type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)")

@@ -3,3 +3,5 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .vector_dbs import *

@@ -9,43 +9,53 @@ from typing import Literal
from pydantic import BaseModel

from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.schema_utils import json_schema_type


# Internal resource type for storing the vector store routing and other information
class VectorStore(Resource):
@json_schema_type
class VectorDB(Resource):
"""Vector database resource for storing and querying vector embeddings.

:param type: Type of resource, always 'vector_store' for vector stores
:param type: Type of resource, always 'vector_db' for vector databases
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
"""

type: Literal[ResourceType.vector_store] = ResourceType.vector_store
type: Literal[ResourceType.vector_db] = ResourceType.vector_db

embedding_model: str
embedding_dimension: int
vector_store_name: str | None = None
vector_db_name: str | None = None

@property
def vector_store_id(self) -> str:
def vector_db_id(self) -> str:
return self.identifier

@property
def provider_vector_store_id(self) -> str | None:
def provider_vector_db_id(self) -> str | None:
return self.provider_resource_id


class VectorStoreInput(BaseModel):
class VectorDBInput(BaseModel):
"""Input parameters for creating or configuring a vector database.

:param vector_store_id: Unique identifier for the vector store
:param vector_db_id: Unique identifier for the vector database
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
:param provider_vector_store_id: (Optional) Provider-specific identifier for the vector store
:param provider_vector_db_id: (Optional) Provider-specific identifier for the vector database
"""

vector_store_id: str
vector_db_id: str
embedding_model: str
embedding_dimension: int
provider_id: str | None = None
provider_vector_store_id: str | None = None
provider_vector_db_id: str | None = None


class ListVectorDBsResponse(BaseModel):
"""Response from listing vector databases.

:param data: List of vector databases
"""

data: list[VectorDB]
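A hedged aside on the `VectorDBInput` model above: its fields are fully visible in the hunk, so a self-contained mirror of the definition can show how a registration payload would be constructed. The embedding model name and dimension below are placeholder values, not recommendations, and the class here is a local stand-in rather than an import from llama_stack.

```python
from pydantic import BaseModel  # pydantic v2 assumed


class VectorDBInput(BaseModel):
    # Mirrors the fields shown in the diff above.
    vector_db_id: str
    embedding_model: str
    embedding_dimension: int
    provider_id: str | None = None
    provider_vector_db_id: str | None = None


vdb = VectorDBInput(
    vector_db_id="my-documents",
    embedding_model="all-MiniLM-L6-v2",  # placeholder embedding model name
    embedding_dimension=384,             # placeholder dimension
)
print(vdb.model_dump(exclude_none=True))
```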
@@ -15,7 +15,7 @@ from fastapi import Body
from pydantic import BaseModel, Field

from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id

@@ -140,7 +140,6 @@ class VectorStoreFileCounts(BaseModel):
total: int


# TODO: rename this as OpenAIVectorStore
@json_schema_type
class VectorStoreObject(BaseModel):
"""OpenAI Vector Store object.

@@ -518,18 +517,17 @@ class OpenAICreateVectorStoreFileBatchRequestWithExtraBody(BaseModel, extra="all
chunking_strategy: VectorStoreChunkingStrategy | None = None


class VectorStoreTable(Protocol):
def get_vector_store(self, vector_store_id: str) -> VectorStore | None: ...
class VectorDBStore(Protocol):
def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...


@runtime_checkable
@trace_protocol
class VectorIO(Protocol):
vector_store_table: VectorStoreTable | None = None
vector_db_store: VectorDBStore | None = None

# this will just block now until chunks are inserted, but it should
# probably return a Job instance which can be polled for completion
# TODO: rename vector_db_id to vector_store_id once Stainless is working
@webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
async def insert_chunks(
self,
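A hedged sketch of how the `VectorDBStore` Protocol in the hunk above is satisfied: Python protocols type-check structurally, so any object exposing a matching `get_vector_db` method works, with no inheritance required. The `VectorDB` stand-in and the in-memory registry below are illustrative only.

```python
from typing import Protocol


class VectorDB:  # local stand-in for llama_stack.apis.vector_dbs.VectorDB
    def __init__(self, identifier: str) -> None:
        self.identifier = identifier


class VectorDBStore(Protocol):
    def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...


class InMemoryStore:
    """Toy registry backed by a dict; illustrative only."""

    def __init__(self) -> None:
        self._dbs: dict[str, VectorDB] = {}

    def register(self, db: VectorDB) -> None:
        self._dbs[db.identifier] = db

    def get_vector_db(self, vector_db_id: str) -> VectorDB | None:
        return self._dbs.get(vector_db_id)


store: VectorDBStore = InMemoryStore()  # accepted structurally by type checkers
```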
@@ -548,7 +546,6 @@ class VectorIO(Protocol):
"""
...

# TODO: rename vector_db_id to vector_store_id once Stainless is working
@webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
async def query_chunks(
self,

@@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .vector_stores import *
Some files were not shown because too many files have changed in this diff.