Compare commits

..

1 commit

Author SHA1 Message Date
github-actions[bot] 3b3976f081 Release candidate 0.3.0rc4 2025-10-20 21:58:12 +00:00
177 changed files with 5066 additions and 53159 deletions


@@ -86,9 +86,10 @@ runs:
if: ${{ always() }}
shell: bash
run: |
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
distro_name=$(echo "${{ inputs.stack-config }}" | sed 's/^docker://' | sed 's/^server://')
stack_container_name="llama-stack-test-$distro_name"
sudo docker logs $stack_container_name > docker-${distro_name}-${{ inputs.inference-mode }}.log || true
- name: Upload logs
if: ${{ always() }}
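
As an aside, the container-name derivation in the log-dump step above (the `sed` pipeline) can be sketched in plain Python; the helper below is hypothetical and only mirrors the prefix-stripping logic, it is not part of the workflow:

```python
# Hypothetical mirror of: echo "$cfg" | sed 's/^docker://' | sed 's/^server://'
def stack_container_name(stack_config: str) -> str:
    distro_name = stack_config
    for prefix in ("docker:", "server:"):
        # Each sed expression strips its prefix only when present.
        if distro_name.startswith(prefix):
            distro_name = distro_name[len(prefix):]
    return f"llama-stack-test-{distro_name}"

assert stack_container_name("docker:starter") == "llama-stack-test-starter"
assert stack_container_name("server:starter") == "llama-stack-test-starter"
```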


@@ -47,7 +47,7 @@ jobs:
strategy:
fail-fast: false
matrix:
client-type: [library, docker]
client-type: [library, server, docker]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
@@ -61,7 +61,7 @@ jobs:
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}, {"setup": "gpt", "suite": "responses"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}
steps:


@@ -37,7 +37,7 @@ jobs:
.pre-commit-config.yaml
- name: Set up Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'


@@ -99,7 +99,7 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});
- name: Checkout PR branch (same-repo)
@@ -141,7 +141,7 @@ jobs:
- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'


@@ -36,7 +36,7 @@ jobs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate Distribution List
id: set-matrix
@@ -55,7 +55,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
@@ -79,7 +79,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
@@ -92,7 +92,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner


@@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv
uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
uses: astral-sh/setup-uv@eb1897b8dc4b5d5bfe39a428a8f2304605e0983c # v7.0.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true


@@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'


@@ -27,24 +27,28 @@ providers:
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
table_name: files_metadata
backend: sql_default
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
table_name: files_metadata
backend: sql_default
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@@ -54,15 +58,20 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
@@ -103,45 +112,32 @@ storage:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
references:
metadata:
namespace: registry
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
registered_resources:
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8323
telemetry:
enabled: true
vector_stores:
default_provider_id: chromadb
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
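
Throughout these configs, `${env.VAR:=default}` resolves an environment variable with a fallback default (and `${env.CHROMADB_URL:=}` falls back to an empty string). A rough Python sketch of that substitution rule, stated as an assumption about the semantics rather than the stack's actual resolver:

```python
import os
import re

# Hypothetical resolver for ${env.VAR:=default} placeholders.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+):=([^}]*)\}")

def resolve_env_defaults(value: str) -> str:
    # Use the environment variable when set, otherwise the inline default.
    return _PLACEHOLDER.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)

print(resolve_env_defaults("${env.POSTGRES_HOST:=localhost}"))  # "localhost" unless overridden
print(resolve_env_defaults("${env.CHROMADB_URL:=}"))            # empty string unless set
```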


@@ -208,6 +208,19 @@ resources:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
inspect:
models:
healthInfo: HealthInfo
@@ -508,21 +521,6 @@ resources:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT


@@ -350,46 +350,146 @@ paths:
in: query
description: >-
An item ID to list items after, used in pagination.
required: false
required: true
schema:
type: string
oneOf:
- type: string
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: include
in: query
description: >-
Specify additional output data to include in the response.
required: false
required: true
schema:
type: array
items:
type: string
enum:
- web_search_call.action.sources
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
title: ConversationItemInclude
description: >-
Specify additional output data to include in the model response.
oneOf:
- type: array
items:
type: string
enum:
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: limit
in: query
description: >-
A limit on the number of objects to be returned (1-100, default 20).
required: false
required: true
schema:
type: integer
oneOf:
- type: integer
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: order
in: query
description: >-
The order to return items in (asc or desc, default desc).
required: false
required: true
schema:
type: string
enum:
- asc
- desc
oneOf:
- type: string
enum:
- asc
- desc
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
deprecated: false
post:
responses:
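
The `NotGiven` schema these parameters now accept describes a common client-library pattern: a sentinel that distinguishes an omitted keyword argument from an explicit `None`. A minimal Python sketch of the idea (hypothetical, not the generated client's actual class):

```python
from typing import Union


class NotGiven:
    """Sentinel singleton: the caller did not pass this argument at all."""

    _instance = None

    def __new__(cls) -> "NotGiven":
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __bool__(self) -> bool:
        return False


NOT_GIVEN = NotGiven()


def get(timeout: Union[int, "NotGiven", None] = NOT_GIVEN) -> str:
    if isinstance(timeout, NotGiven):
        return "default timeout behavior"  # argument omitted entirely
    if timeout is None:
        return "no timeout"  # explicitly disabled
    return f"{timeout}s timeout"


print(get(), get(timeout=None), get(timeout=1), sep=" | ")
```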
@@ -6340,7 +6440,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -6382,7 +6482,6 @@ components:
enum:
- llm
- embedding
- rerank
title: ModelType
description: >-
Enumeration of supported model types in Llama Stack.
@@ -6443,10 +6542,11 @@ components:
model:
type: string
description: >-
(Optional) The content moderation model you would like to use.
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
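
Per the schema change above, `model` joins `input` as a required field of `RunModerationRequest`. A hedged usage sketch against an OpenAI-compatible endpoint (the base URL matches the demo elsewhere in this diff; the model id is an illustrative assumption):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

# `model` must now be passed explicitly; it is no longer optional.
result = client.moderations.create(
    model="llama-guard",  # hypothetical moderation model id
    input="How do you do great work?",
)
print(result)
```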
@@ -9032,7 +9132,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -9340,7 +9440,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -10103,7 +10203,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -11225,7 +11325,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -12552,7 +12652,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -13485,16 +13585,13 @@ tags:
embeddings.
This API provides the raw interface to the underlying models. Three kinds of
models are supported:
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic
search.
- Rerank models: these models reorder the documents based on their relevance
to a query.
x-displayName: Inference
- name: Inspect
description: >-


@@ -45,7 +45,7 @@ RUN set -eux; \
exit 1; \
fi
RUN pip install --no-cache uv
RUN pip install --no-cache-dir uv
ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE}
@@ -68,7 +68,7 @@ RUN set -eux; \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \
fi; \
uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
fi;
# Install llama-stack
@@ -78,19 +78,19 @@ RUN set -eux; \
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
exit 1; \
fi; \
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache fastapi libcst; \
uv pip install --no-cache-dir fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \
else \
if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
else \
uv pip install --no-cache llama-stack; \
uv pip install --no-cache-dir llama-stack; \
fi; \
fi;
@@ -102,7 +102,7 @@ RUN set -eux; \
fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
fi
# Cleanup
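
For reference, the dependency-installation step above pipes each line of `llama stack list-deps` into `uv pip install`; the Python below is only a sketch of that loop under the same flags, not the image's actual build logic:

```python
import shlex
import subprocess

# Sketch of: deps="$(llama stack list-deps "$DISTRO_NAME")";
#            printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir
def install_distro_deps(distro_name: str) -> None:
    deps = subprocess.run(
        ["llama", "stack", "list-deps", distro_name],
        check=True, capture_output=True, text=True,
    ).stdout
    for line in deps.splitlines():
        if line.strip():
            # xargs -L1: one install invocation per non-empty line of output.
            subprocess.run(
                ["uv", "pip", "install", "--no-cache-dir", *shlex.split(line)],
                check=True,
            )
```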


@@ -19,7 +19,6 @@ Browse that folder to understand available providers and copy a distribution to
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
<Tabs>
<TabItem value="container" label="Building a container">


@@ -32,17 +32,21 @@ providers:
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
table_name: files_metadata
backend: sql_default
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@@ -52,15 +56,20 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
@@ -101,53 +110,40 @@ storage:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
references:
metadata:
namespace: registry
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
registered_resources:
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
telemetry:
enabled: true
vector_stores:
default_provider_id: chromadb
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5


@@ -4,24 +4,65 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
import io, requests
from openai import OpenAI
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
models = client.models.list()
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
resp = client.responses.create(
model="openai/gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
)
print(resp)
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)


@@ -35,51 +35,103 @@ OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run star
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python
import io, requests
from openai import OpenAI
```python title="demo_script.py"
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
resp = client.responses.create(
model="openai/gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
models = client.models.list()
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
)
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)
```
We will use `uv` to run the script:
```
uv run --with llama-stack-client,fire,requests demo_script.py
```
You should see output like the following.
```python
>print(resp.output[1].content[0].text)
To do great work, consider the following principles:
1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.
2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.
3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.
4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It's essential to care for your physical health as well since it directly impacts your mental faculties and morale.
5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.
6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.
By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
:::tip HuggingFace access


@@ -3,10 +3,9 @@ description: "Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Three kinds of models are supported:
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query."
- Embedding models: these models generate embeddings to be used for semantic search."
sidebar_label: Inference
title: Inference
---
@@ -19,9 +18,8 @@ Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Three kinds of models are supported:
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query.
This section contains documentation for all available providers for the **inference** API.


@@ -32,6 +32,7 @@ Commands:
scoring_functions Manage scoring functions.
shields Manage safety shield services.
toolgroups Manage available tool groups.
vector_dbs Manage vector databases.
```
### `llama-stack-client configure`
@@ -210,6 +211,53 @@ Unregister a model from distribution endpoint
llama-stack-client models unregister <model_id>
```
## Vector DB Management
Manage vector databases.
### `llama-stack-client vector_dbs list`
Show available vector dbs on distribution endpoint
```bash
llama-stack-client vector_dbs list
```
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 768 │
│ │ │ │ │ embedding_model: nomic-embed-text-v1.5 │
│ │ │ │ │ type: vector_db │
│ │ │ │ │ │
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
```
### `llama-stack-client vector_dbs register`
Create a new vector db
```bash
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
Optional arguments:
- `--provider-id`: Provider ID for the vector db
- `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: `nomic-embed-text-v1.5`
- `--embedding-dimension`: Dimension of embeddings. Default: 768
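
The same registration can be driven from Python; this sketch reuses the `client.vector_dbs.register` call shown in the demo script earlier in this diff, filled in with the CLI's documented defaults:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Python equivalent of `llama-stack-client vector_dbs register`, using the
# documented defaults: embedding model nomic-embed-text-v1.5, dimension 768.
client.vector_dbs.register(
    vector_db_id="my_demo_vector_db",
    embedding_model="nomic-embed-text-v1.5",
    embedding_dimension=768,
    provider_id="faiss",
)
```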
### `llama-stack-client vector_dbs unregister`
Delete a vector db
```bash
llama-stack-client vector_dbs unregister <vector-db-id>
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
## Shield Management
Manage safety shield services.
### `llama-stack-client shields list`

File diff suppressed because one or more lines are too long


@@ -126,31 +126,17 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m52 packages\u001b[0m \u001b[2min 1.56s\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 122ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 197ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 11ms\u001b[0m\u001b[0m\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
@@ -164,7 +150,7 @@
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\",\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
@@ -214,7 +200,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 7,
"id": "f779283d",
"metadata": {},
"outputs": [
@@ -222,8 +208,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Llama Stack server with PID: 20778\n",
"Waiting for server to start........\n",
"Starting Llama Stack server with PID: 787100\n",
"Waiting for server to start\n",
"Server is ready!\n"
]
}
@@ -243,84 +229,65 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"id": "7da71011",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/vector_stores \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/conversations \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/responses \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html\n",
"prompt> How do you do great work?\n",
"🤔 Doing great work involves a combination of skills, habits, and mindsets. Here are some key principles:\n",
"\n",
"1. **Set Clear Goals**: Start with a clear vision of what you want to achieve. Define specific, measurable, achievable, relevant, and time-bound (SMART) goals.\n",
"\n",
"2. **Plan and Prioritize**: Break your goals into smaller, manageable tasks. Prioritize these tasks based on their importance and urgency.\n",
"\n",
"3. **Focus on Quality**: Aim for high-quality outcomes rather than just finishing tasks. Pay attention to detail, and ensure your work meets or exceeds standards.\n",
"\n",
"4. **Stay Organized**: Keep your workspace, both physical and digital, organized to help you stay focused and efficient.\n",
"\n",
"5. **Manage Your Time**: Use time management techniques such as the Pomodoro Technique, time blocking, or the Eisenhower Box to maximize productivity.\n",
"\n",
"6. **Seek Feedback and Learn**: Regularly seek feedback from peers, mentors, or supervisors. Use constructive criticism to improve continuously.\n",
"\n",
"7. **Innovate and Improve**: Look for ways to improve processes or introduce new ideas. Be open to change and willing to adapt.\n",
"\n",
"8. **Stay Motivated and Persistent**: Keep your end goals in mind to stay motivated. Overcome setbacks with resilience and persistence.\n",
"\n",
"9. **Balance and Rest**: Ensure you maintain a healthy work-life balance. Take breaks and manage stress to sustain long-term productivity.\n",
"\n",
"10. **Reflect and Adjust**: Regularly assess your progress and adjust your strategies as needed. Reflect on what works well and what doesn't.\n",
"\n",
"By incorporating these elements, you can consistently produce high-quality work and achieve excellence in your endeavors.\n"
"\u001b[33minference> \u001b[0m\u001b[33m[k\u001b[0m\u001b[33mnowledge\u001b[0m\u001b[33m_search\u001b[0m\u001b[33m(query\u001b[0m\u001b[33m=\"\u001b[0m\u001b[33mWhat\u001b[0m\u001b[33m is\u001b[0m\u001b[33m the\u001b[0m\u001b[33m key\u001b[0m\u001b[33m to\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m\")]\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n', type='text'), TextContentItem(text=\"Result 1:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 2:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 3:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 4:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 5:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text='END of knowledge_search tool results.\\n', type='text'), TextContentItem(text='The above results were retrieved to help answer the user\\'s query: \"What is the key to doing great work\". Use them as supporting information only in answering this query.\\n', type='text')]\u001b[0m\n",
"\u001b[33minference> \u001b[0m\u001b[33mDoing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m means\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m something\u001b[0m\u001b[33m important\u001b[0m\u001b[33m so\u001b[0m\u001b[33m well\u001b[0m\u001b[33m that\u001b[0m\u001b[33m you\u001b[0m\u001b[33m expand\u001b[0m\u001b[33m people\u001b[0m\u001b[33m's\u001b[0m\u001b[33m ideas\u001b[0m\u001b[33m of\u001b[0m\u001b[33m what\u001b[0m\u001b[33m's\u001b[0m\u001b[33m possible\u001b[0m\u001b[33m.\u001b[0m\u001b[33m However\u001b[0m\u001b[33m,\u001b[0m\u001b[33m there\u001b[0m\u001b[33m's\u001b[0m\u001b[33m no\u001b[0m\u001b[33m threshold\u001b[0m\u001b[33m for\u001b[0m\u001b[33m importance\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m's\u001b[0m\u001b[33m often\u001b[0m\u001b[33m hard\u001b[0m\u001b[33m to\u001b[0m\u001b[33m judge\u001b[0m\u001b[33m at\u001b[0m\u001b[33m the\u001b[0m\u001b[33m time\u001b[0m\u001b[33m anyway\u001b[0m\u001b[33m.\u001b[0m\u001b[33m Great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m is\u001b[0m\u001b[33m a\u001b[0m\u001b[33m matter\u001b[0m\u001b[33m of\u001b[0m\u001b[33m degree\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m can\u001b[0m\u001b[33m be\u001b[0m\u001b[33m difficult\u001b[0m\u001b[33m to\u001b[0m\u001b[33m determine\u001b[0m\u001b[33m whether\u001b[0m\u001b[33m someone\u001b[0m\u001b[33m has\u001b[0m\u001b[33m done\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m until\u001b[0m\u001b[33m after\u001b[0m\u001b[33m the\u001b[0m\u001b[33m fact\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[30m\u001b[0m"
]
}
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"import requests\n",
"\n",
"vector_store_id = \"my_demo_vector_db\"\n",
"vector_db_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
"embedding_model_id = embedding_model.identifier\n",
"embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
"\n",
"\n",
"_ = client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",
" embedding_model=embedding_model_id,\n",
" embedding_dimension=embedding_dimension,\n",
" provider_id=\"faiss\",\n",
")\n",
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
"response = requests.get(source)\n",
"file = client.files.create(\n",
" file=response.content,\n",
" purpose='assistants'\n",
"print(\"rag_tool> Ingesting document:\", source)\n",
"document = RAGDocument(\n",
" document_id=\"document_1\",\n",
" content=source,\n",
" mime_type=\"text/html\",\n",
" metadata={},\n",
")\n",
"vector_store = client.vector_stores.create(\n",
" name=vector_store_id,\n",
" file_ids=[file.id],\n",
"client.tool_runtime.rag_tool.insert(\n",
" documents=[document],\n",
" vector_db_id=vector_db_id,\n",
" chunk_size_in_tokens=50,\n",
")\n",
"\n",
"agent = Agent(\n",
" client,\n",
" model=model_id,\n",
" instructions=\"You are a helpful assistant\",\n",
" tools=[\n",
" {\n",
" \"type\": \"file_search\",\n",
" \"vector_store_ids\": [vector_store_id],\n",
" \"name\": \"builtin::rag/knowledge_search\",\n",
" \"args\": {\"vector_db_ids\": [vector_db_id]},\n",
" }\n",
" ],\n",
")\n",
@@ -335,7 +302,7 @@
")\n",
"\n",
"for log in AgentEventLogger().log(response):\n",
" print(log, end=\"\")"
" log.print()"
]
},
{
@@ -377,7 +344,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -391,7 +358,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
"version": "3.10.6"
}
},
"nbformat": 4,


@@ -5547,7 +5547,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -5798,7 +5798,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -8185,12 +8185,13 @@
},
"model": {
"type": "string",
"description": "(Optional) The content moderation model you would like to use."
"description": "The content moderation model you would like to use."
}
},
"additionalProperties": false,
"required": [
"input"
"input",
"model"
],
"title": "RunModerationRequest"
},
@@ -13466,7 +13467,7 @@
},
{
"name": "Inference",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{


@@ -4114,7 +4114,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -4303,7 +4303,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -6104,10 +6104,11 @@ components:
model:
type: string
description: >-
(Optional) The content moderation model you would like to use.
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
@@ -10217,16 +10218,13 @@ tags:
embeddings.
This API provides the raw interface to the underlying models. Three kinds of
models are supported:
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic
search.
- Rerank models: these models reorder the documents based on their relevance
to a query.
x-displayName: Inference
- name: Models
description: ''


@@ -1850,7 +1850,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -3983,7 +3983,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",


@@ -1320,7 +1320,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -2927,7 +2927,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark


@@ -483,53 +483,86 @@
"name": "after",
"in": "query",
"description": "An item ID to list items after, used in pagination.",
"required": false,
"required": true,
"schema": {
"type": "string"
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "include",
"in": "query",
"description": "Specify additional output data to include in the response.",
"required": false,
"required": true,
"schema": {
"type": "array",
"items": {
"type": "string",
"enum": [
"web_search_call.action.sources",
"code_interpreter_call.outputs",
"computer_call_output.output.image_url",
"file_search_call.results",
"message.input_image.image_url",
"message.output_text.logprobs",
"reasoning.encrypted_content"
],
"title": "ConversationItemInclude",
"description": "Specify additional output data to include in the model response."
}
"oneOf": [
{
"type": "array",
"items": {
"type": "string",
"enum": [
"code_interpreter_call.outputs",
"computer_call_output.output.image_url",
"file_search_call.results",
"message.input_image.image_url",
"message.output_text.logprobs",
"reasoning.encrypted_content"
]
}
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "limit",
"in": "query",
"description": "A limit on the number of objects to be returned (1-100, default 20).",
"required": false,
"required": true,
"schema": {
"type": "integer"
"oneOf": [
{
"type": "integer"
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "order",
"in": "query",
"description": "The order to return items in (asc or desc, default desc).",
"required": false,
"required": true,
"schema": {
"type": "string",
"enum": [
"asc",
"desc"
"oneOf": [
{
"type": "string",
"enum": [
"asc",
"desc"
]
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
}
@@ -6767,7 +6800,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -6826,8 +6859,7 @@
"type": "string",
"enum": [
"llm",
"embedding",
"rerank"
"embedding"
],
"title": "ModelType",
"description": "Enumeration of supported model types in Llama Stack."
@@ -6919,12 +6951,13 @@
},
"model": {
"type": "string",
"description": "(Optional) The content moderation model you would like to use."
"description": "The content moderation model you would like to use."
}
},
"additionalProperties": false,
"required": [
"input"
"input",
"model"
],
"title": "RunModerationRequest"
},
@@ -10172,7 +10205,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -10654,7 +10687,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -11707,7 +11740,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@@ -13236,7 +13269,7 @@
},
{
"name": "Inference",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{


@@ -347,46 +347,146 @@ paths:
in: query
description: >-
An item ID to list items after, used in pagination.
required: false
required: true
schema:
type: string
oneOf:
- type: string
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: include
in: query
description: >-
Specify additional output data to include in the response.
required: false
required: true
schema:
type: array
items:
type: string
enum:
- web_search_call.action.sources
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
title: ConversationItemInclude
description: >-
Specify additional output data to include in the model response.
oneOf:
- type: array
items:
type: string
enum:
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: limit
in: query
description: >-
A limit on the number of objects to be returned (1-100, default 20).
required: false
required: true
schema:
type: integer
oneOf:
- type: integer
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: order
in: query
description: >-
The order to return items in (asc or desc, default desc).
required: false
required: true
schema:
type: string
enum:
- asc
- desc
oneOf:
- type: string
enum:
- asc
- desc
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
deprecated: false
post:
responses:
@@ -5127,7 +5227,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@@ -5169,7 +5269,6 @@ components:
enum:
- llm
- embedding
- rerank
title: ModelType
description: >-
Enumeration of supported model types in Llama Stack.
@@ -5230,10 +5329,11 @@ components:
model:
type: string
description: >-
(Optional) The content moderation model you would like to use.
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
@ -7819,7 +7919,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -8127,7 +8227,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -8890,7 +8990,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -10090,16 +10190,13 @@ tags:
embeddings.
This API provides the raw interface to the underlying models. Three kinds of
models are supported:
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic
search.
- Rerank models: these models reorder the documents based on their relevance
to a query.
x-displayName: Inference
- name: Inspect
description: >-
View file
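The `NotGiven` `oneOf` schemas repeated through the query parameters above all encode the same client-side sentinel: an argument that was omitted is distinct from one explicitly set to `None`. A minimal, self-contained sketch of the pattern (an illustration of the idea, not the openai package's actual implementation):

```py
class NotGiven:
    """Sentinel marking keyword arguments the caller never passed."""

    def __bool__(self) -> bool:
        return False

    def __repr__(self) -> str:
        return "NOT_GIVEN"


NOT_GIVEN = NotGiven()


def list_items(limit: int | NotGiven = NOT_GIVEN) -> dict:
    params: dict = {}
    if not isinstance(limit, NotGiven):
        params["limit"] = limit  # only serialize what the caller provided
    return params


list_items()          # {}            omitted: the server default applies
list_items(limit=20)  # {"limit": 20} explicitly set by the caller
```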
@ -483,53 +483,86 @@
"name": "after",
"in": "query",
"description": "An item ID to list items after, used in pagination.",
"required": false,
"required": true,
"schema": {
"type": "string"
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "include",
"in": "query",
"description": "Specify additional output data to include in the response.",
"required": false,
"required": true,
"schema": {
"type": "array",
"items": {
"type": "string",
"enum": [
"web_search_call.action.sources",
"code_interpreter_call.outputs",
"computer_call_output.output.image_url",
"file_search_call.results",
"message.input_image.image_url",
"message.output_text.logprobs",
"reasoning.encrypted_content"
],
"title": "ConversationItemInclude",
"description": "Specify additional output data to include in the model response."
}
"oneOf": [
{
"type": "array",
"items": {
"type": "string",
"enum": [
"code_interpreter_call.outputs",
"computer_call_output.output.image_url",
"file_search_call.results",
"message.input_image.image_url",
"message.output_text.logprobs",
"reasoning.encrypted_content"
]
}
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "limit",
"in": "query",
"description": "A limit on the number of objects to be returned (1-100, default 20).",
"required": false,
"required": true,
"schema": {
"type": "integer"
"oneOf": [
{
"type": "integer"
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
},
{
"name": "order",
"in": "query",
"description": "The order to return items in (asc or desc, default desc).",
"required": false,
"required": true,
"schema": {
"type": "string",
"enum": [
"asc",
"desc"
"oneOf": [
{
"type": "string",
"enum": [
"asc",
"desc"
]
},
{
"type": "object",
"title": "NotGiven",
"description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```"
}
]
}
}
@ -8439,7 +8472,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -8498,8 +8531,7 @@
"type": "string",
"enum": [
"llm",
"embedding",
"rerank"
"embedding"
],
"title": "ModelType",
"description": "Enumeration of supported model types in Llama Stack."
@ -8591,12 +8623,13 @@
},
"model": {
"type": "string",
"description": "(Optional) The content moderation model you would like to use."
"description": "The content moderation model you would like to use."
}
},
"additionalProperties": false,
"required": [
"input"
"input",
"model"
],
"title": "RunModerationRequest"
},
@ -11844,7 +11877,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -12326,7 +12359,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -13379,7 +13412,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -14926,7 +14959,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -16671,7 +16704,7 @@
"enum": [
"model",
"shield",
"vector_store",
"vector_db",
"dataset",
"scoring_function",
"benchmark",
@ -17926,7 +17959,7 @@
},
{
"name": "Inference",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
View file
@ -350,46 +350,146 @@ paths:
in: query
description: >-
An item ID to list items after, used in pagination.
required: false
required: true
schema:
type: string
oneOf:
- type: string
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: include
in: query
description: >-
Specify additional output data to include in the response.
required: false
required: true
schema:
type: array
items:
type: string
enum:
- web_search_call.action.sources
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
title: ConversationItemInclude
description: >-
Specify additional output data to include in the model response.
oneOf:
- type: array
items:
type: string
enum:
- code_interpreter_call.outputs
- computer_call_output.output.image_url
- file_search_call.results
- message.input_image.image_url
- message.output_text.logprobs
- reasoning.encrypted_content
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: limit
in: query
description: >-
A limit on the number of objects to be returned (1-100, default 20).
required: false
required: true
schema:
type: integer
oneOf:
- type: integer
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
- name: order
in: query
description: >-
The order to return items in (asc or desc, default desc).
required: false
required: true
schema:
type: string
enum:
- asc
- desc
oneOf:
- type: string
enum:
- asc
- desc
- type: object
title: NotGiven
description: >-
A sentinel singleton class used to distinguish omitted keyword arguments
from those passed in with the value None (which may have different
behavior).
For example:
```py
def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response:
...
get(timeout=1) # 1s timeout
get(timeout=None) # No timeout
get() # Default timeout behavior, which may not be statically known
at the method definition.
```
deprecated: false
post:
responses:
@ -6340,7 +6440,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -6382,7 +6482,6 @@ components:
enum:
- llm
- embedding
- rerank
title: ModelType
description: >-
Enumeration of supported model types in Llama Stack.
@ -6443,10 +6542,11 @@ components:
model:
type: string
description: >-
(Optional) The content moderation model you would like to use.
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
@ -9032,7 +9132,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -9340,7 +9440,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -10103,7 +10203,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -11225,7 +11325,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -12552,7 +12652,7 @@ components:
enum:
- model
- shield
- vector_store
- vector_db
- dataset
- scoring_function
- benchmark
@ -13485,16 +13585,13 @@ tags:
embeddings.
This API provides the raw interface to the underlying models. Three kinds of
models are supported:
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic
search.
- Rerank models: these models reorder the documents based on their relevance
to a query.
x-displayName: Inference
- name: Inspect
description: >-
View file
@ -4,9 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import StrEnum
from typing import Annotated, Literal, Protocol, runtime_checkable
from openai import NOT_GIVEN
from openai._types import NotGiven
from openai.types.responses.response_includable import ResponseIncludable
from pydantic import BaseModel, Field
from llama_stack.apis.agents.openai_responses import (
@ -21,7 +23,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
Metadata = dict[str, str]
@ -148,20 +150,6 @@ class ConversationItemCreateRequest(BaseModel):
)
class ConversationItemInclude(StrEnum):
"""
Specify additional output data to include in the model response.
"""
web_search_call_action_sources = "web_search_call.action.sources"
code_interpreter_call_outputs = "code_interpreter_call.outputs"
computer_call_output_output_image_url = "computer_call_output.output.image_url"
file_search_call_results = "file_search_call.results"
message_input_image_image_url = "message.input_image.image_url"
message_output_text_logprobs = "message.output_text.logprobs"
reasoning_encrypted_content = "reasoning.encrypted_content"
@json_schema_type
class ConversationItemList(BaseModel):
"""List of conversation items with pagination."""
@ -262,13 +250,13 @@ class Conversations(Protocol):
...
@webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1)
async def list_items(
async def list(
self,
conversation_id: str,
after: str | None = None,
include: list[ConversationItemInclude] | None = None,
limit: int | None = None,
order: Literal["asc", "desc"] | None = None,
after: str | NotGiven = NOT_GIVEN,
include: list[ResponseIncludable] | NotGiven = NOT_GIVEN,
limit: int | NotGiven = NOT_GIVEN,
order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
) -> ConversationItemList:
"""List items.
View file
@ -117,9 +117,11 @@ class Api(Enum, metaclass=DynamicApiMeta):
post_training = "post_training"
tool_runtime = "tool_runtime"
telemetry = "telemetry"
models = "models"
shields = "shields"
vector_stores = "vector_stores" # only used for routing table
vector_dbs = "vector_dbs" # only used for routing
datasets = "datasets"
scoring_functions = "scoring_functions"
benchmarks = "benchmarks"
View file
@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.common.responses import Order
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
View file
@ -23,7 +23,6 @@ from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
@ -31,6 +30,7 @@ from llama_stack.models.llama.datatypes import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
register_schema(ToolCall)
@ -1234,10 +1234,9 @@ class Inference(InferenceProvider):
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Three kinds of models are supported:
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query.
"""
@webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
View file
@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -27,12 +27,10 @@ class ModelType(StrEnum):
"""Enumeration of supported model types in Llama Stack.
:cvar llm: Large language model for text generation and completion
:cvar embedding: Embedding model for converting text to vector representations
:cvar rerank: Reranking model for reordering documents based on their relevance to a query
"""
llm = "llm"
embedding = "embedding"
rerank = "rerank"
@json_schema_type
View file
@ -11,7 +11,7 @@ from typing import Protocol, runtime_checkable
from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
View file
@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
class ResourceType(StrEnum):
model = "model"
shield = "shield"
vector_store = "vector_store"
vector_db = "vector_db"
dataset = "dataset"
scoring_function = "scoring_function"
benchmark = "benchmark"
@ -34,4 +34,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)")
View file
@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.shields import Shield
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -123,13 +123,13 @@ class Safety(Protocol):
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Create moderation.
Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: (Optional) The content moderation model you would like to use.
:param model: The content moderation model you would like to use.
:returns: A moderation object.
"""
...
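# Usage sketch (illustrative): `safety` stands for some Safety implementation
# and the model id is a placeholder, not a guaranteed identifier.
#
#     result = await safety.run_moderation(
#         input=["is this text safe?"],
#         model="llama-guard",  # now required; None is no longer accepted
#     )
#     # -> ModerationObject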
View file
@ -10,7 +10,7 @@ from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
View file
@ -12,7 +12,7 @@ from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
View file
@ -13,7 +13,7 @@ from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
from .rag_tool import RAGToolRuntime
View file
@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vector_stores import *
from .vector_dbs import *
View file
@ -0,0 +1,93 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class VectorDB(Resource):
"""Vector database resource for storing and querying vector embeddings.
:param type: Type of resource, always 'vector_db' for vector databases
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
"""
type: Literal[ResourceType.vector_db] = ResourceType.vector_db
embedding_model: str
embedding_dimension: int
vector_db_name: str | None = None
@property
def vector_db_id(self) -> str:
return self.identifier
@property
def provider_vector_db_id(self) -> str | None:
return self.provider_resource_id
class VectorDBInput(BaseModel):
"""Input parameters for creating or configuring a vector database.
:param vector_db_id: Unique identifier for the vector database
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
:param provider_vector_db_id: (Optional) Provider-specific identifier for the vector database
"""
vector_db_id: str
embedding_model: str
embedding_dimension: int
provider_id: str | None = None
provider_vector_db_id: str | None = None
class ListVectorDBsResponse(BaseModel):
"""Response from listing vector databases.
:param data: List of vector databases
"""
data: list[VectorDB]
@runtime_checkable
class VectorDBs(Protocol):
"""Internal protocol for vector_dbs routing - no public API endpoints."""
async def list_vector_dbs(self) -> ListVectorDBsResponse:
"""Internal method to list vector databases."""
...
async def get_vector_db(
self,
vector_db_id: str,
) -> VectorDB:
"""Internal method to get a vector database by ID."""
...
async def register_vector_db(
self,
vector_db_id: str,
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
vector_db_name: str | None = None,
provider_vector_db_id: str | None = None,
) -> VectorDB:
"""Internal method to register a vector database."""
...
async def unregister_vector_db(self, vector_db_id: str) -> None:
"""Internal method to unregister a vector database."""
...
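# A minimal in-memory sketch of the internal protocol above, e.g. for tests.
# Assumption: Resource exposes identifier, provider_id and provider_resource_id
# as constructor fields; this is not the stack's actual routing-table code.
class InMemoryVectorDBs:
    def __init__(self) -> None:
        self._dbs: dict[str, VectorDB] = {}

    async def register_vector_db(
        self,
        vector_db_id: str,
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> VectorDB:
        db = VectorDB(
            identifier=vector_db_id,
            provider_id=provider_id or "inline",
            provider_resource_id=provider_vector_db_id or vector_db_id,
            embedding_model=embedding_model,
            embedding_dimension=embedding_dimension or 384,
            vector_db_name=vector_db_name,
        )
        self._dbs[vector_db_id] = db
        return db

    async def get_vector_db(self, vector_db_id: str) -> VectorDB:
        return self._dbs[vector_db_id]

    async def list_vector_dbs(self) -> ListVectorDBsResponse:
        return ListVectorDBsResponse(data=list(self._dbs.values()))

    async def unregister_vector_db(self, vector_db_id: str) -> None:
        self._dbs.pop(vector_db_id, None)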
View file
@ -15,9 +15,9 @@ from fastapi import Body
from pydantic import BaseModel, Field
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema
@ -140,7 +140,6 @@ class VectorStoreFileCounts(BaseModel):
total: int
# TODO: rename this as OpenAIVectorStore
@json_schema_type
class VectorStoreObject(BaseModel):
"""OpenAI Vector Store object.
@ -518,18 +517,17 @@ class OpenAICreateVectorStoreFileBatchRequestWithExtraBody(BaseModel, extra="all
chunking_strategy: VectorStoreChunkingStrategy | None = None
class VectorStoreTable(Protocol):
def get_vector_store(self, vector_store_id: str) -> VectorStore | None: ...
class VectorDBStore(Protocol):
def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
@runtime_checkable
@trace_protocol
class VectorIO(Protocol):
vector_store_table: VectorStoreTable | None = None
vector_db_store: VectorDBStore | None = None
# this will just block now until chunks are inserted, but it should
# probably return a Job instance which can be polled for completion
# TODO: rename vector_db_id to vector_store_id once Stainless is working
@webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
async def insert_chunks(
self,
@ -548,7 +546,6 @@ class VectorIO(Protocol):
"""
...
# TODO: rename vector_db_id to vector_store_id once Stainless is working
@webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
async def query_chunks(
self,
View file
@ -1,51 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
# Internal resource type for storing the vector store routing and other information
class VectorStore(Resource):
"""Vector database resource for storing and querying vector embeddings.
:param type: Type of resource, always 'vector_store' for vector stores
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
"""
type: Literal[ResourceType.vector_store] = ResourceType.vector_store
embedding_model: str
embedding_dimension: int
vector_store_name: str | None = None
@property
def vector_store_id(self) -> str:
return self.identifier
@property
def provider_vector_store_id(self) -> str | None:
return self.provider_resource_id
class VectorStoreInput(BaseModel):
"""Input parameters for creating or configuring a vector database.
:param vector_store_id: Unique identifier for the vector store
:param embedding_model: Name of the embedding model to use for vector generation
:param embedding_dimension: Dimension of the embedding vectors
:param provider_vector_store_id: (Optional) Provider-specific identifier for the vector store
"""
vector_store_id: str
embedding_model: str
embedding_dimension: int
provider_id: str | None = None
provider_vector_store_id: str | None = None
View file
@ -6,8 +6,6 @@
import argparse
from llama_stack.log import setup_logging
from .stack import StackParser
from .stack.utils import print_subcommand_description
@ -44,9 +42,6 @@ class LlamaCLIParser:
def main():
# Initialize logging from environment variables before any other operations
setup_logging()
parser = LlamaCLIParser()
args = parser.parse_args()
parser.run(args)
View file
@ -0,0 +1,519 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import importlib.resources
import json
import os
import shutil
import sys
import textwrap
from functools import lru_cache
from importlib.abc import Traversable
from pathlib import Path
import yaml
from prompt_toolkit import prompt
from prompt_toolkit.completion import WordCompleter
from prompt_toolkit.validation import Validator
from termcolor import colored, cprint
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.table import print_table
from llama_stack.core.build import (
SERVER_DEPENDENCIES,
build_image,
get_provider_dependencies,
)
from llama_stack.core.configure import parse_and_maybe_upgrade_config
from llama_stack.core.datatypes import (
BuildConfig,
BuildProvider,
DistributionSpec,
Provider,
StackRunConfig,
)
from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.external import load_external_apis
from llama_stack.core.resolver import InvalidProviderError
from llama_stack.core.stack import replace_env_vars
from llama_stack.core.storage.datatypes import (
InferenceStoreReference,
KVStoreReference,
ServerStoresConfig,
SqliteKVStoreConfig,
SqliteSqlStoreConfig,
SqlStoreReference,
StorageConfig,
)
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.exec import formulate_run_args, run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.providers.datatypes import Api
DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions"
@lru_cache
def available_distros_specs() -> dict[str, BuildConfig]:
distro_specs = {}
for p in DISTRIBS_PATH.rglob("*build.yaml"):
distro_name = p.parent.name
with open(p) as f:
build_config = BuildConfig(**yaml.safe_load(f))
distro_specs[distro_name] = build_config
return distro_specs
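# Example (illustrative): inspect the bundled distribution specs.
#
#     for name, cfg in available_distros_specs().items():
#         print(name, cfg.image_type, cfg.distribution_spec.description)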
def run_stack_build_command(args: argparse.Namespace) -> None:
if args.list_distros:
return _run_distro_list_cmd()
if args.image_type == ImageType.VENV.value:
current_venv = os.environ.get("VIRTUAL_ENV")
image_name = args.image_name or current_venv
else:
image_name = args.image_name
if args.template:
cprint(
"The --template argument is deprecated. Please use --distro instead.",
color="red",
file=sys.stderr,
)
distro_name = args.template
else:
distro_name = args.distribution
if distro_name:
available_distros = available_distros_specs()
if distro_name not in available_distros:
cprint(
f"Could not find distribution {distro_name}. Please run `llama stack build --list-distros` to check out the available distributions",
color="red",
file=sys.stderr,
)
sys.exit(1)
build_config = available_distros[distro_name]
if args.image_type:
build_config.image_type = args.image_type
else:
cprint(
f"Please specify an image-type ({' | '.join(e.value for e in ImageType)}) for {distro_name}",
color="red",
file=sys.stderr,
)
sys.exit(1)
elif args.providers:
provider_list: dict[str, list[BuildProvider]] = dict()
for api_provider in args.providers.split(","):
if "=" not in api_provider:
cprint(
"Could not parse `--providers`. Please ensure the list is in the format api1=provider1,api2=provider2",
color="red",
file=sys.stderr,
)
sys.exit(1)
api, provider_type = api_provider.split("=")
providers_for_api = get_provider_registry().get(Api(api), None)
if providers_for_api is None:
cprint(
f"{api} is not a valid API.",
color="red",
file=sys.stderr,
)
sys.exit(1)
if provider_type in providers_for_api:
provider = BuildProvider(
provider_type=provider_type,
module=None,
)
provider_list.setdefault(api, []).append(provider)
else:
cprint(
f"{provider_type} is not a valid provider for the {api} API.",
color="red",
file=sys.stderr,
)
sys.exit(1)
distribution_spec = DistributionSpec(
providers=provider_list,
description=",".join(args.providers),
)
if not args.image_type:
cprint(
f"Please specify an image-type (container | venv) for {args.template}",
color="red",
file=sys.stderr,
)
sys.exit(1)
build_config = BuildConfig(image_type=args.image_type, distribution_spec=distribution_spec)
elif not args.config and not distro_name:
name = prompt(
"> Enter a name for your Llama Stack (e.g. my-local-stack): ",
validator=Validator.from_callable(
lambda x: len(x) > 0,
error_message="Name cannot be empty, please enter a name",
),
)
image_type = prompt(
"> Enter the image type you want your Llama Stack to be built as (use <TAB> to see options): ",
completer=WordCompleter([e.value for e in ImageType]),
complete_while_typing=True,
validator=Validator.from_callable(
lambda x: x in [e.value for e in ImageType],
error_message="Invalid image type. Use <TAB> to see options",
),
)
image_name = f"llamastack-{name}"
cprint(
textwrap.dedent(
"""
Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs.
""",
),
color="green",
file=sys.stderr,
)
cprint("Tip: use <TAB> to see options for the providers.\n", color="green", file=sys.stderr)
providers: dict[str, list[BuildProvider]] = dict()
for api, providers_for_api in get_provider_registry().items():
available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
if not available_providers:
continue
api_provider = prompt(
f"> Enter provider for API {api.value}: ",
completer=WordCompleter(available_providers),
complete_while_typing=True,
validator=Validator.from_callable(
lambda x: x in available_providers, # noqa: B023 - see https://github.com/astral-sh/ruff/issues/7847
error_message="Invalid provider, use <TAB> to see options",
),
)
string_providers = api_provider.split(" ")
for provider in string_providers:
providers.setdefault(api.value, []).append(BuildProvider(provider_type=provider))
description = prompt(
"\n > (Optional) Enter a short description for your Llama Stack: ",
default="",
)
distribution_spec = DistributionSpec(
providers=providers,
description=description,
)
build_config = BuildConfig(image_type=image_type, distribution_spec=distribution_spec)
else:
with open(args.config) as f:
try:
contents = yaml.safe_load(f)
contents = replace_env_vars(contents)
build_config = BuildConfig(**contents)
if args.image_type:
build_config.image_type = args.image_type
except Exception as e:
cprint(
f"Could not parse config file {args.config}: {e}",
color="red",
file=sys.stderr,
)
sys.exit(1)
if args.print_deps_only:
print(f"# Dependencies for {distro_name or args.config or image_name}")
normal_deps, special_deps, external_provider_dependencies = get_provider_dependencies(build_config)
normal_deps += SERVER_DEPENDENCIES
print(f"uv pip install {' '.join(normal_deps)}")
for special_dep in special_deps:
print(f"uv pip install {special_dep}")
for external_dep in external_provider_dependencies:
print(f"uv pip install {external_dep}")
return
try:
run_config = _run_stack_build_command_from_build_config(
build_config,
image_name=image_name,
config_path=args.config,
distro_name=distro_name,
)
except Exception as exc:
import traceback
cprint(
f"Error building stack: {exc}",
color="red",
file=sys.stderr,
)
cprint("Stack trace:", color="red", file=sys.stderr)
traceback.print_exc()
sys.exit(1)
if run_config is None:
cprint(
"Run config path is empty",
color="red",
file=sys.stderr,
)
sys.exit(1)
if args.run:
config_dict = yaml.safe_load(run_config.read_text())
config = parse_and_maybe_upgrade_config(config_dict)
if config.external_providers_dir and not config.external_providers_dir.exists():
config.external_providers_dir.mkdir(exist_ok=True)
run_args = formulate_run_args(args.image_type, image_name or config.image_name)
run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", str(run_config)])
run_command(run_args)
def _generate_run_config(
build_config: BuildConfig,
build_dir: Path,
image_name: str,
) -> Path:
"""
Generate a run.yaml template file for user to edit from a build.yaml file
"""
apis = list(build_config.distribution_spec.providers.keys())
distro_dir = DISTRIBS_BASE_DIR / image_name
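# Assumption: the "${env.SQLITE_STORE_DIR:=<path>}" placeholders below are
# expanded by replace_env_vars when the run config is loaded, with ":=" giving
# the default used when SQLITE_STORE_DIR is unset.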
storage = StorageConfig(
backends={
"kv_default": SqliteKVStoreConfig(
db_path=f"${{env.SQLITE_STORE_DIR:={distro_dir}}}/kvstore.db",
),
"sql_default": SqliteSqlStoreConfig(
db_path=f"${{env.SQLITE_STORE_DIR:={distro_dir}}}/sql_store.db",
),
},
stores=ServerStoresConfig(
metadata=KVStoreReference(
backend="kv_default",
namespace="registry",
),
inference=InferenceStoreReference(
backend="sql_default",
table_name="inference_store",
),
conversations=SqlStoreReference(
backend="sql_default",
table_name="openai_conversations",
),
),
)
run_config = StackRunConfig(
container_image=(image_name if build_config.image_type == LlamaStackImageType.CONTAINER.value else None),
image_name=image_name,
apis=apis,
providers={},
storage=storage,
external_providers_dir=build_config.external_providers_dir
if build_config.external_providers_dir
else EXTERNAL_PROVIDERS_DIR,
)
# build providers dict
provider_registry = get_provider_registry(build_config)
for api in apis:
run_config.providers[api] = []
providers = build_config.distribution_spec.providers[api]
for provider in providers:
pid = provider.provider_type.split("::")[-1]
p = provider_registry[Api(api)][provider.provider_type]
if p.deprecation_error:
raise InvalidProviderError(p.deprecation_error)
try:
config_type = instantiate_class_type(provider_registry[Api(api)][provider.provider_type].config_class)
except (ModuleNotFoundError, ValueError) as exc:
# HACK ALERT:
# This code executes after building is done, the import cannot work since the
# package is either available in the venv or container - not available on the host.
# TODO: use a "is_external" flag in ProviderSpec to check if the provider is
# external
cprint(
f"Failed to import provider {provider.provider_type} for API {api} - assuming it's external, skipping: {exc}",
color="yellow",
file=sys.stderr,
)
# Set config_type to None to avoid UnboundLocalError
config_type = None
if config_type is not None and hasattr(config_type, "sample_run_config"):
config = config_type.sample_run_config(__distro_dir__=f"~/.llama/distributions/{image_name}")
else:
config = {}
p_spec = Provider(
provider_id=pid,
provider_type=provider.provider_type,
config=config,
module=provider.module,
)
run_config.providers[api].append(p_spec)
run_config_file = build_dir / f"{image_name}-run.yaml"
with open(run_config_file, "w") as f:
to_write = json.loads(run_config.model_dump_json())
f.write(yaml.dump(to_write, sort_keys=False))
# Only print this message for non-container builds since it will be displayed before the
# container is built
# For non-container builds, the run.yaml is generated at the very end of the build process so it
# makes sense to display this message
if build_config.image_type != LlamaStackImageType.CONTAINER.value:
cprint(f"You can now run your stack with `llama stack run {run_config_file}`", color="green", file=sys.stderr)
return run_config_file
def _run_stack_build_command_from_build_config(
build_config: BuildConfig,
image_name: str | None = None,
distro_name: str | None = None,
config_path: str | None = None,
) -> Path | Traversable:
image_name = image_name or build_config.image_name
if build_config.image_type == LlamaStackImageType.CONTAINER.value:
if distro_name:
image_name = f"distribution-{distro_name}"
else:
if not image_name:
raise ValueError("Please specify an image name when building a container image without a template")
else:
if not image_name and os.environ.get("UV_SYSTEM_PYTHON"):
image_name = "__system__"
if not image_name:
raise ValueError("Please specify an image name when building a venv image")
# At this point, image_name should be guaranteed to be a string
if image_name is None:
raise ValueError("image_name should not be None after validation")
if distro_name:
build_dir = DISTRIBS_BASE_DIR / distro_name
build_file_path = build_dir / f"{distro_name}-build.yaml"
else:
if image_name is None:
raise ValueError("image_name cannot be None")
build_dir = DISTRIBS_BASE_DIR / image_name
build_file_path = build_dir / f"{image_name}-build.yaml"
os.makedirs(build_dir, exist_ok=True)
run_config_file = None
# Generate the run.yaml so it can be included in the container image with the proper entrypoint
# Only do this if we're building a container image and we're not using a template
if build_config.image_type == LlamaStackImageType.CONTAINER.value and not distro_name and config_path:
cprint("Generating run.yaml file", color="yellow", file=sys.stderr)
run_config_file = _generate_run_config(build_config, build_dir, image_name)
with open(build_file_path, "w") as f:
to_write = json.loads(build_config.model_dump_json(exclude_none=True))
f.write(yaml.dump(to_write, sort_keys=False))
# We first install the external APIs so that the build process can use them and discover the
# providers dependencies
if build_config.external_apis_dir:
cprint("Installing external APIs", color="yellow", file=sys.stderr)
external_apis = load_external_apis(build_config)
if external_apis:
# install the external APIs
packages = []
for _, api_spec in external_apis.items():
if api_spec.pip_packages:
packages.extend(api_spec.pip_packages)
cprint(
f"Installing {api_spec.name} with pip packages {api_spec.pip_packages}",
color="yellow",
file=sys.stderr,
)
return_code = run_command(["uv", "pip", "install", *packages])
if return_code != 0:
packages_str = ", ".join(packages)
raise RuntimeError(
f"Failed to install external APIs packages: {packages_str} (return code: {return_code})"
)
return_code = build_image(
build_config,
image_name,
distro_or_config=distro_name or config_path or str(build_file_path),
run_config=run_config_file.as_posix() if run_config_file else None,
)
if return_code != 0:
raise RuntimeError(f"Failed to build image {image_name}")
if distro_name:
# copy run.yaml from distribution to build_dir instead of generating it again
distro_path = importlib.resources.files("llama_stack") / f"distributions/{distro_name}/run.yaml"
run_config_file = build_dir / f"{distro_name}-run.yaml"
with importlib.resources.as_file(distro_path) as path:
shutil.copy(path, run_config_file)
cprint("Build Successful!", color="green", file=sys.stderr)
cprint(f"You can find the newly-built distribution here: {run_config_file}", color="blue", file=sys.stderr)
if build_config.image_type == LlamaStackImageType.VENV.value:
cprint(
"You can run the new Llama Stack distro (after activating "
+ colored(image_name, "cyan")
+ ") via: "
+ colored(f"llama stack run {run_config_file}", "blue"),
color="green",
file=sys.stderr,
)
elif build_config.image_type == LlamaStackImageType.CONTAINER.value:
cprint(
"You can run the container with: "
+ colored(
f"docker run -p 8321:8321 -v ~/.llama:/root/.llama localhost/{image_name} --port 8321", "blue"
),
color="green",
file=sys.stderr,
)
return distro_path
else:
return _generate_run_config(build_config, build_dir, image_name)
def _run_distro_list_cmd() -> None:
headers = [
"Distribution Name",
# "Providers",
"Description",
]
rows = []
for distro_name, spec in available_distros_specs().items():
rows.append(
[
distro_name,
# json.dumps(spec.distribution_spec.providers, indent=2),
spec.distribution_spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
View file
@ -0,0 +1,106 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import textwrap
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand
from llama_stack.log import get_logger
logger = get_logger(__name__, category="cli")
class StackBuild(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"build",
prog="llama stack build",
description="[DEPRECATED] Build a Llama stack container. This command is deprecated and will be removed in a future release. Use `llama stack list-deps <distro>` instead.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_stack_build_command)
def _add_arguments(self):
self.parser.add_argument(
"--config",
type=str,
default=None,
help="Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to enter information interactively",
)
self.parser.add_argument(
"--template",
type=str,
default=None,
help="""(deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions""",
)
self.parser.add_argument(
"--distro",
"--distribution",
dest="distribution",
type=str,
default=None,
help="""Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions""",
)
self.parser.add_argument(
"--list-distros",
"--list-distributions",
action="store_true",
dest="list_distros",
default=False,
help="Show the available distributions for building a Llama Stack distribution",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type to use for the build. If not specified, will use the image type from the template config.",
choices=[e.value for e in ImageType],
default=None, # no default so we can detect if a user specified --image-type and override image_type in the config
)
self.parser.add_argument(
"--image-name",
type=str,
help=textwrap.dedent(
f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the virtual environment to use for
the build. If not specified, currently active environment will be used if found.
"""
),
default=None,
)
self.parser.add_argument(
"--print-deps-only",
default=False,
action="store_true",
help="Print the dependencies for the stack only, without building the stack",
)
self.parser.add_argument(
"--run",
action="store_true",
default=False,
help="Run the stack after building using the same image type, name, and other applicable arguments",
)
self.parser.add_argument(
"--providers",
type=str,
default=None,
help="Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per API.",
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
logger.warning(
"The 'llama stack build' command is deprecated and will be removed in a future release. Please use 'llama stack list-deps'"
)
# always keep implementation completely silo-ed away from CLI so CLI
# can be fast to load and reduces dependencies
from ._build import run_stack_build_command
return run_stack_build_command(args)
View file
@ -15,10 +15,10 @@ import yaml
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.log import LoggingConfig, get_logger
from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent
View file
@ -11,6 +11,7 @@ from llama_stack.cli.stack.list_stacks import StackListBuilds
from llama_stack.cli.stack.utils import print_subcommand_description
from llama_stack.cli.subcommand import Subcommand
from .build import StackBuild
from .list_apis import StackListApis
from .list_deps import StackListDeps
from .list_providers import StackListProviders
@ -40,6 +41,7 @@ class StackParser(Subcommand):
# Add sub-commands
StackListDeps.create(subparsers)
StackBuild.create(subparsers)
StackListApis.create(subparsers)
StackListProviders.create(subparsers)
StackRun.create(subparsers)
View file
@ -41,7 +41,7 @@ class AccessRule(BaseModel):
A rule defines a list of actions either to permit or to forbid. It may specify a
principal or a resource that must match for the rule to take effect. The resource
to match should be specified in the form of a type qualified identifier, e.g.
model::my-model or vector_store::some-db, or a wildcard for all resources of a type,
model::my-model or vector_db::some-db, or a wildcard for all resources of a type,
e.g. model::*. If the principal or resource are not specified, they will match all
requests.
@ -79,9 +79,9 @@ class AccessRule(BaseModel):
description: any user has read access to any resource created by a member of their team
- forbid:
actions: [create, read, delete]
resource: vector_store::*
resource: vector_db::*
unless: user with admin in roles
description: only user with admin role can use vector_store resources
description: only user with admin role can use vector_db resources
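A hedged Python construction of the last rule above (field names follow these
examples; the exact AccessRule schema is an assumption):
    rule = AccessRule.model_validate(
        {
            "forbid": {
                "actions": ["create", "read", "delete"],
                "resource": "vector_db::*",
            },
            "unless": "user with admin in roles",
            "description": "only user with admin role can use vector_db resources",
        }
    )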
"""
View file
@ -0,0 +1,410 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-}
BUILD_PLATFORM=${BUILD_PLATFORM:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
# mounting is not supported by docker buildx, so we use COPY instead
USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-}
# Path to the run.yaml file in the container
RUN_CONFIG_PATH=/app/run.yaml
BUILD_CONTEXT_DIR=$(pwd)
set -euo pipefail
# Define color codes
RED='\033[0;31m'
NC='\033[0m' # No Color
# Usage function
usage() {
echo "Usage: $0 --image-name <image_name> --container-base <container_base> --normal-deps <pip_dependencies> [--run-config <run_config>] [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
echo "Example: $0 --image-name llama-stack-img --container-base python:3.12-slim --normal-deps 'numpy pandas' --run-config ./run.yaml --external-provider-deps 'foo' --optional-deps 'bar'"
exit 1
}
# Parse arguments
image_name=""
container_base=""
normal_deps=""
external_provider_deps=""
optional_deps=""
run_config=""
distro_or_config=""
while [[ $# -gt 0 ]]; do
key="$1"
case "$key" in
--image-name)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --image-name requires a string value" >&2
usage
fi
image_name="$2"
shift 2
;;
--container-base)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --container-base requires a string value" >&2
usage
fi
container_base="$2"
shift 2
;;
--normal-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --normal-deps requires a string value" >&2
usage
fi
normal_deps="$2"
shift 2
;;
--external-provider-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --external-provider-deps requires a string value" >&2
usage
fi
external_provider_deps="$2"
shift 2
;;
--optional-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --optional-deps requires a string value" >&2
usage
fi
optional_deps="$2"
shift 2
;;
--run-config)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --run-config requires a string value" >&2
usage
fi
run_config="$2"
shift 2
;;
--distro-or-config)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --distro-or-config requires a string value" >&2
usage
fi
distro_or_config="$2"
shift 2
;;
*)
echo "Unknown option: $1" >&2
usage
;;
esac
done
# Check required arguments
if [[ -z "$image_name" || -z "$container_base" || -z "$normal_deps" ]]; then
echo "Error: --image-name, --container-base, and --normal-deps are required." >&2
usage
fi
CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
CONTAINER_OPTS=${CONTAINER_OPTS:---progress=plain}
TEMP_DIR=$(mktemp -d)
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
add_to_container() {
output_file="$TEMP_DIR/Containerfile"
if [ -t 0 ]; then
printf '%s\n' "$1" >>"$output_file"
else
cat >>"$output_file"
fi
}
if ! is_command_available "$CONTAINER_BINARY"; then
printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2
exit 1
fi
if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
add_to_container << EOF
FROM $container_base
WORKDIR /app
# We install the Python 3.12 dev headers and build tools so that any
# C-extension wheels (e.g. polyleven, faiss-cpu) can compile successfully.
RUN dnf -y update && dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
ln -s /bin/pip3.12 /bin/pip && ln -s /bin/python3.12 /bin/python && dnf clean all
ENV UV_SYSTEM_PYTHON=1
RUN pip install uv
EOF
else
add_to_container << EOF
FROM $container_base
WORKDIR /app
RUN apt-get update && apt-get install -y \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget git \
procps psmisc lsof \
traceroute \
bubblewrap \
gcc g++ \
&& rm -rf /var/lib/apt/lists/*
ENV UV_SYSTEM_PYTHON=1
RUN pip install uv
EOF
fi
# Add pip dependencies first since llama-stack is what will change most often
# so we can reuse layers.
if [ -n "$normal_deps" ]; then
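# %q shell-quotes each dependency so specs like "pkg>=1.0" survive expansion
# into the Containerfile RUN line below.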
read -ra pip_args <<< "$normal_deps"
quoted_deps=$(printf " %q" "${pip_args[@]}")
add_to_container << EOF
RUN uv pip install --no-cache $quoted_deps
EOF
fi
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
read -ra pip_args <<< "$part"
quoted_deps=$(printf " %q" "${pip_args[@]}")
add_to_container <<EOF
RUN uv pip install --no-cache $quoted_deps
EOF
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
read -ra pip_args <<< "$part"
quoted_deps=$(printf " %q" "${pip_args[@]}")
add_to_container <<EOF
RUN uv pip install --no-cache $quoted_deps
EOF
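# Ask each external provider package for its own pip dependencies: inside the
# image, import <package>.provider, call get_provider_spec(), and pipe any
# spec.pip_packages back into `uv pip install -r -`.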
add_to_container <<EOF
RUN python3 - <<PYTHON | uv pip install --no-cache -r -
import importlib
import sys
try:
package_name = '$part'.split('==')[0].split('>=')[0].split('<=')[0].split('!=')[0].split('<')[0].split('>')[0]
module = importlib.import_module(f'{package_name}.provider')
spec = module.get_provider_spec()
if hasattr(spec, 'pip_packages') and spec.pip_packages:
if isinstance(spec.pip_packages, (list, tuple)):
print('\n'.join(spec.pip_packages))
except Exception as e:
print(f'Error getting provider spec for {package_name}: {e}', file=sys.stderr)
PYTHON
EOF
done
fi
get_python_cmd() {
if is_command_available python; then
echo "python"
elif is_command_available python3; then
echo "python3"
else
echo "Error: Neither python nor python3 is installed. Please install Python to continue." >&2
exit 1
fi
}
if [ -n "$run_config" ]; then
# Copy the run config to the build context since it's an absolute path
cp "$run_config" "$BUILD_CONTEXT_DIR/run.yaml"
# Parse the run.yaml configuration to identify external provider directories
# If external providers are specified, copy their directory to the container
# and update the configuration to reference the new container path
python_cmd=$(get_python_cmd)
external_providers_dir=$($python_cmd -c "import yaml; config = yaml.safe_load(open('$run_config')); print(config.get('external_providers_dir') or '')")
external_providers_dir=$(eval echo "$external_providers_dir")
if [ -n "$external_providers_dir" ]; then
if [ -d "$external_providers_dir" ]; then
echo "Copying external providers directory: $external_providers_dir"
cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
add_to_container << EOF
COPY providers.d /.llama/providers.d
EOF
fi
# Edit the run.yaml file to change the external_providers_dir to /.llama/providers.d
if [ "$(uname)" = "Darwin" ]; then
sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
rm -f "$BUILD_CONTEXT_DIR/run.yaml.bak"
else
sed -i 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
fi
fi
# Copy run config into docker image
add_to_container << EOF
COPY run.yaml $RUN_CONFIG_PATH
EOF
fi
stack_mount="/app/llama-stack-source"
client_mount="/app/llama-stack-client-source"
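# Install a local checkout into the image: COPY it when USE_COPY_NOT_MOUNT=true,
# otherwise rely on the bind mounts added later via -v flags.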
install_local_package() {
local dir="$1"
local mount_point="$2"
local name="$3"
if [ ! -d "$dir" ]; then
echo "${RED}Error: $name is set but directory does not exist: $dir${NC}" >&2
exit 1
fi
if [ "$USE_COPY_NOT_MOUNT" = "true" ]; then
add_to_container << EOF
COPY $dir $mount_point
EOF
fi
add_to_container << EOF
RUN uv pip install --no-cache -e $mount_point
EOF
}
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
install_local_package "$LLAMA_STACK_CLIENT_DIR" "$client_mount" "LLAMA_STACK_CLIENT_DIR"
fi
if [ -n "$LLAMA_STACK_DIR" ]; then
install_local_package "$LLAMA_STACK_DIR" "$stack_mount" "LLAMA_STACK_DIR"
else
if [ -n "$TEST_PYPI_VERSION" ]; then
# these packages are damaged in test-pypi, so install them first
add_to_container << EOF
RUN uv pip install --no-cache fastapi libcst
EOF
add_to_container << EOF
RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
--index-strategy unsafe-best-match \
llama-stack==$TEST_PYPI_VERSION
EOF
else
if [ -n "$PYPI_VERSION" ]; then
SPEC_VERSION="llama-stack==${PYPI_VERSION}"
else
SPEC_VERSION="llama-stack"
fi
add_to_container << EOF
RUN uv pip install --no-cache $SPEC_VERSION
EOF
fi
fi
# remove uv after installation
add_to_container << EOF
RUN pip uninstall -y uv
EOF
# If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then
add_to_container << EOF
ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF
elif [[ "$distro_or_config" != *.yaml ]]; then
add_to_container << EOF
ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF
fi
# Add other required commands generic to all containers
add_to_container << EOF
RUN mkdir -p /.llama /.cache && chmod -R g+rw /.llama /.cache && (chmod -R g+rw /app 2>/dev/null || true)
EOF
printf "Containerfile created successfully in %s/Containerfile\n\n" "$TEMP_DIR"
cat "$TEMP_DIR"/Containerfile
printf "\n"
# Start building the CLI arguments
CLI_ARGS=()
# Read CONTAINER_OPTS and put it in an array
read -ra CLI_ARGS <<< "$CONTAINER_OPTS"
if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then
if [ -n "$LLAMA_STACK_DIR" ]; then
CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_DIR"):$stack_mount")
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_CLIENT_DIR"):$client_mount")
fi
fi
if is_command_available selinuxenabled && selinuxenabled; then
# Disable SELinux labels -- we don't want to relabel the llama-stack source dir
CLI_ARGS+=("--security-opt" "label=disable")
fi
# Set version tag based on PyPI version
if [ -n "$PYPI_VERSION" ]; then
version_tag="$PYPI_VERSION"
elif [ -n "$TEST_PYPI_VERSION" ]; then
version_tag="test-$TEST_PYPI_VERSION"
elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_STACK_CLIENT_DIR" ]]; then
version_tag="dev"
else
URL="https://pypi.org/pypi/llama-stack/json"
version_tag=$(curl -s "$URL" | jq -r '.info.version')
fi
# Add version tag to image name
image_tag="$image_name:$version_tag"
# Detect platform architecture
ARCH=$(uname -m)
if [ -n "$BUILD_PLATFORM" ]; then
CLI_ARGS+=("--platform" "$BUILD_PLATFORM")
elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
CLI_ARGS+=("--platform" "linux/arm64")
elif [ "$ARCH" = "x86_64" ]; then
CLI_ARGS+=("--platform" "linux/amd64")
else
echo "Unsupported architecture: $ARCH"
exit 1
fi
echo "PWD: $(pwd)"
echo "Containerfile: $TEMP_DIR/Containerfile"
set -x
$CONTAINER_BINARY build \
"${CLI_ARGS[@]}" \
-t "$image_tag" \
-f "$TEMP_DIR/Containerfile" \
"$BUILD_CONTEXT_DIR"
# clean up tmp/configs
rm -rf "$BUILD_CONTEXT_DIR/run.yaml" "$TEMP_DIR"
set +x
echo "Success!"

220 llama_stack/core/build_venv.sh Executable file
View file

@ -0,0 +1,220 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
UV_SYSTEM_PYTHON=${UV_SYSTEM_PYTHON:-}
VIRTUAL_ENV=${VIRTUAL_ENV:-}
set -euo pipefail
# Define color codes
RED='\033[0;31m'
NC='\033[0m' # No Color
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
# Usage function
usage() {
echo "Usage: $0 --env-name <env_name> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
echo "Example: $0 --env-name mybuild --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
exit 1
}
# Parse arguments
env_name=""
normal_deps=""
external_provider_deps=""
optional_deps=""
while [[ $# -gt 0 ]]; do
key="$1"
case "$key" in
--env-name)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --env-name requires a string value" >&2
usage
fi
env_name="$2"
shift 2
;;
--normal-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --normal-deps requires a string value" >&2
usage
fi
normal_deps="$2"
shift 2
;;
--external-provider-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --external-provider-deps requires a string value" >&2
usage
fi
external_provider_deps="$2"
shift 2
;;
--optional-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --optional-deps requires a string value" >&2
usage
fi
optional_deps="$2"
shift 2
;;
*)
echo "Unknown option: $1" >&2
usage
;;
esac
done
# Check required arguments
if [[ -z "$env_name" || -z "$normal_deps" ]]; then
echo "Error: --env-name and --normal-deps are required." >&2
usage
fi
if [ -n "$LLAMA_STACK_DIR" ]; then
echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi
ENVNAME=""
# pre-run checks to make sure we can proceed with the installation
pre_run_checks() {
local env_name="$1"
if ! is_command_available uv; then
echo "uv is not installed, trying to install it."
if ! is_command_available pip; then
echo "pip is not installed, cannot automatically install 'uv'."
echo "Follow this link to install it:"
echo "https://docs.astral.sh/uv/getting-started/installation/"
exit 1
else
pip install uv
fi
fi
# checking if an environment with the same name already exists
if [ -d "$env_name" ]; then
echo "Environment '$env_name' already exists, re-using it."
fi
}
run() {
# Use only global variables set by flag parser
if [ -n "$UV_SYSTEM_PYTHON" ] || [ "$env_name" == "__system__" ]; then
echo "Installing dependencies in system Python environment"
export UV_SYSTEM_PYTHON=1
elif [ "$VIRTUAL_ENV" == "$env_name" ]; then
echo "Virtual environment $env_name is already active"
else
echo "Using virtual environment $env_name"
uv venv "$env_name"
source "$env_name/bin/activate"
fi
if [ -n "$TEST_PYPI_VERSION" ]; then
uv pip install fastapi libcst
uv pip install --extra-index-url https://test.pypi.org/simple/ \
--index-strategy unsafe-best-match \
llama-stack=="$TEST_PYPI_VERSION" \
$normal_deps
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install "$part"
done
fi
else
if [ -n "$LLAMA_STACK_DIR" ]; then
# skip the existence check when LLAMA_STACK_DIR is a git+ reference
if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
# editable only if LLAMA_STACK_DIR does not start with "git+"
if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
else
uv pip install --no-cache-dir llama-stack
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
# skip the existence check when LLAMA_STACK_CLIENT_DIR is a git+ reference
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
# editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
fi
printf "Installing pip dependencies\n"
uv pip install $normal_deps
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "Installing special provider module: $part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "Installing external provider module: $part"
uv pip install "$part"
echo "Getting provider spec for module: $part and installing dependencies"
package_name=$(echo "$part" | sed 's/[<>=!].*//')
python3 -c "
import importlib
import sys
try:
module = importlib.import_module(f'$package_name.provider')
spec = module.get_provider_spec()
if hasattr(spec, 'pip_packages') and spec.pip_packages:
print('\\n'.join(spec.pip_packages))
except Exception as e:
print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
" | uv pip install -r -
done
fi
fi
}
pre_run_checks "$env_name"
run
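The script expects `--optional-deps` and `--external-provider-deps` as single strings joined with `#` (see the `IFS='#'` loops above). A hedged sketch of composing such an invocation from Python; the dependency names are illustrative:

```python
import subprocess

# Illustrative dependency lists; '#' is the separator build_venv.sh splits on.
optional_deps = ["sqlite-vec", "faiss-cpu"]
external_provider_deps = ["my-provider-pkg==0.1.0"]

subprocess.run(
    [
        "bash", "llama_stack/core/build_venv.sh",
        "--env-name", "mybuild",
        "--normal-deps", "numpy pandas scipy",
        "--optional-deps", "#".join(optional_deps),
        "--external-provider-deps", "#".join(external_provider_deps),
    ],
    check=True,  # surface a non-zero exit as an exception
)
```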

View file

@ -6,8 +6,9 @@
import secrets
import time
from typing import Any, Literal
from typing import Any
from openai import NOT_GIVEN
from pydantic import BaseModel, TypeAdapter
from llama_stack.apis.conversations.conversations import (
@ -15,7 +16,6 @@ from llama_stack.apis.conversations.conversations import (
ConversationDeletedResource,
ConversationItem,
ConversationItemDeletedResource,
ConversationItemInclude,
ConversationItemList,
Conversations,
Metadata,
@ -247,14 +247,7 @@ class ConversationServiceImpl(Conversations):
adapter: TypeAdapter[ConversationItem] = TypeAdapter(ConversationItem)
return adapter.validate_python(record["item_data"])
async def list_items(
self,
conversation_id: str,
after: str | None = None,
include: list[ConversationItemInclude] | None = None,
limit: int | None = None,
order: Literal["asc", "desc"] | None = None,
) -> ConversationItemList:
async def list(self, conversation_id: str, after=NOT_GIVEN, include=NOT_GIVEN, limit=NOT_GIVEN, order=NOT_GIVEN):
"""List items in the conversation."""
if not conversation_id:
raise ValueError(f"Expected a non-empty value for `conversation_id` but received {conversation_id!r}")
@ -265,12 +258,14 @@ class ConversationServiceImpl(Conversations):
result = await self.sql_store.fetch_all(table="conversation_items", where={"conversation_id": conversation_id})
records = result.data
if order is not None and order == "asc":
if order != NOT_GIVEN and order == "asc":
records.sort(key=lambda x: x["created_at"])
else:
records.sort(key=lambda x: x["created_at"], reverse=True)
actual_limit = limit or 20
actual_limit = 20
if limit != NOT_GIVEN and isinstance(limit, int):
actual_limit = limit
records = records[:actual_limit]
items = [record["item_data"] for record in records]
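The rewritten `list` relies on the OpenAI client's `NOT_GIVEN` sentinel instead of `None` defaults, so "argument omitted" and "argument explicitly None" stay distinguishable. A minimal sketch of the limit handling above:

```python
from openai import NOT_GIVEN


def effective_limit(limit=NOT_GIVEN, default: int = 20) -> int:
    # NOT_GIVEN means the caller omitted the argument; only an explicit
    # integer overrides the default, mirroring the handling above.
    if limit != NOT_GIVEN and isinstance(limit, int):
        return limit
    return default


assert effective_limit() == 20
assert effective_limit(limit=5) == 5
```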

View file

@ -23,15 +23,14 @@ from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
from llama_stack.apis.shields import Shield, ShieldInput
from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.vector_stores import VectorStore, VectorStoreInput
from llama_stack.core.access_control.datatypes import AccessRule
from llama_stack.core.storage.datatypes import (
KVStoreReference,
StorageBackendType,
StorageConfig,
)
from llama_stack.log import LoggingConfig
from llama_stack.providers.datatypes import Api, ProviderSpec
LLAMA_STACK_BUILD_CONFIG_VERSION = 2
@ -72,7 +71,7 @@ class ShieldWithOwner(Shield, ResourceWithOwner):
pass
class VectorStoreWithOwner(VectorStore, ResourceWithOwner):
class VectorDBWithOwner(VectorDB, ResourceWithOwner):
pass
@ -92,12 +91,12 @@ class ToolGroupWithOwner(ToolGroup, ResourceWithOwner):
pass
RoutableObject = Model | Shield | VectorStore | Dataset | ScoringFn | Benchmark | ToolGroup
RoutableObject = Model | Shield | VectorDB | Dataset | ScoringFn | Benchmark | ToolGroup
RoutableObjectWithProvider = Annotated[
ModelWithOwner
| ShieldWithOwner
| VectorStoreWithOwner
| VectorDBWithOwner
| DatasetWithOwner
| ScoringFnWithOwner
| BenchmarkWithOwner
@ -196,6 +195,14 @@ class TelemetryConfig(BaseModel):
enabled: bool = Field(default=False, description="enable or disable telemetry")
class LoggingConfig(BaseModel):
category_levels: dict[str, str] = Field(
default_factory=dict,
description="""
Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""",
)
class OAuth2JWKSConfig(BaseModel):
# The JWKS URI for collecting public keys
uri: str
@ -367,15 +374,6 @@ class VectorStoresConfig(BaseModel):
)
class SafetyConfig(BaseModel):
"""Configuration for default moderations model."""
default_shield_id: str | None = Field(
default=None,
description="ID of the shield to use for when `model` is not specified in the `moderations` API request.",
)
class QuotaPeriod(StrEnum):
DAY = "day"
@ -429,7 +427,7 @@ class RegisteredResources(BaseModel):
models: list[ModelInput] = Field(default_factory=list)
shields: list[ShieldInput] = Field(default_factory=list)
vector_stores: list[VectorStoreInput] = Field(default_factory=list)
vector_dbs: list[VectorDBInput] = Field(default_factory=list)
datasets: list[DatasetInput] = Field(default_factory=list)
scoring_fns: list[ScoringFnInput] = Field(default_factory=list)
benchmarks: list[BenchmarkInput] = Field(default_factory=list)
@ -534,11 +532,6 @@ can be instantiated multiple times (with different configs) if necessary.
description="Configuration for vector stores, including default embedding model",
)
safety: SafetyConfig | None = Field(
default=None,
description="Configuration for default moderations model",
)
@field_validator("external_providers_dir")
@classmethod
def validate_external_providers_dir(cls, v):

View file

@ -25,7 +25,7 @@ from llama_stack.providers.datatypes import (
logger = get_logger(name=__name__, category="core")
INTERNAL_APIS = {Api.inspect, Api.providers, Api.prompts, Api.conversations}
INTERNAL_APIS = {Api.inspect, Api.providers, Api.prompts, Api.conversations, Api.telemetry}
def stack_apis() -> list[Api]:
@ -64,7 +64,7 @@ def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
router_api=Api.tool_runtime,
),
AutoRoutedApiInfo(
routing_table_api=Api.vector_stores,
routing_table_api=Api.vector_dbs,
router_api=Api.vector_io,
),
]

View file

@ -32,7 +32,7 @@ from termcolor import cprint
from llama_stack.core.build import print_pip_install_help
from llama_stack.core.configure import parse_and_maybe_upgrade_config
from llama_stack.core.datatypes import BuildConfig, BuildProvider, DistributionSpec
from llama_stack.core.datatypes import Api, BuildConfig, BuildProvider, DistributionSpec
from llama_stack.core.request_headers import (
PROVIDER_DATA_VAR,
request_provider_data_context,
@ -44,12 +44,11 @@ from llama_stack.core.stack import (
get_stack_run_config_from_distro,
replace_env_vars,
)
from llama_stack.core.telemetry import Telemetry
from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.core.utils.exec import in_notebook
from llama_stack.log import get_logger, setup_logging
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace
from llama_stack.strong_typing.inspection import is_unwrapped_body_param
logger = get_logger(name=__name__, category="core")
@ -201,9 +200,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
skip_logger_removal: bool = False,
):
super().__init__()
# Initialize logging from environment variables first
setup_logging()
# when using the library client, we should not log to console since many
# of our logs are intended for server-side usage
if sinks_from_env := os.environ.get("TELEMETRY_SINKS", None):
@ -282,7 +278,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
else:
prefix = "!" if in_notebook() else ""
cprint(
f"Please run:\n\n{prefix}llama stack list-deps {self.config_path_or_distro_name} | xargs -L1 uv pip install\n\n",
f"Please run:\n\n{prefix}llama stack build --distro {self.config_path_or_distro_name} --image-type venv\n\n",
"yellow",
file=sys.stderr,
)
@ -294,8 +290,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
raise _e
assert self.impls is not None
if self.config.telemetry.enabled:
setup_logger(Telemetry())
if Api.telemetry in self.impls:
setup_logger(self.impls[Api.telemetry])
if not os.environ.get("PYTEST_CURRENT_TEST"):
console = Console()

View file

@ -27,9 +27,10 @@ from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDBs
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.core.client import get_client_impl
from llama_stack.core.datatypes import (
@ -48,6 +49,7 @@ from llama_stack.providers.datatypes import (
Api,
BenchmarksProtocolPrivate,
DatasetsProtocolPrivate,
InlineProviderSpec,
ModelsProtocolPrivate,
ProviderSpec,
RemoteProviderConfig,
@ -80,7 +82,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.inspect: Inspect,
Api.batches: Batches,
Api.vector_io: VectorIO,
Api.vector_stores: VectorStore,
Api.vector_dbs: VectorDBs,
Api.models: Models,
Api.safety: Safety,
Api.shields: Shields,
@ -96,6 +98,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.files: Files,
Api.prompts: Prompts,
Api.conversations: Conversations,
Api.telemetry: Telemetry,
}
if external_apis:
@ -238,6 +241,24 @@ def validate_and_prepare_providers(
key = api_str if api not in router_apis else f"inner-{api_str}"
providers_with_specs[key] = specs
# TODO: remove this logic, telemetry should not have providers.
# if telemetry has been enabled in the config initialize our internal impl
# telemetry is not an external API so it SHOULD NOT be auto-routed.
if run_config.telemetry.enabled:
specs = {}
p = InlineProviderSpec(
api=Api.telemetry,
provider_type="inline::meta-reference",
pip_packages=[],
optional_api_dependencies=[Api.datasetio],
module="llama_stack.providers.inline.telemetry.meta_reference",
config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig",
description="Meta's reference implementation of telemetry and observability using OpenTelemetry.",
)
spec = ProviderWithSpec(spec=p, provider_type="inline::meta-reference", provider_id="meta-reference")
specs["meta-reference"] = spec
providers_with_specs["telemetry"] = specs
return providers_with_specs

View file

@ -29,7 +29,7 @@ async def get_routing_table_impl(
from ..routing_tables.scoring_functions import ScoringFunctionsRoutingTable
from ..routing_tables.shields import ShieldsRoutingTable
from ..routing_tables.toolgroups import ToolGroupsRoutingTable
from ..routing_tables.vector_stores import VectorStoresRoutingTable
from ..routing_tables.vector_dbs import VectorDBsRoutingTable
api_to_tables = {
"models": ModelsRoutingTable,
@ -38,7 +38,7 @@ async def get_routing_table_impl(
"scoring_functions": ScoringFunctionsRoutingTable,
"benchmarks": BenchmarksRoutingTable,
"tool_groups": ToolGroupsRoutingTable,
"vector_stores": VectorStoresRoutingTable,
"vector_dbs": VectorDBsRoutingTable,
}
if api.value not in api_to_tables:
@ -72,6 +72,14 @@ async def get_auto_router_impl(
raise ValueError(f"API {api.value} not found in router map")
api_to_dep_impl = {}
if run_config.telemetry.enabled:
api_to_deps = {
"inference": {"telemetry": Api.telemetry},
}
for dep_name, dep_api in api_to_deps.get(api.value, {}).items():
if dep_api in deps:
api_to_dep_impl[dep_name] = deps[dep_api]
# TODO: move pass configs to routers instead
if api == Api.inference:
inference_ref = run_config.storage.stores.inference
@ -84,12 +92,9 @@ async def get_auto_router_impl(
)
await inference_store.initialize()
api_to_dep_impl["store"] = inference_store
api_to_dep_impl["telemetry_enabled"] = run_config.telemetry.enabled
elif api == Api.vector_io:
api_to_dep_impl["vector_stores_config"] = run_config.vector_stores
elif api == Api.safety:
api_to_dep_impl["safety_config"] = run_config.safety
impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
await impl.initialize()

View file

@ -44,22 +44,17 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingsResponse,
OpenAIMessageParam,
Order,
RerankResponse,
StopReason,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.telemetry import MetricEvent, MetricInResponse
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
logger = get_logger(name=__name__, category="core::routers")
@ -70,14 +65,14 @@ class InferenceRouter(Inference):
def __init__(
self,
routing_table: RoutingTable,
telemetry: Telemetry | None = None,
store: InferenceStore | None = None,
telemetry_enabled: bool = False,
) -> None:
logger.debug("Initializing InferenceRouter")
self.routing_table = routing_table
self.telemetry_enabled = telemetry_enabled
self.telemetry = telemetry
self.store = store
if self.telemetry_enabled:
if self.telemetry:
self.tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(self.tokenizer)
@ -159,7 +154,7 @@ class InferenceRouter(Inference):
model: Model,
) -> list[MetricInResponse]:
metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
if self.telemetry_enabled:
if self.telemetry:
for metric in metrics:
enqueue_event(metric)
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
@ -187,23 +182,6 @@ class InferenceRouter(Inference):
raise ModelTypeError(model_id, model.model_type, expected_model_type)
return model
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
logger.debug(f"InferenceRouter.rerank: {model}")
model_obj = await self._get_model(model, ModelType.rerank)
provider = await self.routing_table.get_provider_impl(model_obj.identifier)
return await provider.rerank(
model=model_obj.identifier,
query=query,
items=items,
max_num_results=max_num_results,
)
async def openai_completion(
self,
params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
@ -223,7 +201,7 @@ class InferenceRouter(Inference):
# that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
response = await provider.openai_completion(params)
if self.telemetry_enabled:
if self.telemetry:
metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
@ -285,7 +263,7 @@ class InferenceRouter(Inference):
if self.store:
asyncio.create_task(self.store.store_chat_completion(response, params.messages))
if self.telemetry_enabled:
if self.telemetry:
metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
@ -393,7 +371,7 @@ class InferenceRouter(Inference):
else:
if hasattr(chunk, "delta"):
completion_text += chunk.delta
if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry_enabled:
if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
complete = True
completion_tokens = await self._count_tokens(completion_text)
# if we are done receiving tokens
@ -401,7 +379,7 @@ class InferenceRouter(Inference):
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
# Create a separate span for streaming completion metrics
if self.telemetry_enabled:
if self.telemetry:
# Log metrics in the new span context
completion_metrics = self._construct_metrics(
prompt_tokens=prompt_tokens,
@ -450,7 +428,7 @@ class InferenceRouter(Inference):
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
# Create a separate span for completion metrics
if self.telemetry_enabled:
if self.telemetry:
# Log metrics in the new span context
completion_metrics = self._construct_metrics(
prompt_tokens=prompt_tokens,
@ -548,7 +526,7 @@ class InferenceRouter(Inference):
completion_text += "".join(choice_data["content_parts"])
# Add metrics to the chunk
if self.telemetry_enabled and hasattr(chunk, "usage") and chunk.usage:
if self.telemetry and hasattr(chunk, "usage") and chunk.usage:
metrics = self._construct_metrics(
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
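Every telemetry branch above funnels usage through `_construct_metrics`, which fans a prompt/completion pair out into three metric events. A hedged stand-in for that shape (the real events are `MetricEvent` objects from `llama_stack.apis.telemetry`; plain dicts are used here for brevity):

```python
import time


def construct_token_metrics(prompt_tokens: int, completion_tokens: int, model_id: str) -> list[dict]:
    # One event per counter, mirroring the prompt/completion/total triple above.
    now = time.time()
    return [
        {"metric": name, "value": value, "unit": "tokens", "timestamp": now, "model_id": model_id}
        for name, value in (
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
            ("total_tokens", prompt_tokens + completion_tokens),
        )
    ]
```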

View file

@ -10,7 +10,6 @@ from llama_stack.apis.inference import Message
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield
from llama_stack.core.datatypes import SafetyConfig
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
@ -21,11 +20,9 @@ class SafetyRouter(Safety):
def __init__(
self,
routing_table: RoutingTable,
safety_config: SafetyConfig | None = None,
) -> None:
logger.debug("Initializing SafetyRouter")
self.routing_table = routing_table
self.safety_config = safety_config
async def initialize(self) -> None:
logger.debug("SafetyRouter.initialize")
@ -63,47 +60,26 @@ class SafetyRouter(Safety):
params=params,
)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
list_shields_response = await self.routing_table.list_shields()
shields = list_shields_response.data
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
async def get_shield_id(self, model: str) -> str:
"""Get Shield id from model (provider_resource_id) of shield."""
list_shields_response = await self.routing_table.list_shields()
selected_shield: Shield | None = None
provider_model: str | None = model
matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
if model:
matches: list[Shield] = [s for s in shields if model == s.provider_resource_id]
if not matches:
raise ValueError(
f"No shield associated with provider_resource id {model}: choose from {[s.provider_resource_id for s in shields]}"
)
raise ValueError(f"No shield associated with provider_resource id {model}")
if len(matches) > 1:
raise ValueError(
f"Multiple shields associated with provider_resource id {model}: matched shields {[s.identifier for s in matches]}"
)
selected_shield = matches[0]
else:
default_shield_id = self.safety_config.default_shield_id if self.safety_config else None
if not default_shield_id:
raise ValueError(
"No moderation model specified and no default_shield_id configured in safety config: select model "
f"from {[s.provider_resource_id or s.identifier for s in shields]}"
)
raise ValueError(f"Multiple shields associated with provider_resource id {model}")
return matches[0]
selected_shield = next((s for s in shields if s.identifier == default_shield_id), None)
if selected_shield is None:
raise ValueError(
f"Default moderation model not found. Choose from {[s.provider_resource_id or s.identifier for s in shields]}."
)
provider_model = selected_shield.provider_resource_id
shield_id = selected_shield.identifier
shield_id = await get_shield_id(self, model)
logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
provider = await self.routing_table.get_provider_impl(shield_id)
response = await provider.run_moderation(
input=input,
model=provider_model,
model=model,
)
return response
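The pared-down `run_moderation` resolves the shield strictly by `provider_resource_id`, failing on zero or multiple matches. The lookup in isolation, with a minimal `Shield` stand-in:

```python
from dataclasses import dataclass


@dataclass
class Shield:  # stand-in for llama_stack.apis.shields.Shield
    identifier: str
    provider_resource_id: str


def get_shield_id(shields: list[Shield], model: str) -> str:
    matches = [s.identifier for s in shields if s.provider_resource_id == model]
    if not matches:
        raise ValueError(f"No shield associated with provider_resource id {model}")
    if len(matches) > 1:
        raise ValueError(f"Multiple shields associated with provider_resource id {model}")
    return matches[0]


shields = [Shield("llama-guard", "meta-llama/Llama-Guard-3-8B")]
assert get_shield_id(shields, "meta-llama/Llama-Guard-3-8B") == "llama-guard"
```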

View file

@ -37,24 +37,24 @@ class ToolRuntimeRouter(ToolRuntime):
async def query(
self,
content: InterleavedContent,
vector_store_ids: list[str],
vector_db_ids: list[str],
query_config: RAGQueryConfig | None = None,
) -> RAGQueryResult:
logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}")
provider = await self.routing_table.get_provider_impl("knowledge_search")
return await provider.query(content, vector_store_ids, query_config)
return await provider.query(content, vector_db_ids, query_config)
async def insert(
self,
documents: list[RAGDocument],
vector_store_id: str,
vector_db_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
logger.debug(
f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
)
provider = await self.routing_table.get_provider_impl("insert_into_memory")
return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
return await provider.insert(documents, vector_db_id, chunk_size_in_tokens)
def __init__(
self,

View file

@ -71,6 +71,25 @@ class VectorIORouter(VectorIO):
raise ValueError(f"Embedding model '{embedding_model_id}' not found or not an embedding model")
async def register_vector_db(
self,
vector_db_id: str,
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
vector_db_name: str | None = None,
provider_vector_db_id: str | None = None,
) -> None:
logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
await self.routing_table.register_vector_db(
vector_db_id,
embedding_model,
embedding_dimension,
provider_id,
vector_db_name,
provider_vector_db_id,
)
async def insert_chunks(
self,
vector_db_id: str,
@ -146,22 +165,22 @@ class VectorIORouter(VectorIO):
else:
provider_id = list(self.routing_table.impls_by_provider_id.keys())[0]
vector_store_id = f"vs_{uuid.uuid4()}"
registered_vector_store = await self.routing_table.register_vector_store(
vector_store_id=vector_store_id,
vector_db_id = f"vs_{uuid.uuid4()}"
registered_vector_db = await self.routing_table.register_vector_db(
vector_db_id=vector_db_id,
embedding_model=embedding_model,
embedding_dimension=embedding_dimension,
provider_id=provider_id,
provider_vector_store_id=vector_store_id,
vector_store_name=params.name,
provider_vector_db_id=vector_db_id,
vector_db_name=params.name,
)
provider = await self.routing_table.get_provider_impl(registered_vector_store.identifier)
provider = await self.routing_table.get_provider_impl(registered_vector_db.identifier)
# Update model_extra with registered values so provider uses the already-registered vector_store
# Update model_extra with registered values so provider uses the already-registered vector_db
if params.model_extra is None:
params.model_extra = {}
params.model_extra["provider_vector_store_id"] = registered_vector_store.provider_resource_id
params.model_extra["provider_id"] = registered_vector_store.provider_id
params.model_extra["provider_vector_db_id"] = registered_vector_db.provider_resource_id
params.model_extra["provider_id"] = registered_vector_db.provider_id
if embedding_model is not None:
params.model_extra["embedding_model"] = embedding_model
if embedding_dimension is not None:
@ -179,15 +198,15 @@ class VectorIORouter(VectorIO):
logger.debug(f"VectorIORouter.openai_list_vector_stores: limit={limit}")
# Route to default provider for now - could aggregate from all providers in the future
# call retrieve on each vector db to get the list of vector stores
vector_stores = await self.routing_table.get_all_with_type("vector_store")
vector_dbs = await self.routing_table.get_all_with_type("vector_db")
all_stores = []
for vector_store in vector_stores:
for vector_db in vector_dbs:
try:
provider = await self.routing_table.get_provider_impl(vector_store.identifier)
vector_store = await provider.openai_retrieve_vector_store(vector_store.identifier)
provider = await self.routing_table.get_provider_impl(vector_db.identifier)
vector_store = await provider.openai_retrieve_vector_store(vector_db.identifier)
all_stores.append(vector_store)
except Exception as e:
logger.error(f"Error retrieving vector store {vector_store.identifier}: {e}")
logger.error(f"Error retrieving vector store {vector_db.identifier}: {e}")
continue
# Sort by created_at
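With the rename back to `vector_db`, registration goes through `register_vector_db`; a hedged usage sketch (the router instance and provider id are assumed to exist):

```python
async def create_db(router) -> None:
    # Assumed: `router` is an initialized VectorIORouter, "faiss" is a
    # configured provider, and the embedding model is registered with an
    # "embedding_dimension" entry in its metadata.
    await router.register_vector_db(
        vector_db_id="my-documents",
        embedding_model="nomic-ai/nomic-embed-text-v1.5",
        provider_id="faiss",
    )
```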

View file

@ -41,7 +41,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
elif api == Api.safety:
return await p.register_shield(obj)
elif api == Api.vector_io:
return await p.register_vector_store(obj)
return await p.register_vector_db(obj)
elif api == Api.datasetio:
return await p.register_dataset(obj)
elif api == Api.scoring:
@ -57,7 +57,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
api = get_impl_api(p)
if api == Api.vector_io:
return await p.unregister_vector_store(obj.identifier)
return await p.unregister_vector_db(obj.identifier)
elif api == Api.inference:
return await p.unregister_model(obj.identifier)
elif api == Api.safety:
@ -108,7 +108,7 @@ class CommonRoutingTableImpl(RoutingTable):
elif api == Api.safety:
p.shield_store = self
elif api == Api.vector_io:
p.vector_store_store = self
p.vector_db_store = self
elif api == Api.datasetio:
p.dataset_store = self
elif api == Api.scoring:
@ -134,15 +134,15 @@ class CommonRoutingTableImpl(RoutingTable):
from .scoring_functions import ScoringFunctionsRoutingTable
from .shields import ShieldsRoutingTable
from .toolgroups import ToolGroupsRoutingTable
from .vector_stores import VectorStoresRoutingTable
from .vector_dbs import VectorDBsRoutingTable
def apiname_object():
if isinstance(self, ModelsRoutingTable):
return ("Inference", "model")
elif isinstance(self, ShieldsRoutingTable):
return ("Safety", "shield")
elif isinstance(self, VectorStoresRoutingTable):
return ("VectorIO", "vector_store")
elif isinstance(self, VectorDBsRoutingTable):
return ("VectorIO", "vector_db")
elif isinstance(self, DatasetsRoutingTable):
return ("DatasetIO", "dataset")
elif isinstance(self, ScoringFunctionsRoutingTable):

View file

@ -6,12 +6,15 @@
from typing import Any
from pydantic import TypeAdapter
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.models import ModelType
from llama_stack.apis.resource import ResourceType
# Removed VectorStores import to avoid exposing public API
# Removed VectorDBs import to avoid exposing public API
from llama_stack.apis.vector_io.vector_io import (
OpenAICreateVectorStoreRequestWithExtraBody,
SearchRankingOptions,
VectorStoreChunkingStrategy,
VectorStoreDeleteResponse,
@ -23,7 +26,7 @@ from llama_stack.apis.vector_io.vector_io import (
VectorStoreSearchResponsePage,
)
from llama_stack.core.datatypes import (
VectorStoreWithOwner,
VectorDBWithOwner,
)
from llama_stack.log import get_logger
@ -32,23 +35,23 @@ from .common import CommonRoutingTableImpl, lookup_model
logger = get_logger(name=__name__, category="core::routing_tables")
class VectorStoresRoutingTable(CommonRoutingTableImpl):
"""Internal routing table for vector_store operations.
class VectorDBsRoutingTable(CommonRoutingTableImpl):
"""Internal routing table for vector_db operations.
Does not inherit from VectorStores to avoid exposing public API endpoints.
Does not inherit from VectorDBs to avoid exposing public API endpoints.
Only provides internal routing functionality for VectorIORouter.
"""
# Internal methods only - no public API exposure
async def register_vector_store(
async def register_vector_db(
self,
vector_store_id: str,
vector_db_id: str,
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
provider_vector_store_id: str | None = None,
vector_store_name: str | None = None,
provider_vector_db_id: str | None = None,
vector_db_name: str | None = None,
) -> Any:
if provider_id is None:
if len(self.impls_by_provider_id) > 0:
@ -64,24 +67,52 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
raise ModelNotFoundError(embedding_model)
if model.model_type != ModelType.embedding:
raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
if "embedding_dimension" not in model.metadata:
raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
vector_store = VectorStoreWithOwner(
identifier=vector_store_id,
type=ResourceType.vector_store.value,
provider_id=provider_id,
provider_resource_id=provider_vector_store_id,
embedding_model=embedding_model,
embedding_dimension=embedding_dimension,
vector_store_name=vector_store_name,
try:
provider = self.impls_by_provider_id[provider_id]
except KeyError:
available_providers = list(self.impls_by_provider_id.keys())
raise ValueError(
f"Provider '{provider_id}' not found in routing table. Available providers: {available_providers}"
) from None
logger.warning(
"VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
)
await self.register_object(vector_store)
return vector_store
request = OpenAICreateVectorStoreRequestWithExtraBody(
name=vector_db_name or vector_db_id,
embedding_model=embedding_model,
embedding_dimension=model.metadata["embedding_dimension"],
provider_id=provider_id,
provider_vector_db_id=provider_vector_db_id,
)
vector_store = await provider.openai_create_vector_store(request)
vector_store_id = vector_store.id
actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
logger.warning(
f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
)
vector_db_data = {
"identifier": vector_store_id,
"type": ResourceType.vector_db.value,
"provider_id": provider_id,
"provider_resource_id": actual_provider_vector_db_id,
"embedding_model": embedding_model,
"embedding_dimension": model.metadata["embedding_dimension"],
"vector_db_name": vector_store.name,
}
vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
await self.register_object(vector_db)
return vector_db
async def openai_retrieve_vector_store(
self,
vector_store_id: str,
) -> VectorStoreObject:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store(vector_store_id)
@ -92,7 +123,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
expires_after: dict[str, Any] | None = None,
metadata: dict[str, Any] | None = None,
) -> VectorStoreObject:
await self.assert_action_allowed("update", "vector_store", vector_store_id)
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_update_vector_store(
vector_store_id=vector_store_id,
@ -105,18 +136,18 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
self,
vector_store_id: str,
) -> VectorStoreDeleteResponse:
await self.assert_action_allowed("delete", "vector_store", vector_store_id)
await self.assert_action_allowed("delete", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
result = await provider.openai_delete_vector_store(vector_store_id)
await self.unregister_vector_store(vector_store_id)
await self.unregister_vector_db(vector_store_id)
return result
async def unregister_vector_store(self, vector_store_id: str) -> None:
async def unregister_vector_db(self, vector_store_id: str) -> None:
"""Remove the vector store from the routing table registry."""
try:
vector_store_obj = await self.get_object_by_identifier("vector_store", vector_store_id)
if vector_store_obj:
await self.unregister_object(vector_store_obj)
vector_db_obj = await self.get_object_by_identifier("vector_db", vector_store_id)
if vector_db_obj:
await self.unregister_object(vector_db_obj)
except Exception as e:
# Log the error but don't fail the operation
logger.warning(f"Failed to unregister vector store {vector_store_id} from routing table: {e}")
@ -131,7 +162,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
rewrite_query: bool | None = False,
search_mode: str | None = "vector",
) -> VectorStoreSearchResponsePage:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_search_vector_store(
vector_store_id=vector_store_id,
@ -150,7 +181,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
await self.assert_action_allowed("update", "vector_store", vector_store_id)
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_attach_file_to_vector_store(
vector_store_id=vector_store_id,
@ -168,7 +199,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
before: str | None = None,
filter: VectorStoreFileStatus | None = None,
) -> list[VectorStoreFileObject]:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_list_files_in_vector_store(
vector_store_id=vector_store_id,
@ -184,7 +215,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
vector_store_id: str,
file_id: str,
) -> VectorStoreFileObject:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file(
vector_store_id=vector_store_id,
@ -196,7 +227,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
vector_store_id: str,
file_id: str,
) -> VectorStoreFileContentsResponse:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_contents(
vector_store_id=vector_store_id,
@ -209,7 +240,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
file_id: str,
attributes: dict[str, Any],
) -> VectorStoreFileObject:
await self.assert_action_allowed("update", "vector_store", vector_store_id)
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_update_vector_store_file(
vector_store_id=vector_store_id,
@ -222,7 +253,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
vector_store_id: str,
file_id: str,
) -> VectorStoreFileDeleteResponse:
await self.assert_action_allowed("delete", "vector_store", vector_store_id)
await self.assert_action_allowed("delete", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_delete_vector_store_file(
vector_store_id=vector_store_id,
@ -236,7 +267,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
attributes: dict[str, Any] | None = None,
chunking_strategy: Any | None = None,
):
await self.assert_action_allowed("update", "vector_store", vector_store_id)
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_create_vector_store_file_batch(
vector_store_id=vector_store_id,
@ -250,7 +281,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
batch_id: str,
vector_store_id: str,
):
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_batch(
batch_id=batch_id,
@ -267,7 +298,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
limit: int | None = 20,
order: str | None = "desc",
):
await self.assert_action_allowed("read", "vector_store", vector_store_id)
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_list_files_in_vector_store_file_batch(
batch_id=batch_id,
@ -284,7 +315,7 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
batch_id: str,
vector_store_id: str,
):
await self.assert_action_allowed("update", "vector_store", vector_store_id)
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_cancel_vector_store_file_batch(
batch_id=batch_id,
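Note that `register_vector_db` above now delegates creation to `openai_create_vector_store` and registers the returned store id, so the caller-supplied `vector_db_id` survives only as the display name. The remapping in isolation:

```python
# Sketch of the identifier remapping performed by register_vector_db above.
def remap(requested_id: str, store_id: str, provider_vector_db_id: str | None = None) -> dict:
    return {
        "identifier": store_id,                                   # e.g. "vs_64a3..."
        "provider_resource_id": provider_vector_db_id or store_id,
        "vector_db_name": requested_id,                           # original id kept as the name
    }
```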

View file

@ -36,6 +36,7 @@ from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import (
AuthenticationRequiredError,
LoggingConfig,
StackRunConfig,
process_cors_config,
)
@ -52,13 +53,19 @@ from llama_stack.core.stack import (
cast_image_name_to_string,
replace_env_vars,
)
from llama_stack.core.telemetry import Telemetry
from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, setup_logger
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.log import LoggingConfig, get_logger, setup_logging
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig
from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
TelemetryAdapter,
)
from llama_stack.providers.utils.telemetry.tracing import (
CURRENT_TRACE_CONTEXT,
setup_logger,
)
from .auth import AuthenticationMiddleware
from .quota import QuotaMiddleware
@ -167,9 +174,7 @@ class StackApp(FastAPI):
@asynccontextmanager
async def lifespan(app: StackApp):
server_version = parse_version("llama-stack")
logger.info(f"Starting up Llama Stack server (version: {server_version})")
logger.info("Starting up")
assert app.stack is not None
app.stack.create_registry_refresh_task()
yield
@ -369,9 +374,6 @@ def create_app() -> StackApp:
Returns:
Configured StackApp instance.
"""
# Initialize logging from environment variables first
setup_logging()
config_file = os.getenv("LLAMA_STACK_CONFIG")
if config_file is None:
raise ValueError("LLAMA_STACK_CONFIG environment variable is required")
@ -444,7 +446,9 @@ def create_app() -> StackApp:
app.add_middleware(CORSMiddleware, **cors_config.model_dump())
if config.telemetry.enabled:
setup_logger(Telemetry())
setup_logger(impls[Api.telemetry])
else:
setup_logger(TelemetryAdapter(TelemetryConfig(), {}))
# Load external APIs if configured
external_apis = load_external_apis(config)
@ -502,8 +506,7 @@ def create_app() -> StackApp:
app.exception_handler(RequestValidationError)(global_exception_handler)
app.exception_handler(Exception)(global_exception_handler)
if config.telemetry.enabled:
app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)
app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)
return app
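`create_app` resolves everything from the `LLAMA_STACK_CONFIG` environment variable and raises if it is unset. A hedged driver sketch (the import path is assumed from this file's location):

```python
import os

# Assumed import path for the server module shown above.
os.environ.setdefault("LLAMA_STACK_CONFIG", "run.yaml")

from llama_stack.core.server.server import create_app

app = create_app()  # then serve with e.g. `uvicorn mymodule:app`
```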

View file

@ -7,8 +7,8 @@ from aiohttp import hdrs
from llama_stack.core.external import ExternalApiSpec
from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
from llama_stack.core.telemetry.tracing import end_trace, start_trace
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import end_trace, start_trace
logger = get_logger(name=__name__, category="core::server")

View file

@ -35,7 +35,7 @@ from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
from llama_stack.core.datatypes import Provider, SafetyConfig, StackRunConfig, VectorStoresConfig
from llama_stack.core.datatypes import Provider, StackRunConfig, VectorStoresConfig
from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
from llama_stack.core.prompts.prompts import PromptServiceConfig, PromptServiceImpl
@ -175,30 +175,6 @@ async def validate_vector_stores_config(vector_stores_config: VectorStoresConfig
logger.debug(f"Validated default embedding model: {default_model_id} (dimension: {embedding_dimension})")
async def validate_safety_config(safety_config: SafetyConfig | None, impls: dict[Api, Any]):
if safety_config is None or safety_config.default_shield_id is None:
return
if Api.shields not in impls:
raise ValueError("Safety configuration requires the shields API to be enabled")
if Api.safety not in impls:
raise ValueError("Safety configuration requires the safety API to be enabled")
shields_impl = impls[Api.shields]
response = await shields_impl.list_shields()
shields_by_id = {shield.identifier: shield for shield in response.data}
default_shield_id = safety_config.default_shield_id
# don't validate if there are no shields registered
if shields_by_id and default_shield_id not in shields_by_id:
available = sorted(shields_by_id)
raise ValueError(
f"Configured default_shield_id '{default_shield_id}' not found among registered shields."
f" Available shields: {available}"
)
class EnvVarError(Exception):
def __init__(self, var_name: str, path: str = ""):
self.var_name = var_name
@ -436,7 +412,6 @@ class Stack:
await register_resources(self.run_config, impls)
await refresh_registry_once(impls)
await validate_vector_stores_config(self.run_config.vector_stores, impls)
await validate_safety_config(self.run_config.safety, impls)
self.impls = impls
def create_registry_refresh_task(self):

View file

@ -1,32 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .telemetry import Telemetry
from .trace_protocol import serialize_value, trace_protocol
from .tracing import (
CURRENT_TRACE_CONTEXT,
ROOT_SPAN_MARKERS,
end_trace,
enqueue_event,
get_current_span,
setup_logger,
span,
start_trace,
)
__all__ = [
"Telemetry",
"trace_protocol",
"serialize_value",
"CURRENT_TRACE_CONTEXT",
"ROOT_SPAN_MARKERS",
"end_trace",
"enqueue_event",
"get_current_span",
"setup_logger",
"span",
"start_trace",
]

View file

@ -9,7 +9,7 @@
1. Start up the Llama Stack API server. More details [here](https://llamastack.github.io/latest/getting_started/index.html).
```
llama stack list-deps together | xargs -L1 uv pip install
llama stack build --distro together --image-type venv
llama stack run together
```

View file

@ -32,7 +32,7 @@ def tool_chat_page():
tool_groups_list = [tool_group.identifier for tool_group in tool_groups]
mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")]
builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")]
selected_vector_stores = []
selected_vector_dbs = []
def reset_agent():
st.session_state.clear()
@ -55,13 +55,13 @@ def tool_chat_page():
)
if "builtin::rag" in toolgroup_selection:
vector_stores = llama_stack_api.client.vector_stores.list() or []
if not vector_stores:
vector_dbs = llama_stack_api.client.vector_dbs.list() or []
if not vector_dbs:
st.info("No vector databases available for selection.")
vector_stores = [vector_store.identifier for vector_store in vector_stores]
selected_vector_stores = st.multiselect(
vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
selected_vector_dbs = st.multiselect(
label="Select Document Collections to use in RAG queries",
options=vector_stores,
options=vector_dbs,
on_change=reset_agent,
)
@ -119,7 +119,7 @@ def tool_chat_page():
tool_dict = dict(
name="builtin::rag",
args={
"vector_store_ids": list(selected_vector_stores),
"vector_db_ids": list(selected_vector_dbs),
},
)
toolgroup_selection[i] = tool_dict

View file

@ -274,5 +274,3 @@ vector_stores:
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
safety:
default_shield_id: llama-guard

View file

@ -157,7 +157,7 @@ docker run \
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack list-deps {{ name }} | xargs -L1 pip install
llama stack build --distro {{ name }} --image-type conda
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \

View file

@ -277,5 +277,3 @@ vector_stores:
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
safety:
default_shield_id: llama-guard

View file

@ -274,5 +274,3 @@ vector_stores:
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
safety:
default_shield_id: llama-guard

View file

@ -12,7 +12,6 @@ from llama_stack.core.datatypes import (
Provider,
ProviderSpec,
QualifiedModel,
SafetyConfig,
ShieldInput,
ToolGroupInput,
VectorStoresConfig,
@ -257,9 +256,6 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
model_id="nomic-ai/nomic-embed-text-v1.5",
),
),
safety_config=SafetyConfig(
default_shield_id="llama-guard",
),
),
},
run_config_env_vars={

View file

@ -24,7 +24,6 @@ from llama_stack.core.datatypes import (
DistributionSpec,
ModelInput,
Provider,
SafetyConfig,
ShieldInput,
TelemetryConfig,
ToolGroupInput,
@ -189,7 +188,6 @@ class RunConfigSettings(BaseModel):
default_datasets: list[DatasetInput] | None = None
default_benchmarks: list[BenchmarkInput] | None = None
vector_stores_config: VectorStoresConfig | None = None
safety_config: SafetyConfig | None = None
telemetry: TelemetryConfig = Field(default_factory=lambda: TelemetryConfig(enabled=True))
storage_backends: dict[str, Any] | None = None
storage_stores: dict[str, Any] | None = None
@ -292,9 +290,6 @@ class RunConfigSettings(BaseModel):
if self.vector_stores_config:
config["vector_stores"] = self.vector_stores_config.model_dump(exclude_none=True)
if self.safety_config:
config["safety"] = self.safety_config.model_dump(exclude_none=True)
return config

View file

@ -9,23 +9,15 @@ import os
import re
from logging.config import dictConfig # allow-direct-logging
from pydantic import BaseModel, Field
from rich.console import Console
from rich.errors import MarkupError
from rich.logging import RichHandler
from llama_stack.core.datatypes import LoggingConfig
# Default log level
DEFAULT_LOG_LEVEL = logging.INFO
class LoggingConfig(BaseModel):
category_levels: dict[str, str] = Field(
default_factory=dict,
description="""
Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""",
)
# Predefined categories
CATEGORIES = [
"core",
@ -145,8 +137,7 @@ class CustomRichHandler(RichHandler):
# Set a reasonable default width for console output, especially when redirected to files
console_width = int(os.environ.get("LLAMA_STACK_LOG_WIDTH", "120"))
# Don't force terminal codes to avoid ANSI escape codes in log files
# Ensure logs go to stderr, not stdout
kwargs["console"] = Console(width=console_width, stderr=True)
kwargs["console"] = Console(width=console_width)
super().__init__(*args, **kwargs)
def emit(self, record):
@ -175,30 +166,14 @@ class CustomFileHandler(logging.FileHandler):
super().emit(record)
def setup_logging(category_levels: dict[str, int] | None = None, log_file: str | None = None) -> None:
def setup_logging(category_levels: dict[str, int], log_file: str | None) -> None:
"""
Configure logging based on the provided category log levels and an optional log file.
If category_levels or log_file are not provided, they will be read from environment variables.
Parameters:
category_levels (Dict[str, int] | None): A dictionary mapping categories to their log levels.
If None, reads from LLAMA_STACK_LOGGING environment variable and uses defaults.
log_file (str | None): Path to a log file to additionally pipe the logs into.
If None, reads from LLAMA_STACK_LOG_FILE environment variable.
category_levels (Dict[str, int]): A dictionary mapping categories to their log levels.
log_file (str): Path to a log file to additionally pipe the logs into
"""
global _category_levels
# Read from environment variables if not explicitly provided
if category_levels is None:
category_levels = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL)
env_config = os.environ.get("LLAMA_STACK_LOGGING", "")
if env_config:
category_levels.update(parse_environment_config(env_config))
# Update the module-level _category_levels so that already-created loggers pick up the new levels
_category_levels.update(category_levels)
if log_file is None:
log_file = os.environ.get("LLAMA_STACK_LOG_FILE")
log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s"
class CategoryFilter(logging.Filter):
@ -249,30 +224,12 @@ def setup_logging(category_levels: dict[str, int] | None = None, log_file: str |
}
},
"loggers": {
**{
category: {
"handlers": list(handlers.keys()), # Apply all handlers
"level": category_levels.get(category, DEFAULT_LOG_LEVEL),
"propagate": False, # Disable propagation to root logger
}
for category in CATEGORIES
},
# Explicitly configure uvicorn loggers to preserve their INFO level
"uvicorn": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
"uvicorn.error": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
"uvicorn.access": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
category: {
"handlers": list(handlers.keys()), # Apply all handlers
"level": category_levels.get(category, DEFAULT_LOG_LEVEL),
"propagate": False, # Disable propagation to root logger
}
for category in CATEGORIES
},
"root": {
"handlers": list(handlers.keys()),
@ -281,18 +238,10 @@ def setup_logging(category_levels: dict[str, int] | None = None, log_file: str |
}
dictConfig(logging_config)
# Update log levels for all loggers that were created before setup_logging was called
for name, logger in logging.root.manager.loggerDict.items():
# Ensure third-party libraries follow the root log level
for _, logger in logging.root.manager.loggerDict.items():
if isinstance(logger, logging.Logger):
# Skip infrastructure loggers (uvicorn, fastapi) to preserve their configured levels
if name.startswith(("uvicorn", "fastapi")):
continue
# Update llama_stack loggers if root level was explicitly set (e.g., via all=CRITICAL)
if name.startswith("llama_stack") and "root" in category_levels:
logger.setLevel(root_level)
# Update third-party library loggers
elif not name.startswith("llama_stack"):
logger.setLevel(root_level)
logger.setLevel(root_level)
def get_logger(
@ -329,3 +278,12 @@ def get_logger(
log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
logger.setLevel(log_level)
return logging.LoggerAdapter(logger, {"category": category})
env_config = os.environ.get("LLAMA_STACK_LOGGING", "")
if env_config:
_category_levels.update(parse_environment_config(env_config))
log_file = os.environ.get("LLAMA_STACK_LOG_FILE")
setup_logging(_category_levels, log_file)

View file
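This logging hunk moves environment handling back to import time: `LLAMA_STACK_LOGGING` seeds the per-category levels and `LLAMA_STACK_LOG_FILE` adds an optional file sink before `setup_logging` is called. As a rough illustration of that flow, a toy category-level parser follows; the semicolon-separated `category=level` format and the abbreviated category list are assumptions, since `parse_environment_config` itself is not shown in this diff:

```python
import logging
import os

CATEGORIES = ["core", "server"]  # abbreviated; the real list is longer
DEFAULT_LOG_LEVEL = logging.INFO


def parse_env_levels(env_config: str) -> dict[str, int]:
    """Parse e.g. 'core=debug;server=error' into {'core': 10, 'server': 40}."""
    levels: dict[str, int] = {}
    for pair in env_config.split(";"):
        if not pair.strip():
            continue  # tolerate trailing separators
        category, _, level = pair.partition("=")
        levels[category.strip()] = logging.getLevelName(level.strip().upper())
    return levels


category_levels = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL)
category_levels.update(parse_env_levels(os.environ.get("LLAMA_STACK_LOGGING", "")))
log_file = os.environ.get("LLAMA_STACK_LOG_FILE")  # optional file sink
print(category_levels, log_file)
```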

@ -17,7 +17,7 @@ from llama_stack.apis.models import Model
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
from llama_stack.apis.tools import ToolGroup
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.schema_utils import json_schema_type
@ -68,10 +68,10 @@ class ShieldsProtocolPrivate(Protocol):
async def unregister_shield(self, identifier: str) -> None: ...
class VectorStoresProtocolPrivate(Protocol):
async def register_vector_store(self, vector_store: VectorStore) -> None: ...
class VectorDBsProtocolPrivate(Protocol):
async def register_vector_db(self, vector_db: VectorDB) -> None: ...
async def unregister_vector_store(self, vector_store_id: str) -> None: ...
async def unregister_vector_db(self, vector_db_id: str) -> None: ...
class DatasetsProtocolPrivate(Protocol):

View file

@ -67,7 +67,6 @@ from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import AccessRule
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import (
BuiltinTool,
@ -79,6 +78,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_tooldef_to_openai_tool,
)
from llama_stack.providers.utils.kvstore import KVStore
from llama_stack.providers.utils.telemetry import tracing
from .persistence import AgentPersistence
from .safety import SafetyException, ShieldRunnerMixin

View file

@ -131,7 +131,7 @@ class OpenAIResponsesImpl:
tool_context.recover_tools_from_previous_response(previous_response)
elif conversation is not None:
conversation_items = await self.conversations_api.list_items(conversation, order="asc")
conversation_items = await self.conversations_api.list(conversation, order="asc")
# Use stored messages as source of truth (like previous_response.messages)
stored_messages = await self.responses_store.get_conversation_messages(conversation)
@ -372,13 +372,14 @@ class OpenAIResponsesImpl:
final_response = stream_chunk.response
elif stream_chunk.type == "response.failed":
failed_response = stream_chunk.response
yield stream_chunk
if stream_chunk.type == "response.output_item.done":
item = stream_chunk.item
output_items.append(item)
# Store and sync before yielding terminal events
# This ensures the storage/syncing happens even if the consumer breaks after receiving the event
# Store and sync immediately after yielding terminal events
# This ensures the storage/syncing happens even if the consumer breaks early
if (
stream_chunk.type in {"response.completed", "response.incomplete"}
and final_response
@ -399,8 +400,6 @@ class OpenAIResponsesImpl:
await self._sync_response_to_conversation(conversation, input, output_items)
await self.responses_store.store_conversation_messages(conversation, messages_to_store)
yield stream_chunk
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
return await self.responses_store.delete_response_object(response_id)

View file
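The responses hunk above changes where the terminal stream chunk is yielded relative to storing and syncing conversation state. A toy async generator shows why that ordering matters when a consumer `break`s right after the terminal event: only work done before the `yield` is guaranteed to run. All names here are illustrative, not the real streaming API:

```python
import asyncio

stored: list[str] = []


async def stream(store_first: bool):
    for chunk in ["delta", "completed"]:
        if store_first and chunk == "completed":
            stored.append(chunk)  # persist before handing the event out
        yield chunk
        if not store_first and chunk == "completed":
            stored.append(chunk)  # never runs if the consumer breaks first


async def consume(store_first: bool) -> list[str]:
    stored.clear()
    async for chunk in stream(store_first):
        if chunk == "completed":
            break  # consumer stops as soon as it sees the terminal event
    return list(stored)


print(asyncio.run(consume(store_first=True)))   # ['completed']
print(asyncio.run(consume(store_first=False)))  # []
```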

@ -65,9 +65,9 @@ from llama_stack.apis.inference import (
OpenAIChoice,
OpenAIMessageParam,
)
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.telemetry import tracing
from .types import ChatCompletionContext, ChatCompletionResult
from .utils import (

View file

@ -37,8 +37,8 @@ from llama_stack.apis.inference import (
)
from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry import tracing
from .types import ChatCompletionContext, ToolExecutionResult

View file

@ -8,8 +8,8 @@ import asyncio
from llama_stack.apis.inference import Message
from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry import tracing
log = get_logger(name=__name__, category="agents::meta_reference")

View file

@ -101,10 +101,7 @@ class MetaReferenceCodeScannerSafetyImpl(Safety):
metadata=metadata,
)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
if model is None:
raise ValueError("Code scanner moderation requires a model identifier.")
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
inputs = input if isinstance(input, list) else [input]
results = []

View file
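Here, and in the Llama Guard and Prompt Guard hunks below, `run_moderation` reverts from `model: str | None = None` plus an explicit guard back to a required `model: str`. A minimal sketch of the optional-parameter guard being removed, using a stub provider rather than any real implementation:

```python
import asyncio
from typing import Any


class ModerationStub:
    """Stand-in provider; only the signature pattern matters here."""

    async def run_moderation(self, input: str | list[str], model: str | None = None) -> Any:
        # With an optional parameter, every provider must fail loudly when
        # a model identifier is actually required.
        if model is None:
            raise ValueError("moderation requires a model identifier")
        inputs = input if isinstance(input, list) else [input]
        return {"model": model, "flagged": [False for _ in inputs]}


print(asyncio.run(ModerationStub().run_moderation("hello", model="llama-guard")))
```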

@ -200,10 +200,7 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
return await impl.run(messages)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
if model is None:
raise ValueError("Llama Guard moderation requires a model identifier.")
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
if isinstance(input, list):
messages = input.copy()
else:

View file

@ -63,7 +63,7 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
return await self.shield.run(messages)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
raise NotImplementedError("run_moderation is not implemented for Prompt Guard")

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from llama_stack.core.datatypes import Api
from .config import TelemetryConfig, TelemetrySink
__all__ = ["TelemetryConfig", "TelemetrySink"]
async def get_provider_impl(config: TelemetryConfig, deps: dict[Api, Any]):
from .telemetry import TelemetryAdapter
impl = TelemetryAdapter(config, deps)
await impl.initialize()
return impl

View file
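For orientation, a rough sketch of driving this async factory directly; the import path is a guess from the package layout implied above, and the empty `deps` mapping is an assumption (the real registry supplies API dependencies such as `Api.datasetio`):

```python
import asyncio


async def main() -> None:
    # Hypothetical import path for the new inline telemetry provider.
    from llama_stack.providers.inline.telemetry.meta_reference import (
        TelemetryConfig,
        get_provider_impl,
    )

    config = TelemetryConfig()  # defaults: no sinks, no OTLP endpoint
    impl = await get_provider_impl(config, deps={})
    print(type(impl).__name__)  # expected: TelemetryAdapter


asyncio.run(main())
```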

@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import StrEnum
from typing import Any
from pydantic import BaseModel, Field, field_validator
class TelemetrySink(StrEnum):
OTEL_TRACE = "otel_trace"
OTEL_METRIC = "otel_metric"
CONSOLE = "console"
class TelemetryConfig(BaseModel):
otel_exporter_otlp_endpoint: str | None = Field(
default=None,
description="The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable.",
)
service_name: str = Field(
# service name is always the same, use zero-width space to avoid clutter
default="\u200b",
description="The service name to use for telemetry",
)
sinks: list[TelemetrySink] = Field(
default_factory=list,
description="List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console)",
)
@field_validator("sinks", mode="before")
@classmethod
def validate_sinks(cls, v):
if isinstance(v, str):
return [TelemetrySink(sink.strip()) for sink in v.split(",")]
return v or []
@classmethod
def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
return {
"service_name": "${env.OTEL_SERVICE_NAME:=\u200b}",
"sinks": "${env.TELEMETRY_SINKS:=}",
"otel_exporter_otlp_endpoint": "${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}",
}

View file
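Because `validate_sinks` above runs in `mode="before"`, a comma-separated string such as the `${env.TELEMETRY_SINKS:=...}` expansion coerces into enum members before list validation. A small sketch, assuming the `TelemetryConfig` defined in this hunk is in scope:

```python
cfg = TelemetryConfig(sinks="otel_trace, console")
print(cfg.sinks)               # [<TelemetrySink.OTEL_TRACE: 'otel_trace'>, <TelemetrySink.CONSOLE: 'console'>]
print("console" in cfg.sinks)  # True: StrEnum members are plain strings too
```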

@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from datetime import UTC, datetime
from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry.sdk.trace.export import SpanProcessor
from opentelemetry.trace.status import StatusCode
from llama_stack.log import get_logger
logger = get_logger(name="console_span_processor", category="telemetry")
class ConsoleSpanProcessor(SpanProcessor):
def __init__(self, print_attributes: bool = False):
self.print_attributes = print_attributes
def on_start(self, span: ReadableSpan, parent_context=None) -> None:
if span.attributes and span.attributes.get("__autotraced__"):
return
timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
logger.info(f"[dim]{timestamp}[/dim] [bold magenta][START][/bold magenta] [dim]{span.name}[/dim]")
def on_end(self, span: ReadableSpan) -> None:
timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
span_context = f"[dim]{timestamp}[/dim] [bold magenta][END][/bold magenta] [dim]{span.name}[/dim]"
if span.status.status_code == StatusCode.ERROR:
span_context += " [bold red][ERROR][/bold red]"
elif span.status.status_code != StatusCode.UNSET:
span_context += f" [{span.status.status_code}]"
duration_ms = (span.end_time - span.start_time) / 1e6
span_context += f" ({duration_ms:.2f}ms)"
logger.info(span_context)
if self.print_attributes and span.attributes:
for key, value in span.attributes.items():
if key.startswith("__"):
continue
str_value = str(value)
if len(str_value) > 1000:
str_value = str_value[:997] + "..."
logger.info(f" [dim]{key}[/dim]: {str_value}")
for event in span.events:
event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
severity = event.attributes.get("severity", "info")
message = event.attributes.get("message", event.name)
if isinstance(message, dict) or isinstance(message, list):
message = json.dumps(message, indent=2)
severity_color = {
"error": "red",
"warn": "yellow",
"info": "white",
"debug": "dim",
}.get(severity, "white")
logger.info(f" {event_time} [bold {severity_color}][{severity.upper()}][/bold {severity_color}] {message}")
if event.attributes:
for key, value in event.attributes.items():
if key.startswith("__") or key in ["message", "severity"]:
continue
logger.info(f"[dim]{key}[/dim]: {value}")
def shutdown(self) -> None:
"""Shutdown the processor."""
pass
def force_flush(self, timeout_millis: float | None = None) -> bool:
"""Force flush any pending spans."""
return True

View file
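A minimal sketch of wiring this processor into an OpenTelemetry tracer provider so spans are logged as they start and end. It assumes the `ConsoleSpanProcessor` defined above is in scope; the span name and attribute are illustrative:

```python
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider

provider = TracerProvider()
provider.add_span_processor(ConsoleSpanProcessor(print_attributes=True))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("demo-span") as span:
    span.set_attribute("request_id", "r-123")  # printed because print_attributes=True
```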

@ -24,13 +24,14 @@ from llama_stack.apis.telemetry import (
SpanStartPayload,
SpanStatus,
StructuredLogEvent,
Telemetry,
UnstructuredLogEvent,
)
from llama_stack.apis.telemetry import (
Telemetry as TelemetryBase,
)
from llama_stack.core.telemetry.tracing import ROOT_SPAN_MARKERS
from llama_stack.core.datatypes import Api
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import ROOT_SPAN_MARKERS
from .config import TelemetryConfig
_GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
"active_spans": {},
@ -49,8 +50,9 @@ def is_tracing_enabled(tracer):
return span.is_recording()
class Telemetry(TelemetryBase):
def __init__(self) -> None:
class TelemetryAdapter(Telemetry):
def __init__(self, _config: TelemetryConfig, deps: dict[Api, Any]) -> None:
self.datasetio_api = deps.get(Api.datasetio)
self.meter = None
global _TRACER_PROVIDER
@ -77,10 +79,8 @@ class Telemetry(TelemetryBase):
metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
metric_provider = MeterProvider(metric_readers=[metric_reader])
metrics.set_meter_provider(metric_provider)
self.is_otel_endpoint_set = True
else:
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT is not set, skipping telemetry")
self.is_otel_endpoint_set = False
self.meter = metrics.get_meter(__name__)
self._lock = _global_lock
@ -89,8 +89,7 @@ class Telemetry(TelemetryBase):
pass
async def shutdown(self) -> None:
if self.is_otel_endpoint_set:
trace.get_tracer_provider().force_flush()
trace.get_tracer_provider().force_flush()
async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None:
if isinstance(event, UnstructuredLogEvent):

View file

@ -17,21 +17,21 @@ from numpy.typing import NDArray
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, VectorStoresProtocolPrivate
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, VectorDBsProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex
from .config import FaissVectorIOConfig
logger = get_logger(name=__name__, category="vector_io")
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:{VERSION}::"
VECTOR_DBS_PREFIX = f"vector_dbs:{VERSION}::"
FAISS_INDEX_PREFIX = f"faiss_index:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:{VERSION}::"
@ -176,28 +176,28 @@ class FaissIndex(EmbeddingIndex):
)
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.inference_api = inference_api
self.cache: dict[str, VectorStoreWithIndex] = {}
self.cache: dict[str, VectorDBWithIndex] = {}
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
# Load existing banks from kvstore
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored_vector_stores = await self.kvstore.values_in_range(start_key, end_key)
stored_vector_dbs = await self.kvstore.values_in_range(start_key, end_key)
for vector_store_data in stored_vector_stores:
vector_store = VectorStore.model_validate_json(vector_store_data)
index = VectorStoreWithIndex(
vector_store,
await FaissIndex.create(vector_store.embedding_dimension, self.kvstore, vector_store.identifier),
for vector_db_data in stored_vector_dbs:
vector_db = VectorDB.model_validate_json(vector_db_data)
index = VectorDBWithIndex(
vector_db,
await FaissIndex.create(vector_db.embedding_dimension, self.kvstore, vector_db.identifier),
self.inference_api,
)
self.cache[vector_store.identifier] = index
self.cache[vector_db.identifier] = index
# Load existing OpenAI vector stores into the in-memory cache
await self.initialize_openai_vector_stores()
@ -222,31 +222,32 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoco
except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
async def register_vector_store(self, vector_store: VectorStore) -> None:
async def register_vector_db(self, vector_db: VectorDB) -> None:
assert self.kvstore is not None
key = f"{VECTOR_DBS_PREFIX}{vector_store.identifier}"
await self.kvstore.set(key=key, value=vector_store.model_dump_json())
key = f"{VECTOR_DBS_PREFIX}{vector_db.identifier}"
await self.kvstore.set(key=key, value=vector_db.model_dump_json())
# Store in cache
self.cache[vector_store.identifier] = VectorStoreWithIndex(
vector_store=vector_store,
index=await FaissIndex.create(vector_store.embedding_dimension, self.kvstore, vector_store.identifier),
self.cache[vector_db.identifier] = VectorDBWithIndex(
vector_db=vector_db,
index=await FaissIndex.create(vector_db.embedding_dimension, self.kvstore, vector_db.identifier),
inference_api=self.inference_api,
)
async def list_vector_stores(self) -> list[VectorStore]:
return [i.vector_store for i in self.cache.values()]
async def list_vector_dbs(self) -> list[VectorDB]:
return [i.vector_db for i in self.cache.values()]
async def unregister_vector_store(self, vector_store_id: str) -> None:
async def unregister_vector_db(self, vector_db_id: str) -> None:
assert self.kvstore is not None
if vector_store_id not in self.cache:
if vector_db_id not in self.cache:
logger.warning(f"Vector DB {vector_db_id} not found")
return
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_store_id}")
await self.cache[vector_db_id].index.delete()
del self.cache[vector_db_id]
await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_db_id}")
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = self.cache.get(vector_db_id)

View file
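Both this faiss adapter and the sqlite-vec adapter below page registered databases back in with a key-range scan from the prefix to `prefix + "\xff"`. A self-contained sketch of why that upper bound captures exactly the prefixed keys under lexicographic ordering; the half-open interval is an assumption about `values_in_range` semantics:

```python
keys = sorted([
    "vector_dbs:v3::alpha",
    "vector_dbs:v3::beta",
    "vector_index:v3::alpha",  # different prefix; must not match
])

start = "vector_dbs:v3::"
end = start + "\xff"  # '\xff' sorts after every printable identifier character

matched = [k for k in keys if start <= k < end]
print(matched)  # ['vector_dbs:v3::alpha', 'vector_dbs:v3::beta']
```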

@ -17,10 +17,10 @@ from numpy.typing import NDArray
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
@ -28,7 +28,7 @@ from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF,
ChunkForDeletion,
EmbeddingIndex,
VectorStoreWithIndex,
VectorDBWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator
@ -41,7 +41,7 @@ HYBRID_SEARCH = "hybrid"
SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH}
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:sqlite_vec:{VERSION}::"
VECTOR_DBS_PREFIX = f"vector_dbs:sqlite_vec:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:sqlite_vec:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:sqlite_vec:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:sqlite_vec:{VERSION}::"
@ -374,32 +374,32 @@ class SQLiteVecIndex(EmbeddingIndex):
await asyncio.to_thread(_delete_chunks)
class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
"""
A VectorIO implementation using SQLite + sqlite_vec.
This class handles vector database registration (with metadata stored in a table named `vector_stores`)
and creates a cache of VectorStoreWithIndex instances (each wrapping a SQLiteVecIndex).
This class handles vector database registration (with metadata stored in a table named `vector_dbs`)
and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
"""
def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.inference_api = inference_api
self.cache: dict[str, VectorStoreWithIndex] = {}
self.vector_store_table = None
self.cache: dict[str, VectorDBWithIndex] = {}
self.vector_db_store = None
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored_vector_stores = await self.kvstore.values_in_range(start_key, end_key)
for db_json in stored_vector_stores:
vector_store = VectorStore.model_validate_json(db_json)
stored_vector_dbs = await self.kvstore.values_in_range(start_key, end_key)
for db_json in stored_vector_dbs:
vector_db = VectorDB.model_validate_json(db_json)
index = await SQLiteVecIndex.create(
vector_store.embedding_dimension, self.config.db_path, vector_store.identifier
vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
)
self.cache[vector_store.identifier] = VectorStoreWithIndex(vector_store, index, self.inference_api)
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
# Load existing OpenAI vector stores into the in-memory cache
await self.initialize_openai_vector_stores()
@ -408,64 +408,63 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresPro
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def list_vector_stores(self) -> list[VectorStore]:
return [v.vector_store for v in self.cache.values()]
async def list_vector_dbs(self) -> list[VectorDB]:
return [v.vector_db for v in self.cache.values()]
async def register_vector_store(self, vector_store: VectorStore) -> None:
index = await SQLiteVecIndex.create(
vector_store.embedding_dimension, self.config.db_path, vector_store.identifier
)
self.cache[vector_store.identifier] = VectorStoreWithIndex(vector_store, index, self.inference_api)
async def register_vector_db(self, vector_db: VectorDB) -> None:
index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex | None:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
if vector_db_id in self.cache:
return self.cache[vector_db_id]
if self.vector_store_table is None:
raise VectorStoreNotFoundError(vector_store_id)
if self.vector_db_store is None:
raise VectorStoreNotFoundError(vector_db_id)
vector_store = self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
vector_db = self.vector_db_store.get_vector_db(vector_db_id)
if not vector_db:
raise VectorStoreNotFoundError(vector_db_id)
index = VectorStoreWithIndex(
vector_store=vector_store,
index = VectorDBWithIndex(
vector_db=vector_db,
index=SQLiteVecIndex(
dimension=vector_store.embedding_dimension,
dimension=vector_db.embedding_dimension,
db_path=self.config.db_path,
bank_id=vector_store.identifier,
bank_id=vector_db.identifier,
kvstore=self.kvstore,
),
inference_api=self.inference_api,
)
self.cache[vector_store_id] = index
self.cache[vector_db_id] = index
return index
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id not in self.cache:
async def unregister_vector_db(self, vector_db_id: str) -> None:
if vector_db_id not in self.cache:
logger.warning(f"Vector DB {vector_db_id} not found")
return
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
await self.cache[vector_db_id].index.delete()
del self.cache[vector_db_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
# The VectorStoreWithIndex helper is expected to compute embeddings via the inference_api
# The VectorDBWithIndex helper is expected to compute embeddings via the inference_api
# and then call our index's add_chunks.
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a sqlite_vec index."""
index = await self._get_and_cache_vector_store_index(store_id)
index = await self._get_and_cache_vector_db_index(store_id)
if not index:
raise VectorStoreNotFoundError(store_id)

View file
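The `_get_and_cache_vector_db_index` helper reverted above follows a cache-or-load shape: return the in-memory entry if present, otherwise rebuild the index wrapper from the registry record and memoize it. A stripped-down sketch of that shape with stand-in types; the real helper raises `VectorStoreNotFoundError` rather than `KeyError`:

```python
import asyncio


class FakeIndex:
    def __init__(self, name: str) -> None:
        self.name = name


class Adapter:
    def __init__(self, registry: dict[str, str]) -> None:
        self.registry = registry  # stands in for the vector_db_store lookup
        self.cache: dict[str, FakeIndex] = {}

    async def get_index(self, vector_db_id: str) -> FakeIndex:
        if vector_db_id in self.cache:  # fast path: already materialized
            return self.cache[vector_db_id]
        if vector_db_id not in self.registry:
            raise KeyError(vector_db_id)
        index = FakeIndex(self.registry[vector_db_id])
        self.cache[vector_db_id] = index  # memoize for subsequent calls
        return index


adapter = Adapter({"db1": "embeddings-db1"})
print(asyncio.run(adapter.get_index("db1")).name)  # embeddings-db1
```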

@ -20,7 +20,7 @@ This provider enables dataset management using NVIDIA's NeMo Customizer service.
Build the NVIDIA environment:
```bash
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
llama stack build --distro nvidia --image-type venv
```
### Basic Usage using the LlamaStack Python Client

View file

@ -18,7 +18,7 @@ This provider enables running inference using NVIDIA NIM.
Build the NVIDIA environment:
```bash
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
llama stack build --distro nvidia --image-type venv
```
### Basic Usage using the LlamaStack Python Client

View file

@ -10,7 +10,7 @@ from .config import NVIDIAConfig
async def get_adapter_impl(config: NVIDIAConfig, _deps) -> Inference:
# import dynamically so `llama stack list-deps` does not fail due to missing dependencies
# import dynamically so `llama stack build` does not fail due to missing dependencies
from .nvidia import NVIDIAInferenceAdapter
if not isinstance(config, NVIDIAConfig):

View file

@ -22,11 +22,11 @@ from llama_stack.apis.inference.inference import (
)
from llama_stack.apis.models import Model
from llama_stack.apis.models.models import ModelType
from llama_stack.core.telemetry.tracing import get_current_span
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
from llama_stack.providers.utils.telemetry.tracing import get_current_span
logger = get_logger(name=__name__, category="providers::remote::watsonx")

View file

@ -22,7 +22,7 @@ This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service
Build the NVIDIA environment:
```bash
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
llama stack build --distro nvidia --image-type venv
```
### Basic Usage using the LlamaStack Python Client

View file

@ -19,7 +19,7 @@ This provider enables safety checks and guardrails for LLM interactions using NV
Build the NVIDIA environment:
```bash
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
llama stack build --distro nvidia --image-type venv
```
### Basic Usage using the LlamaStack Python Client

View file

@ -66,7 +66,7 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
self.shield = NeMoGuardrails(self.config, shield.shield_id)
return await self.shield.run(messages)
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
raise NotImplementedError("NVIDIA safety provider currently does not implement run_moderation")

View file

@ -13,15 +13,15 @@ from numpy.typing import NDArray
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex
from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
@ -30,7 +30,7 @@ log = get_logger(name=__name__, category="vector_io::chroma")
ChromaClientType = chromadb.api.AsyncClientAPI | chromadb.api.ClientAPI
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:chroma:{VERSION}::"
VECTOR_DBS_PREFIX = f"vector_dbs:chroma:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:chroma:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:chroma:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:chroma:{VERSION}::"
@ -114,7 +114,7 @@ class ChromaIndex(EmbeddingIndex):
raise NotImplementedError("Hybrid search is not supported in Chroma")
class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(
self,
config: RemoteChromaVectorIOConfig | InlineChromaVectorIOConfig,
@ -127,11 +127,11 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
self.inference_api = inference_api
self.client = None
self.cache = {}
self.vector_store_table = None
self.vector_db_store = None
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
self.vector_store_table = self.kvstore
self.vector_db_store = self.kvstore
if isinstance(self.config, RemoteChromaVectorIOConfig):
log.info(f"Connecting to Chroma server at: {self.config.url}")
@ -151,26 +151,26 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
async def register_vector_db(self, vector_db: VectorDB) -> None:
collection = await maybe_await(
self.client.get_or_create_collection(
name=vector_store.identifier, metadata={"vector_store": vector_store.model_dump_json()}
name=vector_db.identifier, metadata={"vector_db": vector_db.model_dump_json()}
)
)
self.cache[vector_store.identifier] = VectorStoreWithIndex(
vector_store, ChromaIndex(self.client, collection), self.inference_api
self.cache[vector_db.identifier] = VectorDBWithIndex(
vector_db, ChromaIndex(self.client, collection), self.inference_api
)
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id not in self.cache:
log.warning(f"Vector DB {vector_store_id} not found")
async def unregister_vector_db(self, vector_db_id: str) -> None:
if vector_db_id not in self.cache:
log.warning(f"Vector DB {vector_db_id} not found")
return
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
await self.cache[vector_db_id].index.delete()
del self.cache[vector_db_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
@ -179,30 +179,30 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
return await index.query_chunks(query, params)
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex:
if vector_db_id in self.cache:
return self.cache[vector_db_id]
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise ValueError(f"Vector DB {vector_store_id} not found in Llama Stack")
collection = await maybe_await(self.client.get_collection(vector_store_id))
vector_db = await self.vector_db_store.get_vector_db(vector_db_id)
if not vector_db:
raise ValueError(f"Vector DB {vector_db_id} not found in Llama Stack")
collection = await maybe_await(self.client.get_collection(vector_db_id))
if not collection:
raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
index = VectorStoreWithIndex(vector_store, ChromaIndex(self.client, collection), self.inference_api)
self.cache[vector_store_id] = index
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
index = VectorDBWithIndex(vector_db, ChromaIndex(self.client, collection), self.inference_api)
self.cache[vector_db_id] = index
return index
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a Chroma vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
index = await self._get_and_cache_vector_db_index(store_id)
if not index:
raise ValueError(f"Vector DB {store_id} not found")

View file

@ -14,10 +14,10 @@ from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusC
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
@ -26,7 +26,7 @@ from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_WEIGHTED,
ChunkForDeletion,
EmbeddingIndex,
VectorStoreWithIndex,
VectorDBWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import sanitize_collection_name
@ -35,7 +35,7 @@ from .config import MilvusVectorIOConfig as RemoteMilvusVectorIOConfig
logger = get_logger(name=__name__, category="vector_io::milvus")
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:milvus:{VERSION}::"
VECTOR_DBS_PREFIX = f"vector_dbs:milvus:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:milvus:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:milvus:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:milvus:{VERSION}::"
@ -261,7 +261,7 @@ class MilvusIndex(EmbeddingIndex):
raise
class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(
self,
config: RemoteMilvusVectorIOConfig | InlineMilvusVectorIOConfig,
@ -273,28 +273,28 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
self.cache = {}
self.client = None
self.inference_api = inference_api
self.vector_store_table = None
self.vector_db_store = None
self.metadata_collection_name = "openai_vector_stores_metadata"
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored_vector_stores = await self.kvstore.values_in_range(start_key, end_key)
stored_vector_dbs = await self.kvstore.values_in_range(start_key, end_key)
for vector_store_data in stored_vector_stores:
vector_store = VectorStore.model_validate_json(vector_store_data)
index = VectorStoreWithIndex(
vector_store,
for vector_db_data in stored_vector_dbs:
vector_db = VectorDB.model_validate_json(vector_db_data)
index = VectorDBWithIndex(
vector_db,
index=MilvusIndex(
client=self.client,
collection_name=vector_store.identifier,
collection_name=vector_db.identifier,
consistency_level=self.config.consistency_level,
kvstore=self.kvstore,
),
inference_api=self.inference_api,
)
self.cache[vector_store.identifier] = index
self.cache[vector_db.identifier] = index
if isinstance(self.config, RemoteMilvusVectorIOConfig):
logger.info(f"Connecting to Milvus server at {self.config.uri}")
self.client = MilvusClient(**self.config.model_dump(exclude_none=True))
@ -311,45 +311,45 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
async def register_vector_db(self, vector_db: VectorDB) -> None:
if isinstance(self.config, RemoteMilvusVectorIOConfig):
consistency_level = self.config.consistency_level
else:
consistency_level = "Strong"
index = VectorStoreWithIndex(
vector_store=vector_store,
index=MilvusIndex(self.client, vector_store.identifier, consistency_level=consistency_level),
index = VectorDBWithIndex(
vector_db=vector_db,
index=MilvusIndex(self.client, vector_db.identifier, consistency_level=consistency_level),
inference_api=self.inference_api,
)
self.cache[vector_store.identifier] = index
self.cache[vector_db.identifier] = index
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex | None:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
if vector_db_id in self.cache:
return self.cache[vector_db_id]
if self.vector_store_table is None:
raise VectorStoreNotFoundError(vector_store_id)
if self.vector_db_store is None:
raise VectorStoreNotFoundError(vector_db_id)
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
vector_db = await self.vector_db_store.get_vector_db(vector_db_id)
if not vector_db:
raise VectorStoreNotFoundError(vector_db_id)
index = VectorStoreWithIndex(
vector_store=vector_store,
index=MilvusIndex(client=self.client, collection_name=vector_store.identifier, kvstore=self.kvstore),
index = VectorDBWithIndex(
vector_db=vector_db,
index=MilvusIndex(client=self.client, collection_name=vector_db.identifier, kvstore=self.kvstore),
inference_api=self.inference_api,
)
self.cache[vector_store_id] = index
self.cache[vector_db_id] = index
return index
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id in self.cache:
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def unregister_vector_db(self, vector_db_id: str) -> None:
if vector_db_id in self.cache:
await self.cache[vector_db_id].index.delete()
del self.cache[vector_db_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
@ -358,14 +358,14 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_db_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete a chunk from a milvus vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
index = await self._get_and_cache_vector_db_index(store_id)
if not index:
raise VectorStoreNotFoundError(store_id)

Some files were not shown because too many files have changed in this diff.