Merge 9aef325934 into sapling-pr-archive-ehhuang

This commit is contained in:
ehhuang 2025-10-27 15:32:50 -07:00 committed by GitHub
commit e9a8967ed5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
41 changed files with 1280 additions and 197 deletions

View file

@ -9862,7 +9862,7 @@ components:
$ref: '#/components/schemas/RAGDocument'
description: >-
List of documents to index in the RAG system
vector_db_id:
vector_store_id:
type: string
description: >-
ID of the vector database to store the document embeddings
@ -9873,7 +9873,7 @@ components:
additionalProperties: false
required:
- documents
- vector_db_id
- vector_store_id
- chunk_size_in_tokens
title: InsertRequest
DefaultRAGQueryGeneratorConfig:
@ -10044,7 +10044,7 @@ components:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The query content to search for in the indexed documents
vector_db_ids:
vector_store_ids:
type: array
items:
type: string
@ -10057,7 +10057,7 @@ components:
additionalProperties: false
required:
- content
- vector_db_ids
- vector_store_ids
title: QueryRequest
RAGQueryResult:
type: object
@ -10281,7 +10281,7 @@ components:
InsertChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to insert the chunks into.
@ -10300,13 +10300,13 @@ components:
description: The time to live of the chunks.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- chunks
title: InsertChunksRequest
QueryChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to query.
@ -10326,7 +10326,7 @@ components:
description: The parameters of the query.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- query
title: QueryChunksRequest
QueryChunksResponse:
@ -11844,7 +11844,7 @@ components:
description: Type of the step in an agent turn.
const: memory_retrieval
default: memory_retrieval
vector_db_ids:
vector_store_ids:
type: string
description: >-
The IDs of the vector databases to retrieve context from.
@ -11857,7 +11857,7 @@ components:
- turn_id
- step_id
- step_type
- vector_db_ids
- vector_store_ids
- inserted_context
title: MemoryRetrievalStep
description: >-

View file

@ -72,14 +72,14 @@ description: |
Example with hybrid search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# Using RRF ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@ -91,7 +91,7 @@ description: |
# Using weighted ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@ -105,7 +105,7 @@ description: |
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
@ -114,7 +114,7 @@ description: |
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
@ -277,14 +277,14 @@ The SQLite-vec provider supports three search modes:
Example with hybrid search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# Using RRF ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@ -296,7 +296,7 @@ response = await vector_io.query_chunks(
# Using weighted ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@ -310,7 +310,7 @@ response = await vector_io.query_chunks(
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
@ -319,7 +319,7 @@ response = await vector_io.query_chunks(
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)

View file

@ -4390,7 +4390,7 @@
"const": "memory_retrieval",
"default": "memory_retrieval"
},
"vector_db_ids": {
"vector_store_ids": {
"type": "string",
"description": "The IDs of the vector databases to retrieve context from."
},
@ -4404,7 +4404,7 @@
"turn_id",
"step_id",
"step_type",
"vector_db_ids",
"vector_store_ids",
"inserted_context"
],
"title": "MemoryRetrievalStep",

View file

@ -3252,7 +3252,7 @@ components:
description: Type of the step in an agent turn.
const: memory_retrieval
default: memory_retrieval
vector_db_ids:
vector_store_ids:
type: string
description: >-
The IDs of the vector databases to retrieve context from.
@ -3265,7 +3265,7 @@ components:
- turn_id
- step_id
- step_type
- vector_db_ids
- vector_store_ids
- inserted_context
title: MemoryRetrievalStep
description: >-

View file

@ -2865,7 +2865,7 @@
"const": "memory_retrieval",
"default": "memory_retrieval"
},
"vector_db_ids": {
"vector_store_ids": {
"type": "string",
"description": "The IDs of the vector databases to retrieve context from."
},
@ -2879,7 +2879,7 @@
"turn_id",
"step_id",
"step_type",
"vector_db_ids",
"vector_store_ids",
"inserted_context"
],
"title": "MemoryRetrievalStep",

View file

@ -2085,7 +2085,7 @@ components:
description: Type of the step in an agent turn.
const: memory_retrieval
default: memory_retrieval
vector_db_ids:
vector_store_ids:
type: string
description: >-
The IDs of the vector databases to retrieve context from.
@ -2098,7 +2098,7 @@ components:
- turn_id
- step_id
- step_type
- vector_db_ids
- vector_store_ids
- inserted_context
title: MemoryRetrievalStep
description: >-

View file

@ -11412,7 +11412,7 @@
},
"description": "List of documents to index in the RAG system"
},
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "ID of the vector database to store the document embeddings"
},
@ -11424,7 +11424,7 @@
"additionalProperties": false,
"required": [
"documents",
"vector_db_id",
"vector_store_id",
"chunk_size_in_tokens"
],
"title": "InsertRequest"
@ -11615,7 +11615,7 @@
"$ref": "#/components/schemas/InterleavedContent",
"description": "The query content to search for in the indexed documents"
},
"vector_db_ids": {
"vector_store_ids": {
"type": "array",
"items": {
"type": "string"
@ -11630,7 +11630,7 @@
"additionalProperties": false,
"required": [
"content",
"vector_db_ids"
"vector_store_ids"
],
"title": "QueryRequest"
},
@ -11923,7 +11923,7 @@
"InsertChunksRequest": {
"type": "object",
"properties": {
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "The identifier of the vector database to insert the chunks into."
},
@ -11941,7 +11941,7 @@
},
"additionalProperties": false,
"required": [
"vector_db_id",
"vector_store_id",
"chunks"
],
"title": "InsertChunksRequest"
@ -11949,7 +11949,7 @@
"QueryChunksRequest": {
"type": "object",
"properties": {
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "The identifier of the vector database to query."
},
@ -11986,7 +11986,7 @@
},
"additionalProperties": false,
"required": [
"vector_db_id",
"vector_store_id",
"query"
],
"title": "QueryChunksRequest"

View file

@ -8649,7 +8649,7 @@ components:
$ref: '#/components/schemas/RAGDocument'
description: >-
List of documents to index in the RAG system
vector_db_id:
vector_store_id:
type: string
description: >-
ID of the vector database to store the document embeddings
@ -8660,7 +8660,7 @@ components:
additionalProperties: false
required:
- documents
- vector_db_id
- vector_store_id
- chunk_size_in_tokens
title: InsertRequest
DefaultRAGQueryGeneratorConfig:
@ -8831,7 +8831,7 @@ components:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The query content to search for in the indexed documents
vector_db_ids:
vector_store_ids:
type: array
items:
type: string
@ -8844,7 +8844,7 @@ components:
additionalProperties: false
required:
- content
- vector_db_ids
- vector_store_ids
title: QueryRequest
RAGQueryResult:
type: object
@ -9068,7 +9068,7 @@ components:
InsertChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to insert the chunks into.
@ -9087,13 +9087,13 @@ components:
description: The time to live of the chunks.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- chunks
title: InsertChunksRequest
QueryChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to query.
@ -9113,7 +9113,7 @@ components:
description: The parameters of the query.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- query
title: QueryChunksRequest
QueryChunksResponse:

View file

@ -13084,7 +13084,7 @@
},
"description": "List of documents to index in the RAG system"
},
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "ID of the vector database to store the document embeddings"
},
@ -13096,7 +13096,7 @@
"additionalProperties": false,
"required": [
"documents",
"vector_db_id",
"vector_store_id",
"chunk_size_in_tokens"
],
"title": "InsertRequest"
@ -13287,7 +13287,7 @@
"$ref": "#/components/schemas/InterleavedContent",
"description": "The query content to search for in the indexed documents"
},
"vector_db_ids": {
"vector_store_ids": {
"type": "array",
"items": {
"type": "string"
@ -13302,7 +13302,7 @@
"additionalProperties": false,
"required": [
"content",
"vector_db_ids"
"vector_store_ids"
],
"title": "QueryRequest"
},
@ -13595,7 +13595,7 @@
"InsertChunksRequest": {
"type": "object",
"properties": {
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "The identifier of the vector database to insert the chunks into."
},
@ -13613,7 +13613,7 @@
},
"additionalProperties": false,
"required": [
"vector_db_id",
"vector_store_id",
"chunks"
],
"title": "InsertChunksRequest"
@ -13621,7 +13621,7 @@
"QueryChunksRequest": {
"type": "object",
"properties": {
"vector_db_id": {
"vector_store_id": {
"type": "string",
"description": "The identifier of the vector database to query."
},
@ -13658,7 +13658,7 @@
},
"additionalProperties": false,
"required": [
"vector_db_id",
"vector_store_id",
"query"
],
"title": "QueryChunksRequest"
@ -15719,7 +15719,7 @@
"const": "memory_retrieval",
"default": "memory_retrieval"
},
"vector_db_ids": {
"vector_store_ids": {
"type": "string",
"description": "The IDs of the vector databases to retrieve context from."
},
@ -15733,7 +15733,7 @@
"turn_id",
"step_id",
"step_type",
"vector_db_ids",
"vector_store_ids",
"inserted_context"
],
"title": "MemoryRetrievalStep",

View file

@ -9862,7 +9862,7 @@ components:
$ref: '#/components/schemas/RAGDocument'
description: >-
List of documents to index in the RAG system
vector_db_id:
vector_store_id:
type: string
description: >-
ID of the vector database to store the document embeddings
@ -9873,7 +9873,7 @@ components:
additionalProperties: false
required:
- documents
- vector_db_id
- vector_store_id
- chunk_size_in_tokens
title: InsertRequest
DefaultRAGQueryGeneratorConfig:
@ -10044,7 +10044,7 @@ components:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The query content to search for in the indexed documents
vector_db_ids:
vector_store_ids:
type: array
items:
type: string
@ -10057,7 +10057,7 @@ components:
additionalProperties: false
required:
- content
- vector_db_ids
- vector_store_ids
title: QueryRequest
RAGQueryResult:
type: object
@ -10281,7 +10281,7 @@ components:
InsertChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to insert the chunks into.
@ -10300,13 +10300,13 @@ components:
description: The time to live of the chunks.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- chunks
title: InsertChunksRequest
QueryChunksRequest:
type: object
properties:
vector_db_id:
vector_store_id:
type: string
description: >-
The identifier of the vector database to query.
@ -10326,7 +10326,7 @@ components:
description: The parameters of the query.
additionalProperties: false
required:
- vector_db_id
- vector_store_id
- query
title: QueryChunksRequest
QueryChunksResponse:
@ -11844,7 +11844,7 @@ components:
description: Type of the step in an agent turn.
const: memory_retrieval
default: memory_retrieval
vector_db_ids:
vector_store_ids:
type: string
description: >-
The IDs of the vector databases to retrieve context from.
@ -11857,7 +11857,7 @@ components:
- turn_id
- step_id
- step_type
- vector_db_ids
- vector_store_ids
- inserted_context
title: MemoryRetrievalStep
description: >-

View file

@ -30,8 +30,10 @@ materialize_telemetry_configs() {
local otel_cfg="${dest}/otel-collector-config.yaml"
local prom_cfg="${dest}/prometheus.yml"
local graf_cfg="${dest}/grafana-datasources.yaml"
local graf_dash_cfg="${dest}/grafana-dashboards.yaml"
local dash_json="${dest}/llama-stack-dashboard.json"
for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do
for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do
if [ -e "$asset" ]; then
die "Telemetry asset ${asset} already exists; refusing to overwrite"
fi
@ -103,6 +105,7 @@ datasources:
type: prometheus
access: proxy
url: http://prometheus:9090
uid: prometheus
isDefault: true
editable: true
@ -112,6 +115,224 @@ datasources:
url: http://jaeger:16686
editable: true
EOF
cat <<'EOF' > "$graf_dash_cfg"
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards
EOF
# Write the dashboard JSON via a quoted heredoc to avoid line-length issues
cat > "$dash_json" <<'DASHBOARD_JSON'
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{"color": "green", "value": null}]
}
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "ms"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"id": 3,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "reqps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"id": 6,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "Bps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
"id": 7,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": ["llama-stack"],
"templating": {"list": []},
"time": {"from": "now-15m", "to": "now"},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}
DASHBOARD_JSON
}
# Cleanup function to remove temporary files
@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
-e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \
-v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
-v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
-v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then
die "Grafana startup failed"
fi

View file

@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards

View file

@ -5,6 +5,7 @@ datasources:
type: prometheus
access: proxy
url: http://prometheus:9090
uid: prometheus
isDefault: true
editable: true

View file

@ -0,0 +1,457 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_prompt_tokens_total",
"legendFormat": "Prompt - {{model_id}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_tokens_total",
"legendFormat": "Total - {{model_id}}",
"refId": "B"
}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "ms"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p95",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p99",
"refId": "B"
}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_duration_milliseconds_count)",
"refId": "A"
}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_active_requests)",
"refId": "A"
}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])",
"legendFormat": "{{http_target}} - {{http_status_code}}",
"refId": "A"
}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])",
"legendFormat": "Request",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])",
"legendFormat": "Response",
"refId": "B"
}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": [
"llama-stack"
],
"templating": {
"list": []
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}

View file

@ -135,6 +135,8 @@ $CONTAINER_RUNTIME run -d --name grafana \
-e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \
-v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
-v "$SCRIPT_DIR/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
-v "$SCRIPT_DIR/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
docker.io/grafana/grafana:11.0.0
# Wait for services to start

View file

@ -149,13 +149,13 @@ class ShieldCallStep(StepCommon):
class MemoryRetrievalStep(StepCommon):
"""A memory retrieval step in an agent turn.
:param vector_db_ids: The IDs of the vector databases to retrieve context from.
:param vector_store_ids: The IDs of the vector databases to retrieve context from.
:param inserted_context: The context retrieved from the vector databases.
"""
step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
# TODO: should this be List[str]?
vector_db_ids: str
vector_store_ids: str
inserted_context: InterleavedContent
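
A minimal construction sketch against the renamed field, using only the fields the schema above marks required (turn_id, step_id, the defaulted step_type, vector_store_ids, inserted_context); all values are illustrative:

```python
# Sketch only: values are illustrative, not from the source.
step = MemoryRetrievalStep(
    turn_id="turn-1",
    step_id="step-1",
    vector_store_ids="vs_123",  # still a single string today, per the TODO above
    inserted_context="...retrieved context...",
)
```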

View file

@ -21,8 +21,8 @@ from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.core.telemetry.telemetry import MetricResponseMixin
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import (
BuiltinTool,

View file

@ -190,13 +190,13 @@ class RAGToolRuntime(Protocol):
async def insert(
self,
documents: list[RAGDocument],
vector_db_id: str,
vector_store_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
"""Index documents so they can be used by the RAG system.
:param documents: List of documents to index in the RAG system
:param vector_db_id: ID of the vector database to store the document embeddings
:param vector_store_id: ID of the vector database to store the document embeddings
:param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
"""
...
@ -205,13 +205,13 @@ class RAGToolRuntime(Protocol):
async def query(
self,
content: InterleavedContent,
vector_db_ids: list[str],
vector_store_ids: list[str],
query_config: RAGQueryConfig | None = None,
) -> RAGQueryResult:
"""Query the RAG system for context; typically invoked by the agent.
:param content: The query content to search for in the indexed documents
:param vector_db_ids: List of vector database IDs to search within
:param vector_store_ids: List of vector database IDs to search within
:param query_config: (Optional) Configuration parameters for the query operation
:returns: RAGQueryResult containing the retrieved content and metadata
"""

View file

@ -529,17 +529,17 @@ class VectorIO(Protocol):
# this will just block now until chunks are inserted, but it should
# probably return a Job instance which can be polled for completion
# TODO: rename vector_db_id to vector_store_id once Stainless is working
# NOTE: renamed vector_db_id to vector_store_id; revisit once Stainless is working
@webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
async def insert_chunks(
self,
vector_db_id: str,
vector_store_id: str,
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
"""Insert chunks into a vector database.
:param vector_db_id: The identifier of the vector database to insert the chunks into.
:param vector_store_id: The identifier of the vector database to insert the chunks into.
:param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types.
`metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional.
If `metadata` is provided, you configure how Llama Stack formats the chunk during generation.
@ -548,17 +548,17 @@ class VectorIO(Protocol):
"""
...
# TODO: rename vector_db_id to vector_store_id once Stainless is working
# NOTE: renamed vector_db_id to vector_store_id; revisit once Stainless is working
@webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
async def query_chunks(
self,
vector_db_id: str,
vector_store_id: str,
query: InterleavedContent,
params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
"""Query chunks from a vector database.
:param vector_db_id: The identifier of the vector database to query.
:param vector_store_id: The identifier of the vector database to query.
:param query: The query to search for.
:param params: The parameters of the query.
:returns: A QueryChunksResponse.

View file

@ -312,3 +312,6 @@ class ConversationServiceImpl(Conversations):
logger.debug(f"Deleted item {item_id} from conversation {conversation_id}")
return ConversationItemDeletedResource(id=item_id)
async def shutdown(self) -> None:
pass

View file

@ -230,3 +230,6 @@ class PromptServiceImpl(Prompts):
await self.kvstore.set(default_key, str(version))
return self._deserialize_prompt(data)
async def shutdown(self) -> None:
pass

View file

@ -53,7 +53,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletionContentPartTextParam,
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.telemetry import MetricEvent, MetricInResponse
from llama_stack.core.telemetry.telemetry import MetricEvent, MetricInResponse
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat

View file

@ -73,27 +73,27 @@ class VectorIORouter(VectorIO):
async def insert_chunks(
self,
vector_db_id: str,
vector_store_id: str,
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
doc_ids = [chunk.document_id for chunk in chunks[:3]]
logger.debug(
f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, "
f"VectorIORouter.insert_chunks: {vector_store_id}, {len(chunks)} chunks, "
f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' if len(chunks) > 3 else ''}"
)
provider = await self.routing_table.get_provider_impl(vector_db_id)
return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)
provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.insert_chunks(vector_store_id, chunks, ttl_seconds)
async def query_chunks(
self,
vector_db_id: str,
vector_store_id: str,
query: InterleavedContent,
params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
provider = await self.routing_table.get_provider_impl(vector_db_id)
return await provider.query_chunks(vector_db_id, query, params)
logger.debug(f"VectorIORouter.query_chunks: {vector_store_id}")
provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.query_chunks(vector_store_id, query, params)
# OpenAI Vector Stores API endpoints
async def openai_create_vector_store(

View file

@ -31,7 +31,6 @@ from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
@ -67,7 +66,6 @@ class LlamaStack(
Safety,
SyntheticDataGeneration,
Datasets,
Telemetry,
PostTraining,
VectorIO,
Eval,

View file

@ -6,7 +6,13 @@
import os
import threading
from typing import Any
from datetime import datetime
from enum import Enum
from typing import (
Annotated,
Any,
Literal,
)
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
@ -16,21 +22,399 @@ from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
from pydantic import BaseModel, Field
from llama_stack.apis.telemetry import (
Event,
MetricEvent,
SpanEndPayload,
SpanStartPayload,
SpanStatus,
StructuredLogEvent,
UnstructuredLogEvent,
)
from llama_stack.apis.telemetry import (
Telemetry as TelemetryBase,
)
from llama_stack.core.telemetry.tracing import ROOT_SPAN_MARKERS
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import Primitive
from llama_stack.schema_utils import json_schema_type, register_schema
ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
@json_schema_type
class SpanStatus(Enum):
"""The status of a span indicating whether it completed successfully or with an error.
:cvar OK: Span completed successfully without errors
:cvar ERROR: Span completed with an error or failure
"""
OK = "ok"
ERROR = "error"
@json_schema_type
class Span(BaseModel):
"""A span representing a single operation within a trace.
:param span_id: Unique identifier for the span
:param trace_id: Unique identifier for the trace this span belongs to
:param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
:param name: Human-readable name describing the operation this span represents
:param start_time: Timestamp when the operation began
:param end_time: (Optional) Timestamp when the operation finished, if completed
:param attributes: (Optional) Key-value pairs containing additional metadata about the span
"""
span_id: str
trace_id: str
parent_span_id: str | None = None
name: str
start_time: datetime
end_time: datetime | None = None
attributes: dict[str, Any] | None = Field(default_factory=lambda: {})
def set_attribute(self, key: str, value: Any):
if self.attributes is None:
self.attributes = {}
self.attributes[key] = value
@json_schema_type
class Trace(BaseModel):
"""A trace representing the complete execution path of a request across multiple operations.
:param trace_id: Unique identifier for the trace
:param root_span_id: Unique identifier for the root span that started this trace
:param start_time: Timestamp when the trace began
:param end_time: (Optional) Timestamp when the trace finished, if completed
"""
trace_id: str
root_span_id: str
start_time: datetime
end_time: datetime | None = None
@json_schema_type
class EventType(Enum):
"""The type of telemetry event being logged.
:cvar UNSTRUCTURED_LOG: A simple log message with severity level
:cvar STRUCTURED_LOG: A structured log event with typed payload data
:cvar METRIC: A metric measurement with value and unit
"""
UNSTRUCTURED_LOG = "unstructured_log"
STRUCTURED_LOG = "structured_log"
METRIC = "metric"
@json_schema_type
class LogSeverity(Enum):
"""The severity level of a log message.
:cvar VERBOSE: Detailed diagnostic information for troubleshooting
:cvar DEBUG: Debug information useful during development
:cvar INFO: General informational messages about normal operation
:cvar WARN: Warning messages about potentially problematic situations
:cvar ERROR: Error messages indicating failures that don't stop execution
:cvar CRITICAL: Critical error messages indicating severe failures
"""
VERBOSE = "verbose"
DEBUG = "debug"
INFO = "info"
WARN = "warn"
ERROR = "error"
CRITICAL = "critical"
class EventCommon(BaseModel):
"""Common fields shared by all telemetry events.
:param trace_id: Unique identifier for the trace this event belongs to
:param span_id: Unique identifier for the span this event belongs to
:param timestamp: Timestamp when the event occurred
:param attributes: (Optional) Key-value pairs containing additional metadata about the event
"""
trace_id: str
span_id: str
timestamp: datetime
attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {})
@json_schema_type
class UnstructuredLogEvent(EventCommon):
"""An unstructured log event containing a simple text message.
:param type: Event type identifier set to UNSTRUCTURED_LOG
:param message: The log message text
:param severity: The severity level of the log message
"""
type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
message: str
severity: LogSeverity
@json_schema_type
class MetricEvent(EventCommon):
"""A metric event containing a measured value.
:param type: Event type identifier set to METRIC
:param metric: The name of the metric being measured
:param value: The numeric value of the metric measurement
:param unit: The unit of measurement for the metric value
"""
type: Literal[EventType.METRIC] = EventType.METRIC
metric: str # this would be an enum
value: int | float
unit: str
@json_schema_type
class MetricInResponse(BaseModel):
"""A metric value included in API responses.
:param metric: The name of the metric
:param value: The numeric value of the metric
:param unit: (Optional) The unit of measurement for the metric value
"""
metric: str
value: int | float
unit: str | None = None
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type to only have a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
class MetricResponseMixin(BaseModel):
"""Mixin class for API responses that can include metrics.
:param metrics: (Optional) List of metrics associated with the API response
"""
metrics: list[MetricInResponse] | None = None
@json_schema_type
class StructuredLogType(Enum):
"""The type of structured log event payload.
:cvar SPAN_START: Event indicating the start of a new span
:cvar SPAN_END: Event indicating the completion of a span
"""
SPAN_START = "span_start"
SPAN_END = "span_end"
@json_schema_type
class SpanStartPayload(BaseModel):
"""Payload for a span start event.
:param type: Payload type identifier set to SPAN_START
:param name: Human-readable name describing the operation this span represents
:param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
"""
type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
name: str
parent_span_id: str | None = None
@json_schema_type
class SpanEndPayload(BaseModel):
"""Payload for a span end event.
:param type: Payload type identifier set to SPAN_END
:param status: The final status of the span indicating success or failure
"""
type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
status: SpanStatus
StructuredLogPayload = Annotated[
SpanStartPayload | SpanEndPayload,
Field(discriminator="type"),
]
register_schema(StructuredLogPayload, name="StructuredLogPayload")
@json_schema_type
class StructuredLogEvent(EventCommon):
"""A structured log event containing typed payload data.
:param type: Event type identifier set to STRUCTURED_LOG
:param payload: The structured payload data for the log event
"""
type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
payload: StructuredLogPayload
Event = Annotated[
UnstructuredLogEvent | MetricEvent | StructuredLogEvent,
Field(discriminator="type"),
]
register_schema(Event, name="Event")
@json_schema_type
class EvalTrace(BaseModel):
"""A trace record for evaluation purposes.
:param session_id: Unique identifier for the evaluation session
:param step: The evaluation step or phase identifier
:param input: The input data for the evaluation
:param output: The actual output produced during evaluation
:param expected_output: The expected output for comparison during evaluation
"""
session_id: str
step: str
input: str
output: str
expected_output: str
@json_schema_type
class SpanWithStatus(Span):
"""A span that includes status information.
:param status: (Optional) The current status of the span
"""
status: SpanStatus | None = None
@json_schema_type
class QueryConditionOp(Enum):
"""Comparison operators for query conditions.
:cvar EQ: Equal to comparison
:cvar NE: Not equal to comparison
:cvar GT: Greater than comparison
:cvar LT: Less than comparison
"""
EQ = "eq"
NE = "ne"
GT = "gt"
LT = "lt"
@json_schema_type
class QueryCondition(BaseModel):
"""A condition for filtering query results.
:param key: The attribute key to filter on
:param op: The comparison operator to apply
:param value: The value to compare against
"""
key: str
op: QueryConditionOp
value: Any
class QueryTracesResponse(BaseModel):
"""Response containing a list of traces.
:param data: List of traces matching the query criteria
"""
data: list[Trace]
class QuerySpansResponse(BaseModel):
"""Response containing a list of spans.
:param data: List of spans matching the query criteria
"""
data: list[Span]
class QuerySpanTreeResponse(BaseModel):
"""Response containing a tree structure of spans.
:param data: Dictionary mapping span IDs to spans with status information
"""
data: dict[str, SpanWithStatus]
class MetricQueryType(Enum):
"""The type of metric query to perform.
:cvar RANGE: Query metrics over a time range
:cvar INSTANT: Query metrics at a specific point in time
"""
RANGE = "range"
INSTANT = "instant"
class MetricLabelOperator(Enum):
"""Operators for matching metric labels.
:cvar EQUALS: Label value must equal the specified value
:cvar NOT_EQUALS: Label value must not equal the specified value
:cvar REGEX_MATCH: Label value must match the specified regular expression
:cvar REGEX_NOT_MATCH: Label value must not match the specified regular expression
"""
EQUALS = "="
NOT_EQUALS = "!="
REGEX_MATCH = "=~"
REGEX_NOT_MATCH = "!~"
class MetricLabelMatcher(BaseModel):
"""A matcher for filtering metrics by label values.
:param name: The name of the label to match
:param value: The value to match against
:param operator: The comparison operator to use for matching
"""
name: str
value: str
operator: MetricLabelOperator = MetricLabelOperator.EQUALS
@json_schema_type
class MetricLabel(BaseModel):
"""A label associated with a metric.
:param name: The name of the label
:param value: The value of the label
"""
name: str
value: str
@json_schema_type
class MetricDataPoint(BaseModel):
"""A single data point in a metric time series.
:param timestamp: Unix timestamp when the metric value was recorded
:param value: The numeric value of the metric at this timestamp
:param unit: The unit of measurement for the metric value
"""
timestamp: int
value: float
unit: str
@json_schema_type
class MetricSeries(BaseModel):
"""A time series of metric data points.
:param metric: The name of the metric
:param labels: List of labels associated with this metric series
:param values: List of data points in chronological order
"""
metric: str
labels: list[MetricLabel]
values: list[MetricDataPoint]
class QueryMetricsResponse(BaseModel):
"""Response containing metric time series data.
:param data: List of metric series matching the query criteria
"""
data: list[MetricSeries]
_GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
"active_spans": {},
@ -49,7 +433,7 @@ def is_tracing_enabled(tracer):
return span.is_recording()
class Telemetry(TelemetryBase):
class Telemetry:
def __init__(self) -> None:
self.meter = None
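
A hedged sketch of constructing events against the relocated datatypes; the import path follows this diff, but the values and surrounding wiring are assumptions:

```python
from datetime import UTC, datetime

# Import path per this diff; one event of each structured kind.
from llama_stack.core.telemetry.telemetry import (
    MetricEvent,
    SpanStartPayload,
    StructuredLogEvent,
)

metric = MetricEvent(
    trace_id="trace-1",
    span_id="span-1",
    timestamp=datetime.now(UTC),
    metric="prompt_tokens",
    value=42,
    unit="tokens",
)
span_start = StructuredLogEvent(
    trace_id="trace-1",
    span_id="span-2",
    timestamp=datetime.now(UTC),
    payload=SpanStartPayload(name="inference"),
)
```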

View file

@ -17,7 +17,8 @@ from datetime import UTC, datetime
from functools import wraps
from typing import Any, Self
from llama_stack.apis.telemetry import (
from llama_stack.core.telemetry.telemetry import (
ROOT_SPAN_MARKERS,
Event,
LogSeverity,
Span,
@ -47,7 +48,6 @@ if not _fallback_logger.handlers:
INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
# The logical root span may not be visible to this process if a parent context
# is passed in. The local root span is the first local span in a trace.
LOCAL_ROOT_SPAN_MARKER = "__local_root_span__"

View file

@ -488,13 +488,13 @@ class ChatAgent(ShieldRunnerMixin):
session_info = await self.storage.get_session_info(session_id)
# if the session has a memory bank id, let the memory tool use it
if session_info and session_info.vector_db_id:
if session_info and session_info.vector_store_id:
for tool_name in self.tool_name_to_args.keys():
if tool_name == MEMORY_QUERY_TOOL:
if "vector_db_ids" not in self.tool_name_to_args[tool_name]:
self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id]
if "vector_store_ids" not in self.tool_name_to_args[tool_name]:
self.tool_name_to_args[tool_name]["vector_store_ids"] = [session_info.vector_store_id]
else:
self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id)
self.tool_name_to_args[tool_name]["vector_store_ids"].append(session_info.vector_store_id)
output_attachments = []

View file

@ -22,7 +22,7 @@ log = get_logger(name=__name__, category="agents::meta_reference")
class AgentSessionInfo(Session):
# TODO: is this used anywhere?
vector_db_id: str | None = None
vector_store_id: str | None = None
started_at: datetime
owner: User | None = None
identifier: str | None = None
@ -93,12 +93,12 @@ class AgentPersistence:
return session_info
async def add_vector_db_to_session(self, session_id: str, vector_db_id: str):
async def add_vector_db_to_session(self, session_id: str, vector_store_id: str):
session_info = await self.get_session_if_accessible(session_id)
if session_info is None:
raise SessionNotFoundError(session_id)
session_info.vector_db_id = vector_db_id
session_info.vector_store_id = vector_store_id
await self.kvstore.set(
key=f"session:{self.agent_id}:{session_id}",
value=session_info.model_dump_json(),

View file

@ -119,7 +119,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
async def insert(
self,
documents: list[RAGDocument],
vector_db_id: str,
vector_store_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
if not documents:
@ -158,14 +158,14 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
try:
await self.vector_io_api.openai_attach_file_to_vector_store(
vector_store_id=vector_db_id,
vector_store_id=vector_store_id,
file_id=created_file.id,
attributes=doc.metadata,
chunking_strategy=chunking_strategy,
)
except Exception as e:
log.error(
f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
f"Failed to attach file {created_file.id} to vector store {vector_store_id} for document {doc.document_id}: {e}"
)
continue
@ -176,10 +176,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
async def query(
self,
content: InterleavedContent,
vector_db_ids: list[str],
vector_store_ids: list[str],
query_config: RAGQueryConfig | None = None,
) -> RAGQueryResult:
if not vector_db_ids:
if not vector_store_ids:
raise ValueError(
"No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
)
@ -192,7 +192,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
)
tasks = [
self.vector_io_api.query_chunks(
vector_db_id=vector_db_id,
vector_store_id=vector_store_id,
query=query,
params={
"mode": query_config.mode,
@ -201,18 +201,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
"ranker": query_config.ranker,
},
)
for vector_db_id in vector_db_ids
for vector_store_id in vector_store_ids
]
results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
chunks = []
scores = []
for vector_db_id, result in zip(vector_db_ids, results, strict=False):
for vector_store_id, result in zip(vector_store_ids, results, strict=False):
for chunk, score in zip(result.chunks, result.scores, strict=False):
if not hasattr(chunk, "metadata") or chunk.metadata is None:
chunk.metadata = {}
chunk.metadata["vector_db_id"] = vector_db_id
chunk.metadata["vector_store_id"] = vector_store_id
chunks.append(chunk)
scores.append(score)
@ -250,7 +250,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
metadata_keys_to_exclude_from_context = [
"token_count",
"metadata_token_count",
"vector_db_id",
"vector_store_id",
]
metadata_for_context = {}
for k in chunk_metadata_keys_to_include_from_context:
@ -275,7 +275,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
"document_ids": [c.document_id for c in chunks[: len(picked)]],
"chunks": [c.content for c in chunks[: len(picked)]],
"scores": scores[: len(picked)],
"vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
"vector_store_ids": [c.metadata["vector_store_id"] for c in chunks[: len(picked)]],
},
)
@ -309,7 +309,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
)
async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
vector_db_ids = kwargs.get("vector_db_ids", [])
vector_store_ids = kwargs.get("vector_store_ids", [])
query_config = kwargs.get("query_config")
if query_config:
query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
@ -319,7 +319,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
query = kwargs["query"]
result = await self.query(
content=query,
vector_db_ids=vector_db_ids,
vector_store_ids=vector_store_ids,
query_config=query_config,
)
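
Tool callers pick up the same kwarg rename. A minimal sketch; the tool_name string and the `tool_runtime` handle are assumptions, not shown in this diff:

```python
# Sketch: kwargs keys follow the rename; tool_name is an assumption.
result = await tool_runtime.invoke_tool(
    tool_name="knowledge_search",
    kwargs={
        "query": "quarterly revenue",
        "vector_store_ids": ["vs_123"],  # was vector_db_ids
    },
)
```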

View file

@ -248,19 +248,19 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoco
del self.cache[vector_store_id]
await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_store_id}")
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = self.cache.get(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = self.cache.get(vector_store_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found. found: {self.cache.keys()}")
raise ValueError(f"Vector DB {vector_store_id} not found. found: {self.cache.keys()}")
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = self.cache.get(vector_db_id)
index = self.cache.get(vector_store_id)
if index is None:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
return await index.query_chunks(query, params)

View file

@ -447,20 +447,20 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresPro
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
# The VectorStoreWithIndex helper is expected to compute embeddings via the inference_api
# and then call our index's add_chunks.
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:

View file

@ -163,14 +163,14 @@ The SQLite-vec provider supports three search modes:
Example with hybrid search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# Using RRF ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@@ -182,7 +182,7 @@ response = await vector_io.query_chunks(
# Using weighted ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={
"mode": "hybrid",
@@ -196,7 +196,7 @@ response = await vector_io.query_chunks(
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
@@ -205,7 +205,7 @@ response = await vector_io.query_chunks(
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
vector_store_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
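
Whichever mode is chosen, `query_chunks` returns a `QueryChunksResponse` pairing chunks with scores, as in the schema earlier in this change. A short sketch of consuming it:

```python
response = await vector_io.query_chunks(
    vector_store_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# chunks and scores are parallel lists
for chunk, score in zip(response.chunks, response.scores):
    print(f"{score:.3f}: {chunk.content}")
```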

View file

@@ -169,20 +169,20 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
return await index.query_chunks(query, params)

View file

@@ -348,19 +348,19 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:

View file

@@ -399,14 +399,14 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
assert self.kvstore is not None
await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
return await index.query_chunks(query, params)
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:

View file

@@ -222,19 +222,19 @@ class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
self.cache[vector_store_id] = index
return index
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
return await index.query_chunks(query, params)

View file

@@ -366,19 +366,19 @@ class WeaviateVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, NeedsRequestProv
self.cache[vector_store_id] = index
return index
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
index = await self._get_and_cache_vector_store_index(vector_store_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
raise VectorStoreNotFoundError(vector_store_id)
return await index.query_chunks(query, params)

View file

@@ -333,7 +333,7 @@ class OpenAIVectorStoreMixin(ABC):
@abstractmethod
async def insert_chunks(
self,
vector_db_id: str,
vector_store_id: str,
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
@@ -342,7 +342,7 @@ class OpenAIVectorStoreMixin(ABC):
@abstractmethod
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
"""Query chunks from a vector database (provider-specific implementation)."""
pass
@@ -609,7 +609,7 @@ class OpenAIVectorStoreMixin(ABC):
# TODO: Add support for ranking_options.ranker
response = await self.query_chunks(
vector_db_id=vector_store_id,
vector_store_id=vector_store_id,
query=search_query,
params=params,
)
@@ -803,7 +803,7 @@ class OpenAIVectorStoreMixin(ABC):
)
else:
await self.insert_chunks(
vector_db_id=vector_store_id,
vector_store_id=vector_store_id,
chunks=chunks,
)
vector_store_file_object.status = "completed"
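
The two abstract methods renamed above are the only provider hooks the mixin's OpenAI-compatible search and file-attach paths call into, as the two call sites show. A minimal sketch of a conforming adapter, mirroring the cache-lookup pattern used by the real adapters in this diff; the class name and in-memory cache are illustrative, and the mixin's other abstract hooks are omitted:

```python
from typing import Any

# Chunk, QueryChunksResponse, VectorStoreWithIndex, VectorStoreNotFoundError,
# and OpenAIVectorStoreMixin are the types used throughout this change;
# exact import paths are assumed.


class InMemoryVectorIOAdapter(OpenAIVectorStoreMixin):
    """Illustrative adapter; the real mixin defines further abstract hooks."""

    def __init__(self) -> None:
        self.cache: dict[str, VectorStoreWithIndex] = {}

    async def insert_chunks(
        self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None
    ) -> None:
        index = self.cache.get(vector_store_id)
        if index is None:
            raise VectorStoreNotFoundError(vector_store_id)
        await index.insert_chunks(chunks)

    async def query_chunks(
        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
        index = self.cache.get(vector_store_id)
        if index is None:
            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)
```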

View file

@@ -367,7 +367,7 @@ def test_openai_vector_store_with_chunks(
# Insert chunks using the native LlamaStack API (since OpenAI API doesn't have direct chunk insertion)
llama_client.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
@@ -434,7 +434,7 @@ def test_openai_vector_store_search_relevance(
# Insert chunks using native API
llama_client.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
@@ -484,7 +484,7 @@ def test_openai_vector_store_search_with_ranking_options(
# Insert chunks
llama_client.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
@@ -544,7 +544,7 @@ def test_openai_vector_store_search_with_high_score_filter(
# Insert chunks
llama_client.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
@@ -610,7 +610,7 @@ def test_openai_vector_store_search_with_max_num_results(
# Insert chunks
llama_client.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
@@ -1175,7 +1175,7 @@ def test_openai_vector_store_search_modes(
)
client_with_models.vector_io.insert(
vector_db_id=vector_store.id,
vector_store_id=vector_store.id,
chunks=sample_chunks,
)
query = "Python programming language"

View file

@@ -123,12 +123,12 @@ def test_insert_chunks(
actual_vector_store_id = create_response.id
client_with_empty_registry.vector_io.insert(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
chunks=sample_chunks,
)
response = client_with_empty_registry.vector_io.query(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
query="What is the capital of France?",
)
assert response is not None
@@ -137,7 +137,7 @@ def test_insert_chunks(
query, expected_doc_id = test_case
response = client_with_empty_registry.vector_io.query(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
query=query,
)
assert response is not None
@@ -174,13 +174,13 @@ def test_insert_chunks_with_precomputed_embeddings(
]
client_with_empty_registry.vector_io.insert(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
chunks=chunks_with_embeddings,
)
provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
response = client_with_empty_registry.vector_io.query(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
query="precomputed embedding test",
params=vector_io_provider_params_dict.get(provider, None),
)
@@ -224,13 +224,13 @@ def test_query_returns_valid_object_when_identical_to_embedding_in_vdb(
]
client_with_empty_registry.vector_io.insert(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
chunks=chunks_with_embeddings,
)
provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
response = client_with_empty_registry.vector_io.query(
vector_db_id=actual_vector_store_id,
vector_store_id=actual_vector_store_id,
query="duplicate",
params=vector_io_provider_params_dict.get(provider, None),
)

View file

@@ -23,14 +23,14 @@ class TestRagQuery:
config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
)
with pytest.raises(ValueError):
await rag_tool.query(content=MagicMock(), vector_db_ids=[])
await rag_tool.query(content=MagicMock(), vector_store_ids=[])
async def test_query_chunk_metadata_handling(self):
rag_tool = MemoryToolRuntimeImpl(
config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
)
content = "test query content"
vector_db_ids = ["db1"]
vector_store_ids = ["db1"]
chunk_metadata = ChunkMetadata(
document_id="doc1",
@@ -55,7 +55,7 @@ class TestRagQuery:
query_response = QueryChunksResponse(chunks=[chunk], scores=[1.0])
rag_tool.vector_io_api.query_chunks = AsyncMock(return_value=query_response)
result = await rag_tool.query(content=content, vector_db_ids=vector_db_ids)
result = await rag_tool.query(content=content, vector_store_ids=vector_store_ids)
assert result is not None
expected_metadata_string = (
@@ -90,7 +90,7 @@ class TestRagQuery:
files_api=MagicMock(),
)
vector_db_ids = ["db1", "db2"]
vector_store_ids = ["db1", "db2"]
# Fake chunks from each DB
chunk_metadata1 = ChunkMetadata(
@@ -101,7 +101,7 @@ class TestRagQuery:
)
chunk1 = Chunk(
content="chunk from db1",
metadata={"vector_db_id": "db1", "document_id": "doc1"},
metadata={"vector_store_id": "db1", "document_id": "doc1"},
stored_chunk_id="c1",
chunk_metadata=chunk_metadata1,
)
@@ -114,7 +114,7 @@ class TestRagQuery:
)
chunk2 = Chunk(
content="chunk from db2",
metadata={"vector_db_id": "db2", "document_id": "doc2"},
metadata={"vector_store_id": "db2", "document_id": "doc2"},
stored_chunk_id="c2",
chunk_metadata=chunk_metadata2,
)
@@ -126,13 +126,13 @@ class TestRagQuery:
]
)
result = await rag_tool.query(content="test", vector_db_ids=vector_db_ids)
result = await rag_tool.query(content="test", vector_store_ids=vector_store_ids)
returned_chunks = result.metadata["chunks"]
returned_scores = result.metadata["scores"]
returned_doc_ids = result.metadata["document_ids"]
returned_vector_db_ids = result.metadata["vector_db_ids"]
returned_vector_store_ids = result.metadata["vector_store_ids"]
assert returned_chunks == ["chunk from db1", "chunk from db2"]
assert returned_scores == (0.9, 0.8)
assert returned_doc_ids == ["doc1", "doc2"]
assert returned_vector_db_ids == ["db1", "db2"]
assert returned_vector_store_ids == ["db1", "db2"]
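
The assertions above pin down the metadata contract of `rag_tool.query` after the rename: parallel `chunks`, `scores`, `document_ids`, and `vector_store_ids` entries. In use that contract looks roughly like this (keys exactly as asserted in the test):

```python
result = await rag_tool.query(content="test", vector_store_ids=["db1", "db2"])
for doc_id, store_id, score in zip(
    result.metadata["document_ids"],
    result.metadata["vector_store_ids"],  # formerly "vector_db_ids"
    result.metadata["scores"],
):
    print(f"{doc_id} ({store_id}): {score}")
```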