Merge bfe06ae00f into sapling-pr-archive-ehhuang

2025-12-04 18:13:44 +00:00 · 2025-10-27 12:01:12 -07:00 · 2025-10-27 12:01:12 -07:00 · 0b9736b6c1
commit 0b9736b6c1
parent bf3a821247 bfe06ae00f
32 changed files with 793 additions and 12 deletions
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -91,6 +91,9 @@ jobs:
              conversations:
                table_name: openai_conversations
                backend: sql_default
+              prompts:
+                namespace: prompts
+                backend: kv_default
          server:
            port: 8321
          EOF
--- a/benchmarking/k8s-benchmark/stack-configmap.yaml
+++ b/benchmarking/k8s-benchmark/stack-configmap.yaml
@ -107,13 +107,21 @@ data:
          db: ${env.POSTGRES_DB:=llamastack}
          user: ${env.POSTGRES_USER:=llamastack}
          password: ${env.POSTGRES_PASSWORD:=llamastack}
-      references:
+      stores:
        metadata:
          backend: kv_default
          namespace: registry
        inference:
          backend: sql_default
          table_name: inference_store
+          max_write_queue_size: 10000
+          num_writers: 4
+        conversations:
+          backend: sql_default
+          table_name: openai_conversations
+        prompts:
+          backend: kv_default
+          namespace: prompts
    models:
    - metadata:
        embedding_dimension: 768
--- a/benchmarking/k8s-benchmark/stack_run_config.yaml
+++ b/benchmarking/k8s-benchmark/stack_run_config.yaml
@ -100,6 +100,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata:
--- a/docs/docs/distributions/configuration.mdx
+++ b/docs/docs/distributions/configuration.mdx
@ -58,13 +58,21 @@ storage:
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
-  references:
+  stores:
    metadata:
      backend: kv_default
      namespace: registry
    inference:
      backend: sql_default
      table_name: inference_store
+      max_write_queue_size: 10000
+      num_writers: 4
+    conversations:
+      backend: sql_default
+      table_name: openai_conversations
+    prompts:
+      backend: kv_default
+      namespace: prompts
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
--- a/docs/docs/distributions/k8s/stack-configmap.yaml
+++ b/docs/docs/distributions/k8s/stack-configmap.yaml
@ -113,13 +113,21 @@ data:
          db: ${env.POSTGRES_DB:=llamastack}
          user: ${env.POSTGRES_USER:=llamastack}
          password: ${env.POSTGRES_PASSWORD:=llamastack}
-      references:
+      stores:
        metadata:
          backend: kv_default
          namespace: registry
        inference:
          backend: sql_default
          table_name: inference_store
+          max_write_queue_size: 10000
+          num_writers: 4
+        conversations:
+          backend: sql_default
+          table_name: openai_conversations
+        prompts:
+          backend: kv_default
+          namespace: prompts
    models:
    - metadata:
        embedding_dimension: 768
--- a/docs/docs/distributions/k8s/stack_run_config.yaml
+++ b/docs/docs/distributions/k8s/stack_run_config.yaml
@ -106,6 +106,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata:
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@ -589,6 +589,7 @@ can be instantiated multiple times (with different configs) if necessary.
        _ensure_backend(stores.inference, sql_backends, "storage.stores.inference")
        _ensure_backend(stores.conversations, sql_backends, "storage.stores.conversations")
        _ensure_backend(stores.responses, sql_backends, "storage.stores.responses")
+        _ensure_backend(stores.prompts, kv_backends, "storage.stores.prompts")
        return self


--- a/llama_stack/core/prompts/prompts.py
+++ b/llama_stack/core/prompts/prompts.py
@ -11,7 +11,6 @@ from pydantic import BaseModel

 from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
 from llama_stack.core.datatypes import StackRunConfig
-from llama_stack.core.storage.datatypes import KVStoreReference
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl


@ -40,11 +39,10 @@ class PromptServiceImpl(Prompts):
        self.kvstore: KVStore

    async def initialize(self) -> None:
-        # Use metadata store backend with prompts-specific namespace
-        metadata_ref = self.config.run_config.storage.stores.metadata
-        if not metadata_ref:
-            raise ValueError("storage.stores.metadata must be configured in run config")
-        prompts_ref = KVStoreReference(namespace="prompts", backend=metadata_ref.backend)
+        # Use prompts store reference from run config
+        prompts_ref = self.config.run_config.storage.stores.prompts
+        if not prompts_ref:
+            raise ValueError("storage.stores.prompts must be configured in run config")
        self.kvstore = await kvstore_impl(prompts_ref)

    def _get_default_key(self, prompt_id: str) -> str:
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@ -563,6 +563,7 @@ def run_config_from_adhoc_config_spec(
                metadata=KVStoreReference(backend="kv_default", namespace="registry"),
                inference=InferenceStoreReference(backend="sql_default", table_name="inference_store"),
                conversations=SqlStoreReference(backend="sql_default", table_name="openai_conversations"),
+                prompts=KVStoreReference(backend="kv_default", namespace="prompts"),
            ),
        ),
    )
--- a/llama_stack/core/storage/datatypes.py
+++ b/llama_stack/core/storage/datatypes.py
@ -271,6 +271,10 @@ class ServerStoresConfig(BaseModel):
        default=None,
        description="Responses store configuration (uses SQL backend)",
    )
+    prompts: KVStoreReference | None = Field(
+        default=None,
+        description="Prompts store configuration (uses KV backend)",
+    )


 class StorageConfig(BaseModel):
--- a/llama_stack/distributions/ci-tests/run.yaml
+++ b/llama_stack/distributions/ci-tests/run.yaml
@ -247,6 +247,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models: []
  shields:
--- a/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/llama_stack/distributions/dell/run-with-safety.yaml
@ -109,6 +109,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/dell/run.yaml
+++ b/llama_stack/distributions/dell/run.yaml
@ -105,6 +105,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@ -122,6 +122,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/meta-reference-gpu/run.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run.yaml
@ -112,6 +112,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/llama_stack/distributions/nvidia/run-with-safety.yaml
@ -111,6 +111,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@ -100,6 +100,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models: []
  shields: []
--- a/llama_stack/distributions/open-benchmark/run.yaml
+++ b/llama_stack/distributions/open-benchmark/run.yaml
@ -142,6 +142,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/postgres-demo/run.yaml
+++ b/llama_stack/distributions/postgres-demo/run.yaml
@ -87,6 +87,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models:
  - metadata: {}
--- a/llama_stack/distributions/starter-gpu/run.yaml
+++ b/llama_stack/distributions/starter-gpu/run.yaml
@ -250,6 +250,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models: []
  shields:
--- a/llama_stack/distributions/starter/run.yaml
+++ b/llama_stack/distributions/starter/run.yaml
@ -247,6 +247,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models: []
  shields:
--- a/llama_stack/distributions/template.py
+++ b/llama_stack/distributions/template.py
@ -259,6 +259,10 @@ class RunConfigSettings(BaseModel):
                backend="sql_default",
                table_name="openai_conversations",
            ).model_dump(exclude_none=True),
+            "prompts": KVStoreReference(
+                backend="kv_default",
+                namespace="prompts",
+            ).model_dump(exclude_none=True),
        }

        storage_config = dict(
--- a/llama_stack/distributions/watsonx/run.yaml
+++ b/llama_stack/distributions/watsonx/run.yaml
@ -115,6 +115,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 registered_resources:
  models: []
  shields: []
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -30,8 +30,10 @@ materialize_telemetry_configs() {
  local otel_cfg="${dest}/otel-collector-config.yaml"
  local prom_cfg="${dest}/prometheus.yml"
  local graf_cfg="${dest}/grafana-datasources.yaml"
+  local graf_dash_cfg="${dest}/grafana-dashboards.yaml"
+  local dash_json="${dest}/llama-stack-dashboard.json"

-  for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do
+  for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do
    if [ -e "$asset" ]; then
      die "Telemetry asset ${asset} already exists; refusing to overwrite"
    fi
@ -103,6 +105,7 @@ datasources:
    type: prometheus
    access: proxy
    url: http://prometheus:9090
+    uid: prometheus
    isDefault: true
    editable: true

@ -112,6 +115,224 @@ datasources:
    url: http://jaeger:16686
    editable: true
 EOF
+
+  cat <<'EOF' > "$graf_dash_cfg"
+apiVersion: 1
+
+providers:
+  - name: 'Llama Stack'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards
+EOF
+
+  # Copy the dashboard JSON inline to avoid line-length issues
+  cat > "$dash_json" <<'DASHBOARD_JSON'
+{
+  "annotations": {
+    "list": []
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "green", "value": null}]
+          }
+        }
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+      "id": 1,
+      "options": {
+        "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
+        "tooltip": {"mode": "multi", "sort": "none"}
+      },
+      "targets": [
+        {
+          "datasource": {"type": "prometheus", "uid": "prometheus"},
+          "expr": "llama_stack_completion_tokens_total",
+          "legendFormat": "{{model_id}} ({{provider_id}})",
+          "refId": "A"
+        }
+      ],
+      "title": "Completion Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
+        }
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+      "id": 2,
+      "options": {
+        "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
+        "tooltip": {"mode": "multi", "sort": "none"}
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
+      ],
+      "title": "Prompt & Total Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "ms"
+        }
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+      "id": 3,
+      "options": {
+        "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
+        "tooltip": {"mode": "multi", "sort": "none"}
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
+      ],
+      "title": "HTTP Request Duration (p95, p99)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
+        }
+      },
+      "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+        "textMode": "auto"
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
+      ],
+      "title": "Total Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
+        }
+      },
+      "gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
+      "id": 5,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+        "textMode": "auto"
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
+      ],
+      "title": "Active Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "reqps"
+        }
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+      "id": 6,
+      "options": {
+        "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
+        "tooltip": {"mode": "multi", "sort": "none"}
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
+      ],
+      "title": "Request Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "fieldConfig": {
+        "defaults": {
+          "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "Bps"
+        }
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+      "id": 7,
+      "options": {
+        "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
+        "tooltip": {"mode": "multi", "sort": "none"}
+      },
+      "targets": [
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
+        {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
+      ],
+      "title": "Request/Response Sizes",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "tags": ["llama-stack"],
+  "templating": {"list": []},
+  "time": {"from": "now-15m", "to": "now"},
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Llama Stack Metrics",
+  "uid": "llama-stack-metrics",
+  "version": 0,
+  "weekStart": ""
+}
+DASHBOARD_JSON
 }

 # Cleanup function to remove temporary files
@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
    -e GF_SECURITY_ADMIN_PASSWORD=admin \
    -e GF_USERS_ALLOW_SIGN_UP=false \
    -v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
+    -v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
+    -v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
    docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then
    die "Grafana startup failed"
  fi
--- a/scripts/telemetry/grafana-dashboards.yaml
+++ b/scripts/telemetry/grafana-dashboards.yaml
@ -0,0 +1,13 @@
+apiVersion: 1
+
+providers:
+  - name: 'Llama Stack'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards
+
--- a/scripts/telemetry/grafana-datasources.yaml
+++ b/scripts/telemetry/grafana-datasources.yaml
@ -5,6 +5,7 @@ datasources:
    type: prometheus
    access: proxy
    url: http://prometheus:9090
+    uid: prometheus
    isDefault: true
    editable: true

--- a/scripts/telemetry/llama-stack-dashboard.json
+++ b/scripts/telemetry/llama-stack-dashboard.json
@ -0,0 +1,457 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 1,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "llama_stack_completion_tokens_total",
+          "legendFormat": "{{model_id}} ({{provider_id}})",
+          "refId": "A"
+        }
+      ],
+      "title": "Completion Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "llama_stack_prompt_tokens_total",
+          "legendFormat": "Prompt - {{model_id}}",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "llama_stack_tokens_total",
+          "legendFormat": "Total - {{model_id}}",
+          "refId": "B"
+        }
+      ],
+      "title": "Prompt & Total Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ms"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 3,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
+          "legendFormat": "p99",
+          "refId": "B"
+        }
+      ],
+      "title": "HTTP Request Duration (p95, p99)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 12,
+        "y": 8
+      },
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "sum(llama_stack_http_server_duration_milliseconds_count)",
+          "refId": "A"
+        }
+      ],
+      "title": "Total Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 18,
+        "y": 8
+      },
+      "id": 5,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "sum(llama_stack_http_server_active_requests)",
+          "refId": "A"
+        }
+      ],
+      "title": "Active Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "reqps"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])",
+          "legendFormat": "{{http_target}} - {{http_status_code}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Request Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "showPoints": "auto",
+            "fillOpacity": 10
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "Bps"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])",
+          "legendFormat": "Request",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])",
+          "legendFormat": "Response",
+          "refId": "B"
+        }
+      ],
+      "title": "Request/Response Sizes",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "tags": [
+    "llama-stack"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-15m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Llama Stack Metrics",
+  "uid": "llama-stack-metrics",
+  "version": 0,
+  "weekStart": ""
+}
--- a/scripts/telemetry/setup_telemetry.sh
+++ b/scripts/telemetry/setup_telemetry.sh
@ -135,6 +135,8 @@ $CONTAINER_RUNTIME run -d --name grafana \
  -e GF_SECURITY_ADMIN_PASSWORD=admin \
  -e GF_USERS_ALLOW_SIGN_UP=false \
  -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
+  -v "$SCRIPT_DIR/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
+  -v "$SCRIPT_DIR/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
  docker.io/grafana/grafana:11.0.0

 # Wait for services to start
--- a/tests/external/run-byoa.yaml
+++ b/tests/external/run-byoa.yaml
@ -25,6 +25,9 @@ storage:
    conversations:
      table_name: openai_conversations
      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
 external_apis_dir: ~/.llama/apis.d
 external_providers_dir: ~/.llama/providers.d
 server:
--- a/tests/unit/cli/test_stack_config.py
+++ b/tests/unit/cli/test_stack_config.py
@ -44,6 +44,9 @@ def config_with_image_name_int():
            responses:
              backend: sql_default
              table_name: responses
+            prompts:
+              backend: kv_default
+              namespace: prompts
        providers:
          inference:
            - provider_id: provider1
--- a/tests/unit/distribution/test_distribution.py
+++ b/tests/unit/distribution/test_distribution.py
@ -48,6 +48,7 @@ def _default_storage() -> StorageConfig:
            metadata=KVStoreReference(backend="kv_default", namespace="registry"),
            inference=InferenceStoreReference(backend="sql_default", table_name="inference_store"),
            conversations=SqlStoreReference(backend="sql_default", table_name="conversations"),
+            prompts=KVStoreReference(backend="kv_default", namespace="prompts"),
        ),
    )

--- a/tests/unit/prompts/prompts/conftest.py
+++ b/tests/unit/prompts/prompts/conftest.py
@ -18,7 +18,7 @@ from llama_stack.core.storage.datatypes import (
    SqlStoreReference,
    StorageConfig,
 )
-from llama_stack.providers.utils.kvstore import kvstore_impl, register_kvstore_backends
+from llama_stack.providers.utils.kvstore import register_kvstore_backends


@pytest.fixture
@ -38,6 +38,7 @@ async def temp_prompt_store(tmp_path_factory):
            metadata=KVStoreReference(backend="kv_test", namespace="registry"),
            inference=InferenceStoreReference(backend="sql_test", table_name="inference"),
            conversations=SqlStoreReference(backend="sql_test", table_name="conversations"),
+            prompts=KVStoreReference(backend="kv_test", namespace="prompts"),
        ),
    )
    mock_run_config = StackRunConfig(
@ -50,6 +51,6 @@ async def temp_prompt_store(tmp_path_factory):
    store = PromptServiceImpl(config, deps={})

    register_kvstore_backends({"kv_test": storage.backends["kv_test"]})
-    store.kvstore = await kvstore_impl(KVStoreReference(backend="kv_test", namespace="prompts"))
+    await store.initialize()

    yield store