From c21bb0e8376b41e9f32d898558106d02bc576363 Mon Sep 17 00:00:00 2001
From: ehhuang <ehhuang@users.noreply.github.com>
Date: Fri, 3 Oct 2025 17:36:35 -0700
Subject: [PATCH] chore: fix setup_telemetry script (#3680)

# What does this PR do?
Added missing configuration files

## Test Plan
Run `./scripts/telemetry/setup_telemetry.sh`, then start Llama Stack with the OTEL sinks enabled:
```
OTEL_SERVICE_NAME=llama_stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 TELEMETRY_SINKS=otel_trace,otel_metric uv run --with llama-stack llama stack build --distro=starter --image-type=venv --run
```
Navigate to Grafana at http://localhost:3000 and query metrics and traces.
---
 docs/docs/building_applications/telemetry.mdx | 20 +++++-----
 scripts/telemetry/grafana-datasources.yaml    | 15 +++++++
 scripts/telemetry/otel-collector-config.yaml  | 40 +++++++++++++++++++
 scripts/telemetry/prometheus.yml              | 12 ++++++
 scripts/{ => telemetry}/setup_telemetry.sh    |  9 +++--
 5 files changed, 83 insertions(+), 13 deletions(-)
 create mode 100644 scripts/telemetry/grafana-datasources.yaml
 create mode 100644 scripts/telemetry/otel-collector-config.yaml
 create mode 100644 scripts/telemetry/prometheus.yml
 rename scripts/{ => telemetry}/setup_telemetry.sh (91%)

diff --git a/docs/docs/building_applications/telemetry.mdx b/docs/docs/building_applications/telemetry.mdx
index 655a2043b..d991d97a1 100644
--- a/docs/docs/building_applications/telemetry.mdx
+++ b/docs/docs/building_applications/telemetry.mdx
@@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
 - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
 - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
 
-## Visualization with Jaeger
+### Quick Setup: Complete Telemetry Stack
 
-The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
-
-### Starting Jaeger
-
-Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
+Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
 
 ```bash
-docker run --pull always --rm --name jaeger \
-  -p 16686:16686 -p 4318:4318 \
-  jaegertracing/jaeger:2.1.0
+./scripts/telemetry/setup_telemetry.sh
 ```
 
-Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
+This sets up:
+- **Jaeger UI**: http://localhost:16686 (traces visualization)
+- **Prometheus**: http://localhost:9090 (metrics)
+- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
+- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
+
+Once running, you can visualize traces and metrics by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
 
 ## Querying Metrics
 
diff --git a/scripts/telemetry/grafana-datasources.yaml b/scripts/telemetry/grafana-datasources.yaml
new file mode 100644
index 000000000..d01fe04ce
--- /dev/null
+++ b/scripts/telemetry/grafana-datasources.yaml
@@ -0,0 +1,15 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+
+  - name: Jaeger
+    type: jaeger
+    access: proxy
+    url: http://jaeger:16686
+    editable: true
diff --git a/scripts/telemetry/otel-collector-config.yaml b/scripts/telemetry/otel-collector-config.yaml
new file mode 100644
index 000000000..ece1e162c
--- /dev/null
+++ b/scripts/telemetry/otel-collector-config.yaml
@@ -0,0 +1,40 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 1s
+    send_batch_size: 1024
+
+exporters:
+  # Export traces to Jaeger
+  otlp/jaeger:
+    endpoint: jaeger:4317
+    tls:
+      insecure: true
+
+  # Export metrics to Prometheus
+  prometheus:
+    endpoint: 0.0.0.0:9464
+    namespace: llama_stack
+
+  # Debug exporter for troubleshooting
+  debug:
+    verbosity: detailed
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp/jaeger, debug]
+
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [prometheus, debug]
diff --git a/scripts/telemetry/prometheus.yml b/scripts/telemetry/prometheus.yml
new file mode 100644
index 000000000..c064359ca
--- /dev/null
+++ b/scripts/telemetry/prometheus.yml
@@ -0,0 +1,12 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'otel-collector'
+    static_configs:
+      - targets: ['otel-collector:9464']
diff --git a/scripts/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh
similarity index 91%
rename from scripts/setup_telemetry.sh
rename to scripts/telemetry/setup_telemetry.sh
index cf235ab9d..e0b57a354 100755
--- a/scripts/setup_telemetry.sh
+++ b/scripts/telemetry/setup_telemetry.sh
@@ -17,6 +17,7 @@
 set -Eeuo pipefail
 
 CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
 
@@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \
   -p 4317:4317 \
   -p 9464:9464 \
   -p 13133:13133 \
-  -v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \
+  -v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
   docker.io/otel/opentelemetry-collector-contrib:latest \
   --config /etc/otel-collector-config.yaml
 
@@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..."
 $CONTAINER_RUNTIME run -d --name prometheus \
   --network llama-telemetry \
   -p 9090:9090 \
-  -v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \
+  -v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
   docker.io/prom/prometheus:latest \
   --config.file=/etc/prometheus/prometheus.yml \
   --storage.tsdb.path=/prometheus \
@@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \
   --web.enable-lifecycle
 
 # Start Grafana
+# Note: Using 11.0.0 because grafana:latest arm64 image has a broken /run.sh (0 bytes)
 echo "📊 Starting Grafana..."
 $CONTAINER_RUNTIME run -d --name grafana \
   --network llama-telemetry \
   -p 3000:3000 \
   -e GF_SECURITY_ADMIN_PASSWORD=admin \
   -e GF_USERS_ALLOW_SIGN_UP=false \
-  docker.io/grafana/grafana:latest
+  -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
+  docker.io/grafana/grafana:11.0.0
 
 # Wait for services to start
 echo "⏳ Waiting for services to start..."