From c21bb0e8376b41e9f32d898558106d02bc576363 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 3 Oct 2025 17:36:35 -0700 Subject: [PATCH] chore: fix setup_telemetry script (#3680) # What does this PR do? Added missing configuration files ## Test Plan run ./scripts/telemetry/setup_telemetry.sh ``` OTEL_SERVICE_NAME=llama_stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 TELEMETRY_SINKS=otel_trace,otel_metric uv run --with llama-stack llama stack build --distro=starter --image-type=venv --run ``` Navigate to grafana localhost:3000, query metrics and traces --- docs/docs/building_applications/telemetry.mdx | 20 +++++----- scripts/telemetry/grafana-datasources.yaml | 15 +++++++ scripts/telemetry/otel-collector-config.yaml | 40 +++++++++++++++++++ scripts/telemetry/prometheus.yml | 12 ++++++ scripts/{ => telemetry}/setup_telemetry.sh | 9 +++-- 5 files changed, 83 insertions(+), 13 deletions(-) create mode 100644 scripts/telemetry/grafana-datasources.yaml create mode 100644 scripts/telemetry/otel-collector-config.yaml create mode 100644 scripts/telemetry/prometheus.yml rename scripts/{ => telemetry}/setup_telemetry.sh (91%) diff --git a/docs/docs/building_applications/telemetry.mdx b/docs/docs/building_applications/telemetry.mdx index 655a2043b..d991d97a1 100644 --- a/docs/docs/building_applications/telemetry.mdx +++ b/docs/docs/building_applications/telemetry.mdx @@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables: - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string) - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`) -## Visualization with Jaeger +### Quick Setup: Complete Telemetry Stack -The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector. 
- -### Starting Jaeger - -Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686: +Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana): ```bash -docker run --pull always --rm --name jaeger \ - -p 16686:16686 -p 4318:4318 \ - jaegertracing/jaeger:2.1.0 +./scripts/telemetry/setup_telemetry.sh ``` -Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/). +This sets up: +- **Jaeger UI**: http://localhost:16686 (traces visualization) +- **Prometheus**: http://localhost:9090 (metrics) +- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources) +- **OTEL Collector**: http://localhost:4318 (OTLP endpoint) + +Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`. ## Querying Metrics diff --git a/scripts/telemetry/grafana-datasources.yaml b/scripts/telemetry/grafana-datasources.yaml new file mode 100644 index 000000000..d01fe04ce --- /dev/null +++ b/scripts/telemetry/grafana-datasources.yaml @@ -0,0 +1,15 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + + - name: Jaeger + type: jaeger + access: proxy + url: http://jaeger:16686 + editable: true diff --git a/scripts/telemetry/otel-collector-config.yaml b/scripts/telemetry/otel-collector-config.yaml new file mode 100644 index 000000000..ece1e162c --- /dev/null +++ b/scripts/telemetry/otel-collector-config.yaml @@ -0,0 +1,40 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +exporters: + # Export traces to Jaeger + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + + # Export metrics to Prometheus + prometheus: + endpoint: 0.0.0.0:9464 
+ namespace: llama_stack + + # Debug exporter for troubleshooting + debug: + verbosity: detailed + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/jaeger, debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus, debug] diff --git a/scripts/telemetry/prometheus.yml b/scripts/telemetry/prometheus.yml new file mode 100644 index 000000000..c064359ca --- /dev/null +++ b/scripts/telemetry/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9464'] diff --git a/scripts/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh similarity index 91% rename from scripts/setup_telemetry.sh rename to scripts/telemetry/setup_telemetry.sh index cf235ab9d..e0b57a354 100755 --- a/scripts/setup_telemetry.sh +++ b/scripts/telemetry/setup_telemetry.sh @@ -17,6 +17,7 @@ set -Eeuo pipefail CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker} +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "🚀 Setting up telemetry stack for Llama Stack using Podman..." @@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \ -p 4317:4317 \ -p 9464:9464 \ -p 13133:13133 \ - -v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \ + -v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \ docker.io/otel/opentelemetry-collector-contrib:latest \ --config /etc/otel-collector-config.yaml @@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..." 
$CONTAINER_RUNTIME run -d --name prometheus \ --network llama-telemetry \ -p 9090:9090 \ - -v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \ + -v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \ docker.io/prom/prometheus:latest \ --config.file=/etc/prometheus/prometheus.yml \ --storage.tsdb.path=/prometheus \ @@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \ --web.enable-lifecycle # Start Grafana +# Note: Using 11.0.0 because grafana:latest arm64 image has a broken /run.sh (0 bytes) echo "📊 Starting Grafana..." $CONTAINER_RUNTIME run -d --name grafana \ --network llama-telemetry \ -p 3000:3000 \ -e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_USERS_ALLOW_SIGN_UP=false \ - docker.io/grafana/grafana:latest + -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \ + docker.io/grafana/grafana:11.0.0 # Wait for services to start echo "⏳ Waiting for services to start..."