mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
chore: fix setup_telemetry script (#3680)
# What does this PR do?

Added missing configuration files.

## Test Plan

Run `./scripts/telemetry/setup_telemetry.sh`

```
OTEL_SERVICE_NAME=llama_stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 TELEMETRY_SINKS=otel_trace,otel_metric uv run --with llama-stack llama stack build --distro=starter --image-type=venv --run
```

Navigate to Grafana at localhost:3000, then query metrics and traces.
This commit is contained in:
parent
3f36bfaeaa
commit
c21bb0e837
5 changed files with 83 additions and 13 deletions
|
@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
|
||||||
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
|
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
|
||||||
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
|
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
|
||||||
|
|
||||||
## Visualization with Jaeger
|
### Quick Setup: Complete Telemetry Stack
|
||||||
|
|
||||||
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
|
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
|
||||||
|
|
||||||
### Starting Jaeger
|
|
||||||
|
|
||||||
Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run --pull always --rm --name jaeger \
|
./scripts/telemetry/setup_telemetry.sh
|
||||||
-p 16686:16686 -p 4318:4318 \
|
|
||||||
jaegertracing/jaeger:2.1.0
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
|
This sets up:
|
||||||
|
- **Jaeger UI**: http://localhost:16686 (traces visualization)
|
||||||
|
- **Prometheus**: http://localhost:9090 (metrics)
|
||||||
|
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
|
||||||
|
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
|
||||||
|
|
||||||
|
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
|
||||||
|
|
||||||
## Querying Metrics
|
## Querying Metrics
|
||||||
|
|
||||||
|
|
15
scripts/telemetry/grafana-datasources.yaml
Normal file
15
scripts/telemetry/grafana-datasources.yaml
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
datasources:
|
||||||
|
- name: Prometheus
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: http://prometheus:9090
|
||||||
|
isDefault: true
|
||||||
|
editable: true
|
||||||
|
|
||||||
|
- name: Jaeger
|
||||||
|
type: jaeger
|
||||||
|
access: proxy
|
||||||
|
url: http://jaeger:16686
|
||||||
|
editable: true
|
40
scripts/telemetry/otel-collector-config.yaml
Normal file
40
scripts/telemetry/otel-collector-config.yaml
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: 0.0.0.0:4317
|
||||||
|
http:
|
||||||
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
processors:
|
||||||
|
batch:
|
||||||
|
timeout: 1s
|
||||||
|
send_batch_size: 1024
|
||||||
|
|
||||||
|
exporters:
|
||||||
|
# Export traces to Jaeger
|
||||||
|
otlp/jaeger:
|
||||||
|
endpoint: jaeger:4317
|
||||||
|
tls:
|
||||||
|
insecure: true
|
||||||
|
|
||||||
|
# Export metrics to Prometheus
|
||||||
|
prometheus:
|
||||||
|
endpoint: 0.0.0.0:9464
|
||||||
|
namespace: llama_stack
|
||||||
|
|
||||||
|
# Debug exporter for troubleshooting
|
||||||
|
debug:
|
||||||
|
verbosity: detailed
|
||||||
|
|
||||||
|
service:
|
||||||
|
pipelines:
|
||||||
|
traces:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [batch]
|
||||||
|
exporters: [otlp/jaeger, debug]
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [batch]
|
||||||
|
exporters: [prometheus, debug]
|
12
scripts/telemetry/prometheus.yml
Normal file
12
scripts/telemetry/prometheus.yml
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: 'prometheus'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['localhost:9090']
|
||||||
|
|
||||||
|
- job_name: 'otel-collector'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['otel-collector:9464']
|
|
@ -17,6 +17,7 @@
|
||||||
set -Eeuo pipefail
|
set -Eeuo pipefail
|
||||||
|
|
||||||
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
|
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
|
echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
|
||||||
|
|
||||||
|
@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \
|
||||||
-p 4317:4317 \
|
-p 4317:4317 \
|
||||||
-p 9464:9464 \
|
-p 9464:9464 \
|
||||||
-p 13133:13133 \
|
-p 13133:13133 \
|
||||||
-v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \
|
-v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
|
||||||
docker.io/otel/opentelemetry-collector-contrib:latest \
|
docker.io/otel/opentelemetry-collector-contrib:latest \
|
||||||
--config /etc/otel-collector-config.yaml
|
--config /etc/otel-collector-config.yaml
|
||||||
|
|
||||||
|
@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..."
|
||||||
$CONTAINER_RUNTIME run -d --name prometheus \
|
$CONTAINER_RUNTIME run -d --name prometheus \
|
||||||
--network llama-telemetry \
|
--network llama-telemetry \
|
||||||
-p 9090:9090 \
|
-p 9090:9090 \
|
||||||
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \
|
-v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
|
||||||
docker.io/prom/prometheus:latest \
|
docker.io/prom/prometheus:latest \
|
||||||
--config.file=/etc/prometheus/prometheus.yml \
|
--config.file=/etc/prometheus/prometheus.yml \
|
||||||
--storage.tsdb.path=/prometheus \
|
--storage.tsdb.path=/prometheus \
|
||||||
|
@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \
|
||||||
--web.enable-lifecycle
|
--web.enable-lifecycle
|
||||||
|
|
||||||
# Start Grafana
|
# Start Grafana
|
||||||
|
# Note: Using 11.0.0 because grafana:latest arm64 image has a broken /run.sh (0 bytes)
|
||||||
echo "📊 Starting Grafana..."
|
echo "📊 Starting Grafana..."
|
||||||
$CONTAINER_RUNTIME run -d --name grafana \
|
$CONTAINER_RUNTIME run -d --name grafana \
|
||||||
--network llama-telemetry \
|
--network llama-telemetry \
|
||||||
-p 3000:3000 \
|
-p 3000:3000 \
|
||||||
-e GF_SECURITY_ADMIN_PASSWORD=admin \
|
-e GF_SECURITY_ADMIN_PASSWORD=admin \
|
||||||
-e GF_USERS_ALLOW_SIGN_UP=false \
|
-e GF_USERS_ALLOW_SIGN_UP=false \
|
||||||
docker.io/grafana/grafana:latest
|
-v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
|
||||||
|
docker.io/grafana/grafana:11.0.0
|
||||||
|
|
||||||
# Wait for services to start
|
# Wait for services to start
|
||||||
echo "⏳ Waiting for services to start..."
|
echo "⏳ Waiting for services to start..."
|
Loading…
Add table
Add a link
Reference in a new issue