mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
chore: fix setup_telemetry script (#3680)
# What does this PR do? Added missing configuration files ## Test Plan run ./scripts/telemetry/setup_telemetry.sh ``` OTEL_SERVICE_NAME=llama_stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 TELEMETRY_SINKS=otel_trace,otel_metric uv run --with llama-stack llama stack build --distro=starter --image-type=venv --run ``` Navigate to grafana localhost:3000, query metrics and traces
This commit is contained in:
parent
3f36bfaeaa
commit
c21bb0e837
5 changed files with 83 additions and 13 deletions
|
@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
|
|||
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
|
||||
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
|
||||
|
||||
## Visualization with Jaeger
|
||||
### Quick Setup: Complete Telemetry Stack
|
||||
|
||||
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
|
||||
|
||||
### Starting Jaeger
|
||||
|
||||
Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
|
||||
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
|
||||
|
||||
```bash
|
||||
docker run --pull always --rm --name jaeger \
|
||||
-p 16686:16686 -p 4318:4318 \
|
||||
jaegertracing/jaeger:2.1.0
|
||||
./scripts/telemetry/setup_telemetry.sh
|
||||
```
|
||||
|
||||
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
|
||||
This sets up:
|
||||
- **Jaeger UI**: http://localhost:16686 (traces visualization)
|
||||
- **Prometheus**: http://localhost:9090 (metrics)
|
||||
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
|
||||
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
|
||||
|
||||
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
|
||||
|
||||
## Querying Metrics
|
||||
|
||||
|
|
15
scripts/telemetry/grafana-datasources.yaml
Normal file
15
scripts/telemetry/grafana-datasources.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
|
||||
- name: Jaeger
|
||||
type: jaeger
|
||||
access: proxy
|
||||
url: http://jaeger:16686
|
||||
editable: true
|
40
scripts/telemetry/otel-collector-config.yaml
Normal file
40
scripts/telemetry/otel-collector-config.yaml
Normal file
|
@ -0,0 +1,40 @@
|
|||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
|
||||
exporters:
|
||||
# Export traces to Jaeger
|
||||
otlp/jaeger:
|
||||
endpoint: jaeger:4317
|
||||
tls:
|
||||
insecure: true
|
||||
|
||||
# Export metrics to Prometheus
|
||||
prometheus:
|
||||
endpoint: 0.0.0.0:9464
|
||||
namespace: llama_stack
|
||||
|
||||
# Debug exporter for troubleshooting
|
||||
debug:
|
||||
verbosity: detailed
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [otlp/jaeger, debug]
|
||||
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [prometheus, debug]
|
12
scripts/telemetry/prometheus.yml
Normal file
12
scripts/telemetry/prometheus.yml
Normal file
|
@ -0,0 +1,12 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'otel-collector'
|
||||
static_configs:
|
||||
- targets: ['otel-collector:9464']
|
|
@ -17,6 +17,7 @@
|
|||
set -Eeuo pipefail
|
||||
|
||||
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
|
||||
|
||||
|
@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \
|
|||
-p 4317:4317 \
|
||||
-p 9464:9464 \
|
||||
-p 13133:13133 \
|
||||
-v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \
|
||||
-v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
|
||||
docker.io/otel/opentelemetry-collector-contrib:latest \
|
||||
--config /etc/otel-collector-config.yaml
|
||||
|
||||
|
@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..."
|
|||
$CONTAINER_RUNTIME run -d --name prometheus \
|
||||
--network llama-telemetry \
|
||||
-p 9090:9090 \
|
||||
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \
|
||||
-v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
|
||||
docker.io/prom/prometheus:latest \
|
||||
--config.file=/etc/prometheus/prometheus.yml \
|
||||
--storage.tsdb.path=/prometheus \
|
||||
|
@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \
|
|||
--web.enable-lifecycle
|
||||
|
||||
# Start Grafana
|
||||
# Note: Pinned to 11.0.0 because the grafana:latest arm64 image ships a broken (0-byte) /run.sh
|
||||
echo "📊 Starting Grafana..."
|
||||
$CONTAINER_RUNTIME run -d --name grafana \
|
||||
--network llama-telemetry \
|
||||
-p 3000:3000 \
|
||||
-e GF_SECURITY_ADMIN_PASSWORD=admin \
|
||||
-e GF_USERS_ALLOW_SIGN_UP=false \
|
||||
docker.io/grafana/grafana:latest
|
||||
-v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
|
||||
docker.io/grafana/grafana:11.0.0
|
||||
|
||||
# Wait for services to start
|
||||
echo "⏳ Waiting for services to start..."
|
Loading…
Add table
Add a link
Reference in a new issue