mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-06 14:00:42 +00:00
Some checks failed
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 4s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 8s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 14s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 15s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.12, agents) (push) Failing after 14s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 17s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 20s
Integration Tests / test-matrix (server, 3.12, datasets) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 16s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 17s
Integration Tests / test-matrix (server, 3.12, inference) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 20s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 20s
Integration Tests / test-matrix (server, 3.12, inspect) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 18s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 17s
Integration Tests / test-matrix (server, 3.12, post_training) (push) Failing after 10s
Integration Tests / test-matrix (server, 3.12, providers) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 10s
Integration Tests / test-matrix (server, 3.12, scoring) (push) Failing after 9s
Integration Tests / test-matrix (server, 3.12, vector_io) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, datasets) (push) Failing after 7s
Integration Tests / test-matrix (server, 3.13, inspect) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.13, providers) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, tool_runtime) (push) Failing after 5s
Integration Tests / test-matrix (server, 3.13, vector_io) (push) Failing after 5s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 4s
Integration Tests / test-matrix (server, 3.12, tool_runtime) (push) Failing after 18s
Integration Tests / test-matrix (server, 3.13, agents) (push) Failing after 19s
Integration Tests / test-matrix (server, 3.13, post_training) (push) Failing after 16s
Integration Tests / test-matrix (server, 3.13, inference) (push) Failing after 18s
Integration Tests / test-matrix (server, 3.13, scoring) (push) Failing after 17s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 14s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 12s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 10s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 7s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 15s
Python Package Build Test / build (3.13) (push) Failing after 0s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 17s
Update ReadTheDocs / update-readthedocs (push) Failing after 4s
Test Llama Stack Build / build (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 7s
Test External Providers / test-external-providers (venv) (push) Failing after 5s
Unit Tests / unit-tests (3.13) (push) Failing after 4s
Unit Tests / unit-tests (3.12) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 58s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 1m0s
Python Package Build Test / build (3.12) (push) Failing after 49s
Pre-commit / pre-commit (push) Successful in 1m40s
# What does this PR do? * Use a single env variable to set up the OTEL endpoint * Update telemetry provider doc * Update general telemetry doc with the metrics generated * Left a script to set up telemetry for testing Closes: https://github.com/meta-llama/llama-stack/issues/783 Note to reviewer: the `setup_telemetry.sh` script was useful for me (it was nicely generated by AI); if we don't want it in the repo, I can delete it — I would understand. Signed-off-by: Sébastien Han <seb@redhat.com>
121 lines
4.5 KiB
Bash
Executable file
121 lines
4.5 KiB
Bash
Executable file
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Telemetry Setup Script for Llama Stack
# This script sets up Jaeger, OpenTelemetry Collector, Prometheus, and Grafana
# using Docker or Podman (selectable via the CONTAINER_RUNTIME env var).
# For whoever is interested in testing the telemetry stack, you can run this script to set up the stack.
# Afterwards, point Llama Stack at the collector:
#   export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
#   export TELEMETRY_SINKS=otel_trace,otel_metric
#   export OTEL_SERVICE_NAME=my-llama-app
# Then run the distro server

set -Eeuo pipefail

# Container runtime to use; defaults to docker, override with CONTAINER_RUNTIME=podman.
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}

# Fix: the original message hard-coded "Podman" even though the default
# runtime is docker — report the runtime actually selected instead.
echo "🚀 Setting up telemetry stack for Llama Stack using $CONTAINER_RUNTIME..."

# Fail fast if the selected runtime is not installed.
if ! command -v "$CONTAINER_RUNTIME" &> /dev/null; then
    echo "🚨 $CONTAINER_RUNTIME could not be found"
    echo "Docker or Podman is required. Install Docker: https://docs.docker.com/get-docker/ or Podman: https://podman.io/getting-started/installation"
    exit 1
fi
|
|
|
|
# Create a network for the services
echo "📡 Creating $CONTAINER_RUNTIME network..."
$CONTAINER_RUNTIME network create llama-telemetry 2>/dev/null || echo "Network already exists"

# Stop and remove existing containers
echo "🧹 Cleaning up existing containers..."
# Stop all containers first, then remove them, matching the original order.
for container in jaeger otel-collector prometheus grafana; do
    $CONTAINER_RUNTIME stop "$container" 2>/dev/null || true
done
for container in jaeger otel-collector prometheus grafana; do
    $CONTAINER_RUNTIME rm "$container" 2>/dev/null || true
done
|
|
|
|
# Start Jaeger
echo "🔍 Starting Jaeger..."
# Collect run arguments in an array so each flag can carry its own note.
jaeger_args=(
    run -d --name jaeger
    --network llama-telemetry
    -e COLLECTOR_ZIPKIN_HOST_PORT=:9411
    -p 16686:16686  # Jaeger web UI (linked in the summary below)
    -p 14250:14250
    -p 9411:9411    # Zipkin-compatible port, per COLLECTOR_ZIPKIN_HOST_PORT above
)
$CONTAINER_RUNTIME "${jaeger_args[@]}" docker.io/jaegertracing/all-in-one:latest
|
|
|
|
# Start OpenTelemetry Collector
echo "📊 Starting OpenTelemetry Collector..."
# The collector config is bind-mounted from the current directory; fail early
# with a clear message instead of letting the runtime create/mount a directory
# in its place when the file is missing.
if [[ ! -f otel-collector-config.yaml ]]; then
    echo "🚨 otel-collector-config.yaml not found in $(pwd)" >&2
    exit 1
fi
# Fix: quote the $(pwd) expansion so the mount works when the current
# directory path contains spaces.
$CONTAINER_RUNTIME run -d --name otel-collector \
    --network llama-telemetry \
    -p 4318:4318 \
    -p 4317:4317 \
    -p 9464:9464 \
    -p 13133:13133 \
    -v "$(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
    docker.io/otel/opentelemetry-collector-contrib:latest \
    --config /etc/otel-collector-config.yaml
|
|
|
|
# Start Prometheus
echo "📈 Starting Prometheus..."
# The scrape config is bind-mounted from the current directory; verify it
# exists so a missing file does not get silently mounted as a directory.
if [[ ! -f prometheus.yml ]]; then
    echo "🚨 prometheus.yml not found in $(pwd)" >&2
    exit 1
fi
# Fix: quote the $(pwd) expansion so the mount works when the current
# directory path contains spaces.
$CONTAINER_RUNTIME run -d --name prometheus \
    --network llama-telemetry \
    -p 9090:9090 \
    -v "$(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
    docker.io/prom/prometheus:latest \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/prometheus \
    --web.console.libraries=/etc/prometheus/console_libraries \
    --web.console.templates=/etc/prometheus/consoles \
    --storage.tsdb.retention.time=200h \
    --web.enable-lifecycle
|
|
|
|
# Start Grafana
echo "📊 Starting Grafana..."
# Collect run arguments in an array so each flag can carry its own note.
grafana_args=(
    run -d --name grafana
    --network llama-telemetry
    -p 3000:3000
    -e GF_SECURITY_ADMIN_PASSWORD=admin  # login admin/admin, per the summary below
    -e GF_USERS_ALLOW_SIGN_UP=false
)
$CONTAINER_RUNTIME "${grafana_args[@]}" docker.io/grafana/grafana:latest
|
|
|
|
# Wait for services to start
echo "⏳ Waiting for services to start..."
# Fixed grace period: all four containers were started detached (-d) above
# and need a moment before they answer on their ports.
sleep 10

# Check if services are running
echo "🔍 Checking service status..."
# NOTE(review): the name filter value is treated as a regex by docker/podman,
# so the '|' alternation is intended to match any of the four containers —
# confirm this behaves the same on the runtimes you target.
$CONTAINER_RUNTIME ps --filter "name=jaeger|otel-collector|prometheus|grafana" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
|
|
|
# Print the final summary: service URLs, environment variables for Llama
# Stack, next steps, a smoke-test curl command, and cleanup instructions.
printf '%s\n' \
    "" \
    "✅ Telemetry stack is ready!" \
    "" \
    "🌐 Service URLs:" \
    " Jaeger UI: http://localhost:16686" \
    " Prometheus: http://localhost:9090" \
    " Grafana: http://localhost:3000 (admin/admin)" \
    " OTEL Collector: http://localhost:4318 (OTLP endpoint)" \
    ""
printf '%s\n' \
    "🔧 Environment variables for Llama Stack:" \
    " export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318" \
    " export TELEMETRY_SINKS=otel_trace,otel_metric" \
    " export OTEL_SERVICE_NAME=my-llama-app" \
    ""
printf '%s\n' \
    "📊 Next steps:" \
    " 1. Set the environment variables above" \
    " 2. Start your Llama Stack application" \
    " 3. Make some inference calls to generate metrics" \
    " 4. Check Jaeger for traces: http://localhost:16686" \
    " 5. Check Prometheus for metrics: http://localhost:9090" \
    " 6. Set up Grafana dashboards: http://localhost:3000" \
    ""
printf '%s\n' \
    "🔍 To test the setup, run:" \
    " curl -X POST http://localhost:5000/v1/inference/chat/completions \\" \
    " -H 'Content-Type: application/json' \\" \
    " -d '{\"model_id\": \"your-model\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'" \
    ""
printf '%s\n' \
    "🧹 To clean up when done:" \
    " $CONTAINER_RUNTIME stop jaeger otel-collector prometheus grafana" \
    " $CONTAINER_RUNTIME rm jaeger otel-collector prometheus grafana" \
    " $CONTAINER_RUNTIME network rm llama-telemetry"
|