fix(telemetry): configure OTEL collector before server starts in integration tests

The telemetry integration tests were failing in server mode because the OTEL
collector configuration was being set up after the server had already started.
This meant the server never received telemetry configuration and couldn't
export spans to the collector, resulting in empty span collections.

Changes:
- Set the OTEL environment variables in integration-tests.sh before starting the server
- Export LLAMA_STACK_TEST_COLLECTOR_PORT so that the collector and the server use the same port
- Simplify conftest.py so it no longer overrides env vars in server mode, since the server is already running by then
- Add a check that the collector endpoint matches the expected endpoint; the test is skipped on a mismatch

This ensures telemetry spans are properly collected during server mode tests.
This commit is contained in:
Ashwin Bharambe 2025-10-30 09:57:28 -07:00
parent a68079feb5
commit a371475cc8
2 changed files with 19 additions and 17 deletions

View file

@ -208,6 +208,15 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
echo "=== Starting Llama Stack Server ==="
export LLAMA_STACK_LOG_WIDTH=120
# Configure telemetry collector for server mode
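# Use a fixed port for the OTEL collector so the server can connect to it.
# NOTE(review): 4317 is conventionally the OTLP/gRPC port (OTLP/HTTP defaults to 4318), while the protocol below is http/protobuf - confirm this is intentional.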
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
export OTEL_BSP_SCHEDULE_DELAY="200"
export OTEL_BSP_EXPORT_TIMEOUT="2000"
# remove "server:" from STACK_CONFIG
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
nohup llama stack run $stack_config > server.log 2>&1 &

View file

@ -21,33 +21,26 @@ def telemetry_test_collector():
stack_mode = os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE", "library_client")
if stack_mode == "server":
# In server mode the server is already running, so the collector has to be started here.
# The integration test script (scripts/integration-tests.sh) should have set
# LLAMA_STACK_TEST_COLLECTOR_PORT and OTEL_EXPORTER_OTLP_ENDPOINT before starting the server.
try:
collector = OtlpHttpTestCollector()
except RuntimeError as exc:
pytest.skip(str(exc))
env_overrides = {
"OTEL_EXPORTER_OTLP_ENDPOINT": collector.endpoint,
"OTEL_EXPORTER_OTLP_PROTOCOL": "http/protobuf",
"OTEL_BSP_SCHEDULE_DELAY": "200",
"OTEL_BSP_EXPORT_TIMEOUT": "2000",
}
previous_env = {key: os.environ.get(key) for key in env_overrides}
for key, value in env_overrides.items():
os.environ[key] = value
telemetry_module._TRACER_PROVIDER = None
# Verify the collector is listening on the expected endpoint
expected_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
if expected_endpoint and collector.endpoint != expected_endpoint:
pytest.skip(
f"Collector endpoint mismatch: expected {expected_endpoint}, got {collector.endpoint}. "
"Server was likely started before collector."
)
try:
yield collector
finally:
collector.shutdown()
for key, prior in previous_env.items():
if prior is None:
os.environ.pop(key, None)
else:
os.environ[key] = prior
else:
manager = InMemoryTelemetryManager()
try: