# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Integration tests for the OpenTelemetry provider.

These tests verify that the OTel provider correctly:
- Initializes within the Llama Stack
- Captures expected metrics (counters, histograms, up/down counters)
- Captures expected spans/traces
- Exports telemetry data to an OTLP collector (in-memory for testing)

Tests use in-memory exporters to avoid external dependencies and can run in
GitHub Actions.
"""

import os
import time
from unittest.mock import patch

import pytest
from opentelemetry.sdk.metrics.export import InMemoryMetricReader
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

from llama_stack.providers.inline.telemetry.otel.config import OTelTelemetryConfig
from llama_stack.providers.inline.telemetry.otel.otel import OTelTelemetryProvider


@pytest.fixture(scope="module")
def in_memory_span_exporter():
    """Create an in-memory span exporter to capture traces."""
    return InMemorySpanExporter()


@pytest.fixture(scope="module")
def in_memory_metric_reader():
    """Create an in-memory metric reader to capture metrics."""
    return InMemoryMetricReader()


@pytest.fixture(scope="module")
def otel_provider_with_memory_exporters(in_memory_span_exporter, in_memory_metric_reader):
    """
    Create an OTelTelemetryProvider configured with in-memory exporters.

    This allows us to capture and verify telemetry data without external services.
    Returns a dict with 'provider', 'span_exporter', and 'metric_reader'.
    """
    # Set a mock endpoint to avoid warnings about missing configuration
    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318"

    config = OTelTelemetryConfig(
        service_name="test-llama-stack-otel",
        service_version="1.0.0-test",
        deployment_environment="ci-test",
        span_processor="simple",
    )

    # Patch the provider to use in-memory exporters instead of real OTLP ones
    with patch.object(
        OTelTelemetryProvider,
        "model_post_init",
        lambda self, _: _init_with_memory_exporters(
            self, config, in_memory_span_exporter, in_memory_metric_reader
        ),
    ):
        provider = OTelTelemetryProvider(config=config)
        yield {
            "provider": provider,
            "span_exporter": in_memory_span_exporter,
            "metric_reader": in_memory_metric_reader,
        }


def _init_with_memory_exporters(provider, config, span_exporter, metric_reader):
    """Helper to initialize the provider with in-memory exporters."""
    import threading

    from opentelemetry import metrics, trace
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.resources import Attributes, Resource
    from opentelemetry.sdk.trace import TracerProvider

    # Initialize pydantic private attributes
    if provider.__pydantic_private__ is None:
        provider.__pydantic_private__ = {}

    provider._lock = threading.Lock()
    provider._counters = {}
    provider._up_down_counters = {}
    provider._histograms = {}
    provider._gauges = {}

    # Create resource attributes, skipping any that are unset
    attributes: Attributes = {
        key: value
        for key, value in {
            "service.name": config.service_name,
            "service.version": config.service_version,
            "deployment.environment": config.deployment_environment,
        }.items()
        if value is not None
    }
    resource = Resource.create(attributes)

    # Configure tracer provider with in-memory exporter
    tracer_provider = TracerProvider(resource=resource)
    tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))
    trace.set_tracer_provider(tracer_provider)

    # Configure meter provider with in-memory reader
    meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
    metrics.set_meter_provider(meter_provider)
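
# A minimal sketch (not part of the test suite) of the capture/verify cycle the
# fixtures above enable, using only OpenTelemetry SDK names; the "sketch.*"
# identifiers are illustrative:
#
#     reader = InMemoryMetricReader()
#     meter = MeterProvider(metric_readers=[reader]).get_meter("sketch")
#     meter.create_counter("sketch.requests").add(1)
#     reader.get_metrics_data()  # returns the MetricsData accumulated so far
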
class TestOTelProviderInitialization:
    """Test OTel provider initialization within Llama Stack."""

    def test_provider_initializes_successfully(self, otel_provider_with_memory_exporters):
        """Test that the OTel provider initializes without errors."""
        provider = otel_provider_with_memory_exporters["provider"]

        assert provider is not None
        assert provider.config.service_name == "test-llama-stack-otel"
        assert provider.config.service_version == "1.0.0-test"
        assert provider.config.deployment_environment == "ci-test"

    def test_provider_has_thread_safety_mechanisms(self, otel_provider_with_memory_exporters):
        """Test that the provider has thread-safety mechanisms in place."""
        provider = otel_provider_with_memory_exporters["provider"]

        assert hasattr(provider, "_lock")
        assert provider._lock is not None
        assert hasattr(provider, "_counters")
        assert hasattr(provider, "_histograms")
        assert hasattr(provider, "_up_down_counters")


class TestOTelMetricsCapture:
    """Test that OTel provider captures expected metrics."""

    def test_counter_metric_is_captured(self, otel_provider_with_memory_exporters):
        """Test that counter metrics are captured."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record counter metrics
        provider.record_count("llama.requests.total", 1.0, attributes={"endpoint": "/chat"})
        provider.record_count("llama.requests.total", 1.0, attributes={"endpoint": "/chat"})
        provider.record_count("llama.requests.total", 1.0, attributes={"endpoint": "/embeddings"})

        # Force metric collection - collect() triggers the reader to gather metrics
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Verify metrics were captured
        assert metrics_data is not None
        assert len(metrics_data.resource_metrics) > 0

        # Find our counter metric
        found_counter = False
        for resource_metric in metrics_data.resource_metrics:
            for scope_metric in resource_metric.scope_metrics:
                for metric in scope_metric.metrics:
                    if metric.name == "llama.requests.total":
                        found_counter = True
                        # Verify it's a counter with data points
                        assert hasattr(metric.data, "data_points")
                        assert len(metric.data.data_points) > 0

        assert found_counter, "Counter metric 'llama.requests.total' was not captured"

    def test_histogram_metric_is_captured(self, otel_provider_with_memory_exporters):
        """Test that histogram metrics are captured."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record histogram metrics with various values
        latencies = [10.5, 25.3, 50.1, 100.7, 250.2]
        for latency in latencies:
            provider.record_histogram(
                "llama.inference.latency",
                latency,
                attributes={"model": "llama-3.2"},
            )

        # Force metric collection
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Find our histogram metric
        found_histogram = False
        for resource_metric in metrics_data.resource_metrics:
            for scope_metric in resource_metric.scope_metrics:
                for metric in scope_metric.metrics:
                    if metric.name == "llama.inference.latency":
                        found_histogram = True
                        # Verify it's a histogram
                        assert hasattr(metric.data, "data_points")
                        data_point = metric.data.data_points[0]
                        # Histograms should have count and sum
                        assert hasattr(data_point, "count")
                        assert data_point.count == len(latencies)

        assert found_histogram, "Histogram metric 'llama.inference.latency' was not captured"
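
    # Note on temporality: InMemoryMetricReader defaults to cumulative
    # aggregation, so each collect() reports totals since the provider was
    # created; the count assertion above holds because only this test records
    # to "llama.inference.latency". Histogram points (HistogramDataPoint in
    # the SDK) also expose sum, min, max, bucket_counts, and explicit_bounds.
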
    def test_up_down_counter_metric_is_captured(self, otel_provider_with_memory_exporters):
        """Test that up/down counter metrics are captured."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record up/down counter metrics
        provider.record_up_down_counter("llama.active.sessions", 5)
        provider.record_up_down_counter("llama.active.sessions", 3)
        provider.record_up_down_counter("llama.active.sessions", -2)

        # Force metric collection
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Find our up/down counter metric
        found_updown = False
        for resource_metric in metrics_data.resource_metrics:
            for scope_metric in resource_metric.scope_metrics:
                for metric in scope_metric.metrics:
                    if metric.name == "llama.active.sessions":
                        found_updown = True
                        assert hasattr(metric.data, "data_points")
                        assert len(metric.data.data_points) > 0

        assert found_updown, "Up/Down counter metric 'llama.active.sessions' was not captured"

    def test_metrics_with_attributes_are_captured(self, otel_provider_with_memory_exporters):
        """Test that metric attributes/labels are preserved."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record metrics with different attributes
        provider.record_count(
            "llama.tokens.generated",
            150.0,
            attributes={"model": "llama-3.2-1b", "user": "test-user"},
        )

        # Force metric collection
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Verify attributes are preserved
        found_with_attributes = False
        for resource_metric in metrics_data.resource_metrics:
            for scope_metric in resource_metric.scope_metrics:
                for metric in scope_metric.metrics:
                    if metric.name == "llama.tokens.generated":
                        data_point = metric.data.data_points[0]
                        # Check attributes - they're already a dict in the SDK
                        attrs = data_point.attributes if isinstance(data_point.attributes, dict) else {}
                        if "model" in attrs and "user" in attrs:
                            found_with_attributes = True
                            assert attrs["model"] == "llama-3.2-1b"
                            assert attrs["user"] == "test-user"

        assert found_with_attributes, "Metrics with attributes were not properly captured"

    def test_multiple_metric_types_coexist(self, otel_provider_with_memory_exporters):
        """Test that different metric types can coexist."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record various metric types
        provider.record_count("test.counter", 1.0)
        provider.record_histogram("test.histogram", 42.0)
        provider.record_up_down_counter("test.gauge", 10)

        # Force metric collection
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Count unique metrics
        metric_names = set()
        for resource_metric in metrics_data.resource_metrics:
            for scope_metric in resource_metric.scope_metrics:
                for metric in scope_metric.metrics:
                    metric_names.add(metric.name)

        # Should have all three metrics
        assert "test.counter" in metric_names
        assert "test.histogram" in metric_names
        assert "test.gauge" in metric_names
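
# Spans are delivered to InMemorySpanExporter when a span ends; because the
# fixture installs a SimpleSpanProcessor (synchronous export, no batching),
# get_finished_spans() can be called immediately after span.end().
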
class TestOTelSpansCapture:
    """Test that OTel provider captures expected spans/traces."""

    def test_basic_span_is_captured(self, otel_provider_with_memory_exporters):
        """Test that basic spans are captured."""
        provider = otel_provider_with_memory_exporters["provider"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Create a span
        span = provider.custom_trace("llama.inference.request")
        span.end()

        # Get captured spans
        spans = span_exporter.get_finished_spans()

        assert len(spans) > 0
        assert any(span.name == "llama.inference.request" for span in spans)

    def test_span_with_attributes_is_captured(self, otel_provider_with_memory_exporters):
        """Test that span attributes are preserved."""
        provider = otel_provider_with_memory_exporters["provider"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Create a span with attributes
        span = provider.custom_trace(
            "llama.chat.completion",
            attributes={
                "model.id": "llama-3.2-1b",
                "user.id": "test-user-123",
                "request.id": "req-abc-123",
            },
        )
        span.end()

        # Get captured spans
        spans = span_exporter.get_finished_spans()

        # Find our span
        our_span = None
        for s in spans:
            if s.name == "llama.chat.completion":
                our_span = s
                break

        assert our_span is not None, "Span 'llama.chat.completion' was not captured"

        # Verify attributes
        attrs = dict(our_span.attributes)
        assert attrs.get("model.id") == "llama-3.2-1b"
        assert attrs.get("user.id") == "test-user-123"
        assert attrs.get("request.id") == "req-abc-123"

    def test_multiple_spans_are_captured(self, otel_provider_with_memory_exporters):
        """Test that multiple spans are captured."""
        provider = otel_provider_with_memory_exporters["provider"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Create multiple spans
        span_names = [
            "llama.request.validate",
            "llama.model.load",
            "llama.inference.execute",
            "llama.response.format",
        ]
        for name in span_names:
            span = provider.custom_trace(name)
            time.sleep(0.01)  # Small delay to ensure ordering
            span.end()

        # Get captured spans
        spans = span_exporter.get_finished_spans()
        captured_names = {span.name for span in spans}

        # Verify all spans were captured
        for expected_name in span_names:
            assert expected_name in captured_names, f"Span '{expected_name}' was not captured"

    def test_span_has_service_metadata(self, otel_provider_with_memory_exporters):
        """Test that spans include service metadata."""
        provider = otel_provider_with_memory_exporters["provider"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Create a span
        span = provider.custom_trace("test.span")
        span.end()

        # Get captured spans
        spans = span_exporter.get_finished_spans()
        assert len(spans) > 0

        # Check resource attributes
        span = spans[0]
        resource_attrs = dict(span.resource.attributes)
        assert resource_attrs.get("service.name") == "test-llama-stack-otel"
        assert resource_attrs.get("service.version") == "1.0.0-test"
        assert resource_attrs.get("deployment.environment") == "ci-test"
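
# In production the in-memory pair would be swapped for OTLP exporters. A
# minimal sketch using OTel SDK APIs (the endpoint is an assumed local
# collector; BatchSpanProcessor is the usual choice outside of tests):
#
#     from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
#     from opentelemetry.sdk.trace.export import BatchSpanProcessor
#
#     tracer_provider.add_span_processor(
#         BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces"))
#     )
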
class TestOTelDataExport:
    """Test that telemetry data can be exported to an OTLP collector."""

    def test_metrics_are_exportable(self, otel_provider_with_memory_exporters):
        """Test that metrics can be exported."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]

        # Record metrics
        provider.record_count("export.test.counter", 5.0)
        provider.record_histogram("export.test.histogram", 123.45)

        # Force export
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()

        # Verify data structure is exportable
        assert metrics_data is not None
        assert hasattr(metrics_data, "resource_metrics")
        assert len(metrics_data.resource_metrics) > 0

        # Verify resource attributes are present (needed for OTLP export)
        resource = metrics_data.resource_metrics[0].resource
        assert resource is not None
        assert len(resource.attributes) > 0

    def test_spans_are_exportable(self, otel_provider_with_memory_exporters):
        """Test that spans can be exported."""
        provider = otel_provider_with_memory_exporters["provider"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Create spans
        span1 = provider.custom_trace("export.test.span1")
        span1.end()
        span2 = provider.custom_trace("export.test.span2")
        span2.end()

        # Get exported spans
        spans = span_exporter.get_finished_spans()

        # Verify spans have required OTLP fields
        assert len(spans) >= 2
        for span in spans:
            assert span.name is not None
            assert span.context is not None
            assert span.context.trace_id is not None
            assert span.context.span_id is not None
            assert span.resource is not None

    def test_concurrent_export_is_safe(self, otel_provider_with_memory_exporters):
        """Test that concurrent metric/span recording doesn't break export."""
        import concurrent.futures

        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        def record_data(thread_id):
            for i in range(10):
                provider.record_count(f"concurrent.counter.{thread_id}", 1.0)
                span = provider.custom_trace(f"concurrent.span.{thread_id}.{i}")
                span.end()

        # Record from multiple threads
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(record_data, i) for i in range(5)]
            concurrent.futures.wait(futures)

        # Verify export still works
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()
        spans = span_exporter.get_finished_spans()

        assert metrics_data is not None
        assert len(spans) >= 50  # 5 threads * 10 spans each


@pytest.mark.integration
class TestOTelProviderIntegration:
    """End-to-end integration tests simulating real usage."""

    def test_complete_inference_workflow_telemetry(self, otel_provider_with_memory_exporters):
        """Simulate a complete inference workflow with telemetry."""
        provider = otel_provider_with_memory_exporters["provider"]
        metric_reader = otel_provider_with_memory_exporters["metric_reader"]
        span_exporter = otel_provider_with_memory_exporters["span_exporter"]

        # Simulate an inference workflow
        request_span = provider.custom_trace(
            "llama.inference.request",
            attributes={"model": "llama-3.2-1b", "user": "test"},
        )

        # Track metrics during inference
        provider.record_count("llama.requests.received", 1.0)
        provider.record_up_down_counter("llama.requests.in_flight", 1)

        # Simulate processing time
        time.sleep(0.01)
        provider.record_histogram("llama.request.duration_ms", 10.5)

        # Track tokens
        provider.record_count("llama.tokens.input", 25.0)
        provider.record_count("llama.tokens.output", 150.0)

        # End request
        provider.record_up_down_counter("llama.requests.in_flight", -1)
        provider.record_count("llama.requests.completed", 1.0)
        request_span.end()

        # Verify all telemetry was captured
        metric_reader.collect()
        metrics_data = metric_reader.get_metrics_data()
        spans = span_exporter.get_finished_spans()

        # Check that the expected metrics exist
        metric_names = set()
        for rm in metrics_data.resource_metrics:
            for sm in rm.scope_metrics:
                for m in sm.metrics:
                    metric_names.add(m.name)

        assert "llama.requests.received" in metric_names
        assert "llama.requests.in_flight" in metric_names
        assert "llama.request.duration_ms" in metric_names
        assert "llama.tokens.input" in metric_names
        assert "llama.tokens.output" in metric_names

        # Check that the request span exists
        assert any(s.name == "llama.inference.request" for s in spans)
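
# Assuming the repository's pytest configuration registers the "integration"
# marker, the end-to-end tests above can be selected with: pytest -m integration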