fix(telemetry): add integration and unit tests for otel provider

2025-10-04 12:07:34 +00:00 · 2025-10-02 17:46:53 -04:00 · 2025-10-02 17:46:53 -04:00 · 9a0294ab4f
commit 9a0294ab4f
parent e45e77f7b0
11 changed files with 1052 additions and 30 deletions
--- a/tests/unit/providers/telemetry/test_otel.py
+++ b/tests/unit/providers/telemetry/test_otel.py
@ -0,0 +1,368 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import concurrent.futures
+import threading
+from unittest.mock import MagicMock
+
+import pytest
+
+from llama_stack.providers.inline.telemetry.otel.config import OTelTelemetryConfig
+from llama_stack.providers.inline.telemetry.otel.otel import OTelTelemetryProvider
+
+
+@pytest.fixture
+def otel_config():
+    """Fixture providing a basic OTelTelemetryConfig."""
+    return OTelTelemetryConfig(
+        service_name="test-service",
+        service_version="1.0.0",
+        deployment_environment="test",
+        span_processor="simple",
+    )
+
+
+@pytest.fixture
+def otel_provider(otel_config, monkeypatch):
+    """Fixture providing an OTelTelemetryProvider instance with mocked environment."""
+    # Set required environment variables to avoid warnings
+    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
+    return OTelTelemetryProvider(config=otel_config)
+
+
+class TestOTelTelemetryProviderInitialization:
+    """Tests for OTelTelemetryProvider initialization."""
+
+    def test_initialization_with_valid_config(self, otel_config, monkeypatch):
+        """Test that provider initializes correctly with valid configuration."""
+        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
+        
+        provider = OTelTelemetryProvider(config=otel_config)
+        
+        assert provider.config == otel_config
+        assert hasattr(provider, "_lock")
+        assert provider._lock is not None
+        assert isinstance(provider._counters, dict)
+        assert isinstance(provider._histograms, dict)
+        assert isinstance(provider._up_down_counters, dict)
+        assert isinstance(provider._gauges, dict)
+
+    def test_initialization_sets_service_attributes(self, otel_config, monkeypatch):
+        """Test that service attributes are properly configured."""
+        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
+        
+        provider = OTelTelemetryProvider(config=otel_config)
+        
+        assert provider.config.service_name == "test-service"
+        assert provider.config.service_version == "1.0.0"
+        assert provider.config.deployment_environment == "test"
+
+    def test_initialization_with_batch_processor(self, monkeypatch):
+        """Test initialization with batch span processor."""
+        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
+        config = OTelTelemetryConfig(
+            service_name="test-service",
+            service_version="1.0.0",
+            deployment_environment="test",
+            span_processor="batch",
+        )
+        
+        provider = OTelTelemetryProvider(config=config)
+        
+        assert provider.config.span_processor == "batch"
+
+    def test_warns_when_endpoints_missing(self, otel_config, monkeypatch, caplog):
+        """Test that warnings are issued when OTLP endpoints are not set."""
+        # Remove all endpoint environment variables
+        monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
+        monkeypatch.delenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", raising=False)
+        monkeypatch.delenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", raising=False)
+        
+        OTelTelemetryProvider(config=otel_config)
+        
+        # Check that warnings were logged
+        assert any("Traces will not be exported" in record.message for record in caplog.records)
+        assert any("Metrics will not be exported" in record.message for record in caplog.records)
+
+
+class TestOTelTelemetryProviderMetrics:
+    """Tests for metric recording functionality."""
+
+    def test_record_count_creates_counter(self, otel_provider):
+        """Test that record_count creates a counter on first call."""
+        assert "test_counter" not in otel_provider._counters
+        
+        otel_provider.record_count("test_counter", 1.0)
+        
+        assert "test_counter" in otel_provider._counters
+        assert otel_provider._counters["test_counter"] is not None
+
+    def test_record_count_reuses_counter(self, otel_provider):
+        """Test that record_count reuses existing counter."""
+        otel_provider.record_count("test_counter", 1.0)
+        first_counter = otel_provider._counters["test_counter"]
+        
+        otel_provider.record_count("test_counter", 2.0)
+        second_counter = otel_provider._counters["test_counter"]
+        
+        assert first_counter is second_counter
+        assert len(otel_provider._counters) == 1
+
+    def test_record_count_with_attributes(self, otel_provider):
+        """Test that record_count works with attributes."""
+        otel_provider.record_count(
+            "test_counter",
+            1.0,
+            attributes={"key": "value", "env": "test"}
+        )
+        
+        assert "test_counter" in otel_provider._counters
+
+    def test_record_histogram_creates_histogram(self, otel_provider):
+        """Test that record_histogram creates a histogram on first call."""
+        assert "test_histogram" not in otel_provider._histograms
+        
+        otel_provider.record_histogram("test_histogram", 42.5)
+        
+        assert "test_histogram" in otel_provider._histograms
+        assert otel_provider._histograms["test_histogram"] is not None
+
+    def test_record_histogram_reuses_histogram(self, otel_provider):
+        """Test that record_histogram reuses existing histogram."""
+        otel_provider.record_histogram("test_histogram", 10.0)
+        first_histogram = otel_provider._histograms["test_histogram"]
+        
+        otel_provider.record_histogram("test_histogram", 20.0)
+        second_histogram = otel_provider._histograms["test_histogram"]
+        
+        assert first_histogram is second_histogram
+        assert len(otel_provider._histograms) == 1
+
+    def test_record_histogram_with_bucket_boundaries(self, otel_provider):
+        """Test that record_histogram works with explicit bucket boundaries."""
+        boundaries = [0.0, 10.0, 50.0, 100.0]
+        
+        otel_provider.record_histogram(
+            "test_histogram",
+            25.0,
+            explicit_bucket_boundaries_advisory=boundaries
+        )
+        
+        assert "test_histogram" in otel_provider._histograms
+
+    def test_record_up_down_counter_creates_counter(self, otel_provider):
+        """Test that record_up_down_counter creates a counter on first call."""
+        assert "test_updown" not in otel_provider._up_down_counters
+        
+        otel_provider.record_up_down_counter("test_updown", 1.0)
+        
+        assert "test_updown" in otel_provider._up_down_counters
+        assert otel_provider._up_down_counters["test_updown"] is not None
+
+    def test_record_up_down_counter_reuses_counter(self, otel_provider):
+        """Test that record_up_down_counter reuses existing counter."""
+        otel_provider.record_up_down_counter("test_updown", 5.0)
+        first_counter = otel_provider._up_down_counters["test_updown"]
+        
+        otel_provider.record_up_down_counter("test_updown", -3.0)
+        second_counter = otel_provider._up_down_counters["test_updown"]
+        
+        assert first_counter is second_counter
+        assert len(otel_provider._up_down_counters) == 1
+
+    def test_multiple_metrics_with_different_names(self, otel_provider):
+        """Test that multiple metrics with different names are cached separately."""
+        otel_provider.record_count("counter1", 1.0)
+        otel_provider.record_count("counter2", 2.0)
+        otel_provider.record_histogram("histogram1", 10.0)
+        otel_provider.record_up_down_counter("updown1", 5.0)
+        
+        assert len(otel_provider._counters) == 2
+        assert len(otel_provider._histograms) == 1
+        assert len(otel_provider._up_down_counters) == 1
+
+
+class TestOTelTelemetryProviderThreadSafety:
+    """Tests for thread safety of metric operations."""
+
+    def test_concurrent_counter_creation_same_name(self, otel_provider):
+        """Test that concurrent calls to record_count with same name are thread-safe."""
+        num_threads = 50
+        counter_name = "concurrent_counter"
+        
+        def record_metric():
+            otel_provider.record_count(counter_name, 1.0)
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+            futures = [executor.submit(record_metric) for _ in range(num_threads)]
+            concurrent.futures.wait(futures)
+        
+        # Should have exactly one counter created despite concurrent access
+        assert len(otel_provider._counters) == 1
+        assert counter_name in otel_provider._counters
+
+    def test_concurrent_histogram_creation_same_name(self, otel_provider):
+        """Test that concurrent calls to record_histogram with same name are thread-safe."""
+        num_threads = 50
+        histogram_name = "concurrent_histogram"
+        
+        def record_metric():
+            thread_id = threading.current_thread().ident or 0
+            otel_provider.record_histogram(histogram_name, float(thread_id % 100))
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+            futures = [executor.submit(record_metric) for _ in range(num_threads)]
+            concurrent.futures.wait(futures)
+        
+        # Should have exactly one histogram created despite concurrent access
+        assert len(otel_provider._histograms) == 1
+        assert histogram_name in otel_provider._histograms
+
+    def test_concurrent_up_down_counter_creation_same_name(self, otel_provider):
+        """Test that concurrent calls to record_up_down_counter with same name are thread-safe."""
+        num_threads = 50
+        counter_name = "concurrent_updown"
+        
+        def record_metric():
+            otel_provider.record_up_down_counter(counter_name, 1.0)
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+            futures = [executor.submit(record_metric) for _ in range(num_threads)]
+            concurrent.futures.wait(futures)
+        
+        # Should have exactly one counter created despite concurrent access
+        assert len(otel_provider._up_down_counters) == 1
+        assert counter_name in otel_provider._up_down_counters
+
+    def test_concurrent_mixed_metrics_different_names(self, otel_provider):
+        """Test concurrent creation of different metric types with different names."""
+        num_threads = 30
+        
+        def record_counters(thread_id):
+            otel_provider.record_count(f"counter_{thread_id}", 1.0)
+        
+        def record_histograms(thread_id):
+            otel_provider.record_histogram(f"histogram_{thread_id}", float(thread_id))
+        
+        def record_up_down_counters(thread_id):
+            otel_provider.record_up_down_counter(f"updown_{thread_id}", float(thread_id))
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads * 3) as executor:
+            futures = []
+            for i in range(num_threads):
+                futures.append(executor.submit(record_counters, i))
+                futures.append(executor.submit(record_histograms, i))
+                futures.append(executor.submit(record_up_down_counters, i))
+            
+            concurrent.futures.wait(futures)
+        
+        # Each thread should have created its own metric
+        assert len(otel_provider._counters) == num_threads
+        assert len(otel_provider._histograms) == num_threads
+        assert len(otel_provider._up_down_counters) == num_threads
+
+    def test_concurrent_access_existing_and_new_metrics(self, otel_provider):
+        """Test concurrent access mixing existing and new metric creation."""
+        # Pre-create some metrics
+        otel_provider.record_count("existing_counter", 1.0)
+        otel_provider.record_histogram("existing_histogram", 10.0)
+        
+        num_threads = 40
+        
+        def mixed_operations(thread_id):
+            # Half the threads use existing metrics, half create new ones
+            if thread_id % 2 == 0:
+                otel_provider.record_count("existing_counter", 1.0)
+                otel_provider.record_histogram("existing_histogram", float(thread_id))
+            else:
+                otel_provider.record_count(f"new_counter_{thread_id}", 1.0)
+                otel_provider.record_histogram(f"new_histogram_{thread_id}", float(thread_id))
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+            futures = [executor.submit(mixed_operations, i) for i in range(num_threads)]
+            concurrent.futures.wait(futures)
+        
+        # Should have existing metrics plus half of num_threads new ones
+        expected_new_counters = num_threads // 2
+        expected_new_histograms = num_threads // 2
+        
+        assert len(otel_provider._counters) == 1 + expected_new_counters
+        assert len(otel_provider._histograms) == 1 + expected_new_histograms
+
+
+class TestOTelTelemetryProviderTracing:
+    """Tests for tracing functionality."""
+
+    def test_custom_trace_creates_span(self, otel_provider):
+        """Test that custom_trace creates a span."""
+        span = otel_provider.custom_trace("test_span")
+        
+        assert span is not None
+        assert hasattr(span, "get_span_context")
+
+    def test_custom_trace_with_attributes(self, otel_provider):
+        """Test that custom_trace works with attributes."""
+        attributes = {"key": "value", "operation": "test"}
+        
+        span = otel_provider.custom_trace("test_span", attributes=attributes)
+        
+        assert span is not None
+
+    def test_fastapi_middleware(self, otel_provider):
+        """Test that fastapi_middleware can be called."""
+        mock_app = MagicMock()
+        
+        # Should not raise an exception
+        otel_provider.fastapi_middleware(mock_app)
+
+
+class TestOTelTelemetryProviderEdgeCases:
+    """Tests for edge cases and error conditions."""
+
+    def test_record_count_with_zero(self, otel_provider):
+        """Test that record_count works with zero value."""
+        otel_provider.record_count("zero_counter", 0.0)
+        
+        assert "zero_counter" in otel_provider._counters
+
+    def test_record_count_with_large_value(self, otel_provider):
+        """Test that record_count works with large values."""
+        otel_provider.record_count("large_counter", 1_000_000.0)
+        
+        assert "large_counter" in otel_provider._counters
+
+    def test_record_histogram_with_negative_value(self, otel_provider):
+        """Test that record_histogram works with negative values."""
+        otel_provider.record_histogram("negative_histogram", -10.0)
+        
+        assert "negative_histogram" in otel_provider._histograms
+
+    def test_record_up_down_counter_with_negative_value(self, otel_provider):
+        """Test that record_up_down_counter works with negative values."""
+        otel_provider.record_up_down_counter("negative_updown", -5.0)
+        
+        assert "negative_updown" in otel_provider._up_down_counters
+
+    def test_metric_names_with_special_characters(self, otel_provider):
+        """Test that metric names with dots and underscores work."""
+        otel_provider.record_count("test.counter_name-special", 1.0)
+        otel_provider.record_histogram("test.histogram_name-special", 10.0)
+        
+        assert "test.counter_name-special" in otel_provider._counters
+        assert "test.histogram_name-special" in otel_provider._histograms
+
+    def test_empty_attributes_dict(self, otel_provider):
+        """Test that empty attributes dict is handled correctly."""
+        otel_provider.record_count("test_counter", 1.0, attributes={})
+        
+        assert "test_counter" in otel_provider._counters
+
+    def test_none_attributes(self, otel_provider):
+        """Test that None attributes are handled correctly."""
+        otel_provider.record_count("test_counter", 1.0, attributes=None)
+        
+        assert "test_counter" in otel_provider._counters
+