# llama-stack-mirror/tests/unit/providers/telemetry/test_otel.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import concurrent.futures
import threading
from unittest.mock import MagicMock

import pytest

from llama_stack.providers.inline.telemetry.otel.config import OTelTelemetryConfig
from llama_stack.providers.inline.telemetry.otel.otel import OTelTelemetryProvider


@pytest.fixture
def otel_config():
    """Build the minimal OTelTelemetryConfig shared by the tests in this module.

    The config names a "test-service" at version "1.0.0" in the "test"
    deployment environment and selects the "simple" span processor.
    """
    settings = {
        "service_name": "test-service",
        "service_version": "1.0.0",
        "deployment_environment": "test",
        "span_processor": "simple",
    }
    return OTelTelemetryConfig(**settings)


@pytest.fixture
def otel_provider(otel_config, monkeypatch):
    """Return an OTelTelemetryProvider built from the ``otel_config`` fixture.

    The generic OTLP endpoint variable is set through ``monkeypatch`` before
    the provider is constructed, so that construction does not emit the
    missing-endpoint warnings.
    """
    endpoint_var, endpoint_url = "OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318"
    monkeypatch.setenv(endpoint_var, endpoint_url)
    provider = OTelTelemetryProvider(config=otel_config)
    return provider


class TestOTelTelemetryProviderInitialization:
    """Construction-time behaviour of OTelTelemetryProvider."""

    def test_initialization_with_valid_config(self, otel_config, monkeypatch):
        """A valid config yields a provider that keeps the config, a lock and dict caches."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

        instance = OTelTelemetryProvider(config=otel_config)

        assert instance.config == otel_config
        assert hasattr(instance, "_lock")
        assert instance._lock is not None
        # Every per-instrument cache must start out as a plain dict.
        for cache_attr in ("_counters", "_histograms", "_up_down_counters", "_gauges"):
            assert isinstance(getattr(instance, cache_attr), dict)

    def test_initialization_sets_service_attributes(self, otel_config, monkeypatch):
        """The service identity fields of the config are readable from the provider."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

        stored = OTelTelemetryProvider(config=otel_config).config

        identity = (stored.service_name, stored.service_version, stored.deployment_environment)
        assert identity == ("test-service", "1.0.0", "test")

    def test_initialization_with_batch_processor(self, monkeypatch):
        """The provider can be constructed when the batch span processor is selected."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
        batch_settings = {
            "service_name": "test-service",
            "service_version": "1.0.0",
            "deployment_environment": "test",
            "span_processor": "batch",
        }

        instance = OTelTelemetryProvider(config=OTelTelemetryConfig(**batch_settings))

        assert instance.config.span_processor == "batch"

    def test_warns_when_endpoints_missing(self, otel_config, monkeypatch, caplog):
        """With no OTLP endpoint variables set, construction logs export warnings."""
        # Clear the generic endpoint as well as the trace- and metric-specific ones.
        for env_name in (
            "OTEL_EXPORTER_OTLP_ENDPOINT",
            "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
            "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
        ):
            monkeypatch.delenv(env_name, raising=False)

        OTelTelemetryProvider(config=otel_config)

        # One warning is expected for traces and one for metrics.
        logged = [entry.message for entry in caplog.records]
        assert any("Traces will not be exported" in text for text in logged)
        assert any("Metrics will not be exported" in text for text in logged)


class TestOTelTelemetryProviderMetrics:
    """Checks that the record_* methods create and cache metric instruments."""

    def test_record_count_creates_counter(self, otel_provider):
        """The first record_count call for a name adds an entry to the counter cache."""
        name = "test_counter"
        assert name not in otel_provider._counters

        otel_provider.record_count(name, 1.0)

        # Present in the cache and holding a real instrument.
        assert otel_provider._counters.get(name) is not None

    def test_record_count_reuses_counter(self, otel_provider):
        """Recording twice under one name keeps a single cached counter object."""
        name = "test_counter"
        observed = []
        for amount in (1.0, 2.0):
            otel_provider.record_count(name, amount)
            observed.append(otel_provider._counters[name])

        assert observed[0] is observed[1]
        assert len(otel_provider._counters) == 1

    def test_record_count_with_attributes(self, otel_provider):
        """record_count accepts an attributes mapping."""
        labels = {"key": "value", "env": "test"}

        otel_provider.record_count("test_counter", 1.0, attributes=labels)

        assert "test_counter" in otel_provider._counters

    def test_record_histogram_creates_histogram(self, otel_provider):
        """The first record_histogram call for a name adds an entry to the histogram cache."""
        name = "test_histogram"
        assert name not in otel_provider._histograms

        otel_provider.record_histogram(name, 42.5)

        # Present in the cache and holding a real instrument.
        assert otel_provider._histograms.get(name) is not None

    def test_record_histogram_reuses_histogram(self, otel_provider):
        """Recording twice under one name keeps a single cached histogram object."""
        name = "test_histogram"
        observed = []
        for sample in (10.0, 20.0):
            otel_provider.record_histogram(name, sample)
            observed.append(otel_provider._histograms[name])

        assert observed[0] is observed[1]
        assert len(otel_provider._histograms) == 1

    def test_record_histogram_with_bucket_boundaries(self, otel_provider):
        """record_histogram accepts advisory explicit bucket boundaries."""
        otel_provider.record_histogram(
            "test_histogram",
            25.0,
            explicit_bucket_boundaries_advisory=[0.0, 10.0, 50.0, 100.0],
        )

        assert "test_histogram" in otel_provider._histograms

    def test_record_up_down_counter_creates_counter(self, otel_provider):
        """The first record_up_down_counter call for a name adds a cache entry."""
        name = "test_updown"
        assert name not in otel_provider._up_down_counters

        otel_provider.record_up_down_counter(name, 1.0)

        # Present in the cache and holding a real instrument.
        assert otel_provider._up_down_counters.get(name) is not None

    def test_record_up_down_counter_reuses_counter(self, otel_provider):
        """An increment followed by a decrement shares one cached up/down counter."""
        name = "test_updown"
        observed = []
        for delta in (5.0, -3.0):
            otel_provider.record_up_down_counter(name, delta)
            observed.append(otel_provider._up_down_counters[name])

        assert observed[0] is observed[1]
        assert len(otel_provider._up_down_counters) == 1

    def test_multiple_metrics_with_different_names(self, otel_provider):
        """Distinct names get distinct entries in the cache for their instrument kind."""
        for name, amount in (("counter1", 1.0), ("counter2", 2.0)):
            otel_provider.record_count(name, amount)
        otel_provider.record_histogram("histogram1", 10.0)
        otel_provider.record_up_down_counter("updown1", 5.0)

        cache_sizes = (
            len(otel_provider._counters),
            len(otel_provider._histograms),
            len(otel_provider._up_down_counters),
        )
        assert cache_sizes == (2, 1, 1)


class TestOTelTelemetryProviderThreadSafety:
    """Tests for thread safety of metric operations.

    Every test fans work out over a ThreadPoolExecutor and then checks the
    size of the provider's instrument caches. Worker exceptions are re-raised
    in the test thread so that a failing record_* call fails the test instead
    of being silently discarded.
    """

    @staticmethod
    def _join_all(futures):
        """Wait for every future, then re-raise the first worker exception, if any.

        ``concurrent.futures.wait`` only blocks until completion; an exception
        raised inside a worker stays stored on its future. Calling ``result()``
        surfaces it. Futures are checked in submission order, so the reported
        failure is deterministic.
        """
        concurrent.futures.wait(futures)
        for future in futures:
            future.result()

    def test_concurrent_counter_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_count with same name are thread-safe."""
        num_threads = 50
        counter_name = "concurrent_counter"

        def record_metric():
            otel_provider.record_count(counter_name, 1.0)

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._join_all(futures)

        # Should have exactly one counter created despite concurrent access
        assert len(otel_provider._counters) == 1
        assert counter_name in otel_provider._counters

    def test_concurrent_histogram_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_histogram with same name are thread-safe."""
        num_threads = 50
        histogram_name = "concurrent_histogram"

        def record_metric():
            # Derive a per-thread sample value; ident can be None before a thread starts.
            thread_id = threading.current_thread().ident or 0
            otel_provider.record_histogram(histogram_name, float(thread_id % 100))

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._join_all(futures)

        # Should have exactly one histogram created despite concurrent access
        assert len(otel_provider._histograms) == 1
        assert histogram_name in otel_provider._histograms

    def test_concurrent_up_down_counter_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_up_down_counter with same name are thread-safe."""
        num_threads = 50
        counter_name = "concurrent_updown"

        def record_metric():
            otel_provider.record_up_down_counter(counter_name, 1.0)

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._join_all(futures)

        # Should have exactly one counter created despite concurrent access
        assert len(otel_provider._up_down_counters) == 1
        assert counter_name in otel_provider._up_down_counters

    def test_concurrent_mixed_metrics_different_names(self, otel_provider):
        """Test concurrent creation of different metric types with different names."""
        num_threads = 30

        def record_counters(thread_id):
            otel_provider.record_count(f"counter_{thread_id}", 1.0)

        def record_histograms(thread_id):
            otel_provider.record_histogram(f"histogram_{thread_id}", float(thread_id))

        def record_up_down_counters(thread_id):
            otel_provider.record_up_down_counter(f"updown_{thread_id}", float(thread_id))

        # Three tasks are submitted per index, hence the tripled worker count.
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads * 3) as executor:
            futures = []
            for i in range(num_threads):
                futures.append(executor.submit(record_counters, i))
                futures.append(executor.submit(record_histograms, i))
                futures.append(executor.submit(record_up_down_counters, i))

            self._join_all(futures)

        # Each index should have created its own metric of every kind
        assert len(otel_provider._counters) == num_threads
        assert len(otel_provider._histograms) == num_threads
        assert len(otel_provider._up_down_counters) == num_threads

    def test_concurrent_access_existing_and_new_metrics(self, otel_provider):
        """Test concurrent access mixing existing and new metric creation."""
        # Pre-create some metrics
        otel_provider.record_count("existing_counter", 1.0)
        otel_provider.record_histogram("existing_histogram", 10.0)

        num_threads = 40

        def mixed_operations(thread_id):
            # Half the tasks use existing metrics, half create new ones
            if thread_id % 2 == 0:
                otel_provider.record_count("existing_counter", 1.0)
                otel_provider.record_histogram("existing_histogram", float(thread_id))
            else:
                otel_provider.record_count(f"new_counter_{thread_id}", 1.0)
                otel_provider.record_histogram(f"new_histogram_{thread_id}", float(thread_id))

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(mixed_operations, i) for i in range(num_threads)]
            self._join_all(futures)

        # Should have existing metrics plus half of num_threads new ones
        expected_new_counters = num_threads // 2
        expected_new_histograms = num_threads // 2

        assert len(otel_provider._counters) == 1 + expected_new_counters
        assert len(otel_provider._histograms) == 1 + expected_new_histograms


class TestOTelTelemetryProviderTracing:
    """Exercises the provider's span creation and its FastAPI middleware hook."""

    def test_custom_trace_creates_span(self, otel_provider):
        """custom_trace returns an object that exposes get_span_context."""
        created = otel_provider.custom_trace("test_span")

        assert created is not None
        assert hasattr(created, "get_span_context")

    def test_custom_trace_with_attributes(self, otel_provider):
        """custom_trace accepts an attributes mapping and still returns a span."""
        created = otel_provider.custom_trace(
            "test_span",
            attributes={"key": "value", "operation": "test"},
        )

        assert created is not None

    def test_fastapi_middleware(self, otel_provider):
        """fastapi_middleware runs against a stand-in app without raising."""
        app_stub = MagicMock()

        # The call completing is the whole assertion; nothing is returned to inspect.
        otel_provider.fastapi_middleware(app_stub)


class TestOTelTelemetryProviderEdgeCases:
    """Boundary values and unusual arguments for the record_* methods."""

    def test_record_count_with_zero(self, otel_provider):
        """A zero increment still registers the counter."""
        name = "zero_counter"

        otel_provider.record_count(name, 0.0)

        assert name in otel_provider._counters

    def test_record_count_with_large_value(self, otel_provider):
        """A one-million increment still registers the counter."""
        name = "large_counter"

        otel_provider.record_count(name, 1_000_000.0)

        assert name in otel_provider._counters

    def test_record_histogram_with_negative_value(self, otel_provider):
        """A negative sample still registers the histogram."""
        name = "negative_histogram"

        otel_provider.record_histogram(name, -10.0)

        assert name in otel_provider._histograms

    def test_record_up_down_counter_with_negative_value(self, otel_provider):
        """A negative delta still registers the up/down counter."""
        name = "negative_updown"

        otel_provider.record_up_down_counter(name, -5.0)

        assert name in otel_provider._up_down_counters

    def test_metric_names_with_special_characters(self, otel_provider):
        """Names containing dots, underscores and hyphens are cached verbatim."""
        counter_name = "test.counter_name-special"
        histogram_name = "test.histogram_name-special"

        otel_provider.record_count(counter_name, 1.0)
        otel_provider.record_histogram(histogram_name, 10.0)

        assert counter_name in otel_provider._counters
        assert histogram_name in otel_provider._histograms

    def test_empty_attributes_dict(self, otel_provider):
        """An empty attributes mapping is accepted by record_count."""
        no_labels = {}

        otel_provider.record_count("test_counter", 1.0, attributes=no_labels)

        assert "test_counter" in otel_provider._counters

    def test_none_attributes(self, otel_provider):
        """Passing attributes=None explicitly is accepted by record_count."""
        no_labels = None

        otel_provider.record_count("test_counter", 1.0, attributes=no_labels)

        assert "test_counter" in otel_provider._counters