fix(telemetry): add integration and unit tests for otel provider

This commit is contained in:
Emilio Garcia 2025-10-02 17:46:53 -04:00
parent e45e77f7b0
commit 9a0294ab4f
11 changed files with 1052 additions and 30 deletions

View file

@ -0,0 +1,368 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import concurrent.futures
import threading
from unittest.mock import MagicMock
import pytest
from llama_stack.providers.inline.telemetry.otel.config import OTelTelemetryConfig
from llama_stack.providers.inline.telemetry.otel.otel import OTelTelemetryProvider
@pytest.fixture
def otel_config():
    """Build a minimal OTelTelemetryConfig that selects the "simple" span processor."""
    settings = {
        "service_name": "test-service",
        "service_version": "1.0.0",
        "deployment_environment": "test",
        "span_processor": "simple",
    }
    return OTelTelemetryConfig(**settings)
@pytest.fixture
def otel_provider(otel_config, monkeypatch):
    """Return an OTelTelemetryProvider constructed with an OTLP endpoint env var set."""
    # The endpoint variable is set before construction so the provider does not
    # emit its missing-endpoint warnings.
    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
    provider = OTelTelemetryProvider(config=otel_config)
    return provider
class TestOTelTelemetryProviderInitialization:
    """Construction-time checks for OTelTelemetryProvider."""

    def test_initialization_with_valid_config(self, otel_config, monkeypatch):
        """A valid config yields a provider that keeps the config, a lock and dict caches."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

        instance = OTelTelemetryProvider(config=otel_config)

        assert instance.config == otel_config
        assert hasattr(instance, "_lock")
        assert instance._lock is not None
        # Each per-instrument cache is expected to be a plain dict.
        for cache_attr in ("_counters", "_histograms", "_up_down_counters", "_gauges"):
            assert isinstance(getattr(instance, cache_attr), dict)

    def test_initialization_sets_service_attributes(self, otel_config, monkeypatch):
        """The service identity fields remain readable through ``provider.config``."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

        stored = OTelTelemetryProvider(config=otel_config).config

        assert stored.service_name == "test-service"
        assert stored.service_version == "1.0.0"
        assert stored.deployment_environment == "test"

    def test_initialization_with_batch_processor(self, monkeypatch):
        """The provider can be constructed when the config asks for the "batch" processor."""
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
        batch_config = OTelTelemetryConfig(
            service_name="test-service",
            service_version="1.0.0",
            deployment_environment="test",
            span_processor="batch",
        )

        instance = OTelTelemetryProvider(config=batch_config)

        assert instance.config.span_processor == "batch"

    def test_warns_when_endpoints_missing(self, otel_config, monkeypatch, caplog):
        """Construction without any OTLP endpoint logs trace and metric export warnings."""
        # Clear every endpoint variable; raising=False tolerates ones that are unset.
        for env_name in (
            "OTEL_EXPORTER_OTLP_ENDPOINT",
            "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
            "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
        ):
            monkeypatch.delenv(env_name, raising=False)

        OTelTelemetryProvider(config=otel_config)

        logged = [record.message for record in caplog.records]
        assert any("Traces will not be exported" in text for text in logged)
        assert any("Metrics will not be exported" in text for text in logged)
class TestOTelTelemetryProviderMetrics:
    """Checks that the record_* methods populate and reuse the provider's instrument caches."""

    def test_record_count_creates_counter(self, otel_provider):
        """The first record_count call for a name adds an entry to ``_counters``."""
        name = "test_counter"
        assert name not in otel_provider._counters

        otel_provider.record_count(name, 1.0)

        assert name in otel_provider._counters
        assert otel_provider._counters[name] is not None

    def test_record_count_reuses_counter(self, otel_provider):
        """A second record_count call for the same name keeps the cached counter object."""
        name = "test_counter"

        otel_provider.record_count(name, 1.0)
        before = otel_provider._counters[name]
        otel_provider.record_count(name, 2.0)
        after = otel_provider._counters[name]

        assert before is after
        assert len(otel_provider._counters) == 1

    def test_record_count_with_attributes(self, otel_provider):
        """record_count accepts an attributes mapping."""
        labels = {"key": "value", "env": "test"}

        otel_provider.record_count("test_counter", 1.0, attributes=labels)

        assert "test_counter" in otel_provider._counters

    def test_record_histogram_creates_histogram(self, otel_provider):
        """The first record_histogram call for a name adds an entry to ``_histograms``."""
        name = "test_histogram"
        assert name not in otel_provider._histograms

        otel_provider.record_histogram(name, 42.5)

        assert name in otel_provider._histograms
        assert otel_provider._histograms[name] is not None

    def test_record_histogram_reuses_histogram(self, otel_provider):
        """A second record_histogram call for the same name keeps the cached histogram."""
        name = "test_histogram"

        otel_provider.record_histogram(name, 10.0)
        before = otel_provider._histograms[name]
        otel_provider.record_histogram(name, 20.0)
        after = otel_provider._histograms[name]

        assert before is after
        assert len(otel_provider._histograms) == 1

    def test_record_histogram_with_bucket_boundaries(self, otel_provider):
        """record_histogram accepts ``explicit_bucket_boundaries_advisory``."""
        otel_provider.record_histogram(
            "test_histogram",
            25.0,
            explicit_bucket_boundaries_advisory=[0.0, 10.0, 50.0, 100.0],
        )

        assert "test_histogram" in otel_provider._histograms

    def test_record_up_down_counter_creates_counter(self, otel_provider):
        """The first record_up_down_counter call adds an entry to ``_up_down_counters``."""
        name = "test_updown"
        assert name not in otel_provider._up_down_counters

        otel_provider.record_up_down_counter(name, 1.0)

        assert name in otel_provider._up_down_counters
        assert otel_provider._up_down_counters[name] is not None

    def test_record_up_down_counter_reuses_counter(self, otel_provider):
        """A second record_up_down_counter call for the same name keeps the cached object."""
        name = "test_updown"

        otel_provider.record_up_down_counter(name, 5.0)
        before = otel_provider._up_down_counters[name]
        otel_provider.record_up_down_counter(name, -3.0)
        after = otel_provider._up_down_counters[name]

        assert before is after
        assert len(otel_provider._up_down_counters) == 1

    def test_multiple_metrics_with_different_names(self, otel_provider):
        """Distinct names produce distinct cache entries, tracked per instrument kind."""
        otel_provider.record_count("counter1", 1.0)
        otel_provider.record_count("counter2", 2.0)
        otel_provider.record_histogram("histogram1", 10.0)
        otel_provider.record_up_down_counter("updown1", 5.0)

        assert len(otel_provider._counters) == 2
        assert len(otel_provider._histograms) == 1
        assert len(otel_provider._up_down_counters) == 1
class TestOTelTelemetryProviderThreadSafety:
    """Tests for thread safety of metric operations.

    Each test fans calls out over a thread pool and then calls ``result()`` on
    every future, so an exception raised inside a worker fails the test instead
    of being silently discarded.
    """

    @staticmethod
    def _raise_worker_errors(futures):
        """Block until every future finishes, re-raising the first worker exception.

        ``concurrent.futures.wait`` only waits for completion; it never surfaces
        exceptions stored on the futures. ``Future.result()`` re-raises them.
        """
        for future in futures:
            future.result()

    def test_concurrent_counter_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_count with same name are thread-safe."""
        num_threads = 50
        counter_name = "concurrent_counter"

        def record_metric():
            otel_provider.record_count(counter_name, 1.0)

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._raise_worker_errors(futures)

        # Should have exactly one counter created despite concurrent access
        assert len(otel_provider._counters) == 1
        assert counter_name in otel_provider._counters

    def test_concurrent_histogram_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_histogram with same name are thread-safe."""
        num_threads = 50
        histogram_name = "concurrent_histogram"

        def record_metric():
            # ident can be None for a thread that has not started; fall back to 0.
            thread_id = threading.current_thread().ident or 0
            otel_provider.record_histogram(histogram_name, float(thread_id % 100))

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._raise_worker_errors(futures)

        # Should have exactly one histogram created despite concurrent access
        assert len(otel_provider._histograms) == 1
        assert histogram_name in otel_provider._histograms

    def test_concurrent_up_down_counter_creation_same_name(self, otel_provider):
        """Test that concurrent calls to record_up_down_counter with same name are thread-safe."""
        num_threads = 50
        counter_name = "concurrent_updown"

        def record_metric():
            otel_provider.record_up_down_counter(counter_name, 1.0)

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(record_metric) for _ in range(num_threads)]
            self._raise_worker_errors(futures)

        # Should have exactly one counter created despite concurrent access
        assert len(otel_provider._up_down_counters) == 1
        assert counter_name in otel_provider._up_down_counters

    def test_concurrent_mixed_metrics_different_names(self, otel_provider):
        """Test concurrent creation of different metric types with different names."""
        num_threads = 30

        def record_counters(thread_id):
            otel_provider.record_count(f"counter_{thread_id}", 1.0)

        def record_histograms(thread_id):
            otel_provider.record_histogram(f"histogram_{thread_id}", float(thread_id))

        def record_up_down_counters(thread_id):
            otel_provider.record_up_down_counter(f"updown_{thread_id}", float(thread_id))

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads * 3) as executor:
            futures = []
            for i in range(num_threads):
                futures.append(executor.submit(record_counters, i))
                futures.append(executor.submit(record_histograms, i))
                futures.append(executor.submit(record_up_down_counters, i))
            self._raise_worker_errors(futures)

        # Each submitted index should have created its own metric of each kind
        assert len(otel_provider._counters) == num_threads
        assert len(otel_provider._histograms) == num_threads
        assert len(otel_provider._up_down_counters) == num_threads

    def test_concurrent_access_existing_and_new_metrics(self, otel_provider):
        """Test concurrent access mixing existing and new metric creation."""
        # Pre-create some metrics
        otel_provider.record_count("existing_counter", 1.0)
        otel_provider.record_histogram("existing_histogram", 10.0)
        num_threads = 40

        def mixed_operations(thread_id):
            # Even ids hit the pre-created metrics, odd ids create new ones
            if thread_id % 2 == 0:
                otel_provider.record_count("existing_counter", 1.0)
                otel_provider.record_histogram("existing_histogram", float(thread_id))
            else:
                otel_provider.record_count(f"new_counter_{thread_id}", 1.0)
                otel_provider.record_histogram(f"new_histogram_{thread_id}", float(thread_id))

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(mixed_operations, i) for i in range(num_threads)]
            self._raise_worker_errors(futures)

        # Should have existing metrics plus half of num_threads new ones
        expected_new_counters = num_threads // 2
        expected_new_histograms = num_threads // 2
        assert len(otel_provider._counters) == 1 + expected_new_counters
        assert len(otel_provider._histograms) == 1 + expected_new_histograms
class TestOTelTelemetryProviderTracing:
    """Smoke checks for the provider's tracing entry points."""

    def test_custom_trace_creates_span(self, otel_provider):
        """custom_trace returns an object that exposes ``get_span_context``."""
        created = otel_provider.custom_trace("test_span")

        assert created is not None
        assert hasattr(created, "get_span_context")

    def test_custom_trace_with_attributes(self, otel_provider):
        """custom_trace accepts an attributes mapping and still returns a span."""
        created = otel_provider.custom_trace(
            "test_span",
            attributes={"key": "value", "operation": "test"},
        )

        assert created is not None

    def test_fastapi_middleware(self, otel_provider):
        """fastapi_middleware can be invoked with a stand-in app object."""
        fake_app = MagicMock()

        # Passing means the call completed without raising.
        otel_provider.fastapi_middleware(fake_app)
class TestOTelTelemetryProviderEdgeCases:
    """Boundary-value and unusual-argument checks for the record_* methods."""

    def test_record_count_with_zero(self, otel_provider):
        """Recording a zero amount still registers the counter."""
        name = "zero_counter"
        otel_provider.record_count(name, 0.0)

        assert name in otel_provider._counters

    def test_record_count_with_large_value(self, otel_provider):
        """Recording a large amount still registers the counter."""
        name = "large_counter"
        otel_provider.record_count(name, 1_000_000.0)

        assert name in otel_provider._counters

    def test_record_histogram_with_negative_value(self, otel_provider):
        """Recording a negative observation still registers the histogram."""
        name = "negative_histogram"
        otel_provider.record_histogram(name, -10.0)

        assert name in otel_provider._histograms

    def test_record_up_down_counter_with_negative_value(self, otel_provider):
        """Recording a negative delta still registers the up/down counter."""
        name = "negative_updown"
        otel_provider.record_up_down_counter(name, -5.0)

        assert name in otel_provider._up_down_counters

    def test_metric_names_with_special_characters(self, otel_provider):
        """Names containing dots, underscores and hyphens are used verbatim as cache keys."""
        counter_name = "test.counter_name-special"
        histogram_name = "test.histogram_name-special"

        otel_provider.record_count(counter_name, 1.0)
        otel_provider.record_histogram(histogram_name, 10.0)

        assert counter_name in otel_provider._counters
        assert histogram_name in otel_provider._histograms

    def test_empty_attributes_dict(self, otel_provider):
        """An empty attributes dict is accepted by record_count."""
        otel_provider.record_count("test_counter", 1.0, attributes={})

        assert "test_counter" in otel_provider._counters

    def test_none_attributes(self, otel_provider):
        """``attributes=None`` is accepted by record_count."""
        otel_provider.record_count("test_counter", 1.0, attributes=None)

        assert "test_counter" in otel_provider._counters