feat(tests): metrics tests (#3966)

# What does this PR do?
1. Make telemetry tests as easy as possible for users by expanding the
`SpanStub` dataclass and adding a `MetricStub` dataclass, so that
telemetry data is marshalled consistently in test fixtures and
unmarshalled and handled consistently in tests.
2. Structure server and client tests to always follow the same standards,
giving a consistent testing experience, by using the `SpanStub` and
`MetricStub` dataclass objects.
3. Enable metrics testing for the completions endpoint.
4. Correct token metrics to use histograms instead of counters, so that
they capture tokens per request rather than a cumulative count of tokens
over the lifetime of the server.

## Test Plan
These are tests
This commit is contained in:
Emilio Garcia 2025-11-05 13:26:15 -05:00 committed by GitHub
parent 2619f3552e
commit ba50790a28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 647 additions and 130 deletions

View file

@ -6,20 +6,89 @@
"""Shared helpers for telemetry test collectors."""
import os
import time
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any
@dataclass
class SpanStub:
class MetricStub:
"""Unified metric interface for both in-memory and OTLP collectors."""
name: str
attributes: dict[str, Any]
value: Any
attributes: dict[str, Any] | None = None
@dataclass
class SpanStub:
"""Unified span interface for both in-memory and OTLP collectors."""
name: str
attributes: dict[str, Any] | None = None
resource_attributes: dict[str, Any] | None = None
events: list[dict[str, Any]] | None = None
trace_id: str | None = None
span_id: str | None = None
@property
def context(self):
"""Provide context-like interface for trace_id compatibility."""
if self.trace_id is None:
return None
return type("Context", (), {"trace_id": int(self.trace_id, 16)})()
def get_trace_id(self) -> str | None:
"""Get trace ID in hex format.
Tries context.trace_id first, then falls back to direct trace_id.
"""
context = getattr(self, "context", None)
if context and getattr(context, "trace_id", None) is not None:
return f"{context.trace_id:032x}"
return getattr(self, "trace_id", None)
def has_message(self, text: str) -> bool:
"""Check if span contains a specific message in its args."""
if self.attributes is None:
return False
args = self.attributes.get("__args__")
if not args or not isinstance(args, str):
return False
return text in args
def is_root_span(self) -> bool:
"""Check if this is a root span."""
if self.attributes is None:
return False
return self.attributes.get("__root__") is True
def is_autotraced(self) -> bool:
"""Check if this span was automatically traced."""
if self.attributes is None:
return False
return self.attributes.get("__autotraced__") is True
def get_span_type(self) -> str | None:
"""Get the span type (async, sync, async_generator)."""
if self.attributes is None:
return None
return self.attributes.get("__type__")
def get_class_method(self) -> tuple[str | None, str | None]:
"""Get the class and method names for autotraced spans."""
if self.attributes is None:
return None, None
return (self.attributes.get("__class__"), self.attributes.get("__method__"))
def get_location(self) -> str | None:
"""Get the location (library_client, server) for root spans."""
if self.attributes is None:
return None
return self.attributes.get("__location__")
def _value_to_python(value: Any) -> Any:
kind = value.WhichOneof("value")
@ -56,14 +125,65 @@ def events_to_list(events: Iterable[Any]) -> list[dict[str, Any]]:
class BaseTelemetryCollector:
"""Base class for telemetry collectors that ensures consistent return types.
All collectors must return SpanStub objects to ensure test compatibility
across both library-client and server modes.
"""
# Default delay in seconds if OTEL_METRIC_EXPORT_INTERVAL is not set
_DEFAULT_BASELINE_STABILIZATION_DELAY = 0.2
def __init__(self):
self._metric_baseline: dict[tuple[str, str], float] = {}
@classmethod
def _get_baseline_stabilization_delay(cls) -> float:
"""Get baseline stabilization delay from OTEL_METRIC_EXPORT_INTERVAL.
Adds 1.5x buffer for CI environments.
"""
interval_ms = os.environ.get("OTEL_METRIC_EXPORT_INTERVAL")
if interval_ms:
try:
delay = float(interval_ms) / 1000.0
except (ValueError, TypeError):
delay = cls._DEFAULT_BASELINE_STABILIZATION_DELAY
else:
delay = cls._DEFAULT_BASELINE_STABILIZATION_DELAY
if os.environ.get("CI"):
delay *= 1.5
return delay
def _get_metric_key(self, metric: MetricStub) -> tuple[str, str]:
"""Generate a stable key for a metric based on name and attributes."""
attrs = metric.attributes or {}
attr_key = ",".join(f"{k}={v}" for k, v in sorted(attrs.items()))
return (metric.name, attr_key)
def _compute_metric_delta(self, metric: MetricStub) -> int | float | None:
"""Compute delta value for a metric from baseline.
Returns:
Delta value if metric was in baseline, absolute value if new, None if unchanged.
"""
metric_key = self._get_metric_key(metric)
if metric_key in self._metric_baseline:
baseline_value = self._metric_baseline[metric_key]
delta = metric.value - baseline_value
return delta if delta > 0 else None
else:
return metric.value
def get_spans(
self,
expected_count: int | None = None,
timeout: float = 5.0,
poll_interval: float = 0.05,
) -> tuple[Any, ...]:
import time
) -> tuple[SpanStub, ...]:
deadline = time.time() + timeout
min_count = expected_count if expected_count is not None else 1
last_len: int | None = None
@ -91,16 +211,292 @@ class BaseTelemetryCollector:
last_len = len(spans)
time.sleep(poll_interval)
def get_metrics(self) -> Any | None:
return self._snapshot_metrics()
def get_metrics(
self,
expected_count: int | None = None,
timeout: float = 5.0,
poll_interval: float = 0.05,
expect_model_id: str | None = None,
) -> dict[str, MetricStub]:
"""Poll until expected metrics are available or timeout is reached.

Every snapshot returned by ``_snapshot_metrics`` is converted to delta
values with ``_compute_metric_delta`` and merged by metric name through
``_accumulate_metric``.

Args:
expected_count: Minimum number of metrics to wait for; 1 is used when None.
timeout: Maximum time to keep polling, in seconds.
poll_interval: Time slept between polls, in seconds.
expect_model_id: Passed on to ``_accumulate_metric`` and
``_has_enough_metrics``.

Returns:
Mapping of metric name to a ``MetricStub`` carrying the delta value.
Whatever was accumulated is returned when the timeout expires, so the
mapping may hold fewer entries than requested.
"""
deadline = time.time() + timeout
min_count = expected_count if expected_count is not None else 1
accumulated_metrics = {}
# Names whose metric matched expect_model_id; maintained by _accumulate_metric.
seen_metric_names_with_model_id = set()
while time.time() < deadline:
current_metrics = self._snapshot_metrics()
if current_metrics:
for metric in current_metrics:
delta_value = self._compute_metric_delta(metric)
# None means there is no positive change against the baseline.
if delta_value is None:
continue
# Re-wrap the metric so the stub carries the delta, not the raw value.
metric_with_delta = MetricStub(
name=metric.name,
value=delta_value,
attributes=metric.attributes,
)
self._accumulate_metric(
accumulated_metrics,
metric_with_delta,
expect_model_id,
seen_metric_names_with_model_id,
)
if self._has_enough_metrics(
accumulated_metrics, seen_metric_names_with_model_id, min_count, expect_model_id
):
return accumulated_metrics
time.sleep(poll_interval)
# Timed out: return whatever was accumulated so far.
return accumulated_metrics
def _accumulate_metric(
self,
accumulated: dict[str, MetricStub],
metric: MetricStub,
expect_model_id: str | None,
seen_with_model_id: set[str],
) -> None:
"""Accumulate a metric, preferring those matching expected model_id."""
metric_name = metric.name
matches_model_id = (
expect_model_id and metric.attributes and metric.attributes.get("model_id") == expect_model_id
)
if metric_name not in accumulated:
accumulated[metric_name] = metric
if matches_model_id:
seen_with_model_id.add(metric_name)
return
existing = accumulated[metric_name]
existing_matches = (
expect_model_id and existing.attributes and existing.attributes.get("model_id") == expect_model_id
)
if matches_model_id and not existing_matches:
accumulated[metric_name] = metric
seen_with_model_id.add(metric_name)
elif matches_model_id == existing_matches:
if metric.value > existing.value:
accumulated[metric_name] = metric
if matches_model_id:
seen_with_model_id.add(metric_name)
def _has_enough_metrics(
self,
accumulated: dict[str, MetricStub],
seen_with_model_id: set[str],
min_count: int,
expect_model_id: str | None,
) -> bool:
"""Check if we have collected enough metrics."""
if len(accumulated) < min_count:
return False
if not expect_model_id:
return True
return len(seen_with_model_id) >= min_count
@staticmethod
def _convert_attributes_to_dict(attrs: Any) -> dict[str, Any]:
"""Convert various attribute types to a consistent dictionary format.
Handles mappingproxy, dict, and other attribute types.
"""
if attrs is None:
return {}
try:
return dict(attrs.items()) # type: ignore[attr-defined]
except AttributeError:
try:
return dict(attrs)
except TypeError:
return dict(attrs) if attrs else {}
@staticmethod
def _extract_trace_span_ids(span: Any) -> tuple[str | None, str | None]:
"""Extract trace_id and span_id from OpenTelemetry span object.
Handles both context-based and direct attribute access.
"""
trace_id = None
span_id = None
context = getattr(span, "context", None)
if context:
trace_id = f"{context.trace_id:032x}"
span_id = f"{context.span_id:016x}"
else:
trace_id = getattr(span, "trace_id", None)
span_id = getattr(span, "span_id", None)
return trace_id, span_id
@staticmethod
def _create_span_stub_from_opentelemetry(span: Any) -> SpanStub:
    """Build a ``SpanStub`` from an OpenTelemetry span object.

    IDs come from ``_extract_trace_span_ids`` and attributes from
    ``_convert_attributes_to_dict``; only the name, attributes and the two
    IDs are populated on the resulting stub.
    """
    ids = BaseTelemetryCollector._extract_trace_span_ids(span)
    converted = BaseTelemetryCollector._convert_attributes_to_dict(span.attributes)
    stub_attributes = converted if converted else {}
    return SpanStub(
        name=span.name,
        attributes=stub_attributes,
        trace_id=ids[0],
        span_id=ids[1],
    )
@staticmethod
def _create_span_stub_from_protobuf(span: Any, resource_attrs: dict[str, Any] | None = None) -> SpanStub:
    """Build a ``SpanStub`` from a protobuf span message.

    Attributes go through ``attributes_to_dict`` and events through
    ``events_to_list``. The trace and span IDs are hex-encoded with ``.hex()``,
    and empty values become ``None``. ``resource_attrs`` is stored unchanged
    as the stub's resource attributes.
    """
    span_attributes = attributes_to_dict(span.attributes) or {}

    span_events = None
    if span.events:
        span_events = events_to_list(span.events)

    hex_trace_id = None
    if span.trace_id:
        hex_trace_id = span.trace_id.hex()

    hex_span_id = None
    if span.span_id:
        hex_span_id = span.span_id.hex()

    return SpanStub(
        name=span.name,
        attributes=span_attributes,
        resource_attributes=resource_attrs,
        events=span_events,
        trace_id=hex_trace_id,
        span_id=hex_span_id,
    )
@staticmethod
def _extract_metric_from_opentelemetry(metric: Any) -> MetricStub | None:
    """Build a ``MetricStub`` from the first data point of an OpenTelemetry metric.

    Returns ``None`` when the metric lacks ``name`` or ``data.data_points``,
    has no data points, or its first data point has neither ``value`` nor
    ``sum``. Only the first data point is read.
    """
    if not hasattr(metric, "name") or not hasattr(metric, "data"):
        return None
    if not hasattr(metric.data, "data_points"):
        return None

    points = metric.data.data_points
    if not points or len(points) == 0:
        return None
    first_point = points[0]

    if hasattr(first_point, "value"):
        # Counter or Gauge
        reading = first_point.value
    elif hasattr(first_point, "sum"):
        # Histogram - use the sum of all recorded values
        reading = first_point.sum
    else:
        return None

    point_attributes = {}
    raw_attrs = getattr(first_point, "attributes", None)
    if raw_attrs is not None:
        if hasattr(raw_attrs, "items"):
            point_attributes = dict(raw_attrs.items())
        elif not isinstance(raw_attrs, dict):
            point_attributes = dict(raw_attrs)

    return MetricStub(
        name=metric.name,
        value=reading,
        attributes=point_attributes or {},
    )
@staticmethod
def _create_metric_stubs_from_protobuf(metric: Any) -> list[MetricStub]:
    """Turn a protobuf metric into one ``MetricStub`` per data point.

    The ``sum``, ``gauge`` and ``histogram`` fields are checked in that order,
    and only the first one that has data points is used. Data points whose
    value cannot be extracted are skipped.

    Returns:
        List of MetricStub objects, one per usable data point; empty when the
        metric has no ``name`` or no data points.
    """
    if not hasattr(metric, "name"):
        return []

    stubs = []
    for kind in ("sum", "gauge", "histogram"):
        container = getattr(metric, kind, None)
        if not container or not hasattr(container, "data_points"):
            continue

        points = container.data_points
        if not points:
            continue

        for point in points:
            point_attributes = attributes_to_dict(point.attributes) if hasattr(point, "attributes") else {}
            reading = BaseTelemetryCollector._extract_data_point_value(point, kind)
            if reading is not None:
                stubs.append(MetricStub(name=metric.name, value=reading, attributes=point_attributes))

        # Only process one metric type per metric
        break

    return stubs
@staticmethod
def _extract_data_point_value(data_point: Any, metric_type: str) -> float | int | None:
"""Extract value from a protobuf metric data point based on metric type."""
if metric_type == "sum":
if hasattr(data_point, "as_int"):
return data_point.as_int
if hasattr(data_point, "as_double"):
return data_point.as_double
elif metric_type == "gauge":
if hasattr(data_point, "as_double"):
return data_point.as_double
elif metric_type == "histogram":
# Histograms use sum field which represents cumulative sum of all recorded values
if hasattr(data_point, "sum"):
return data_point.sum
return None
def clear(self) -> None:
"""Clear telemetry data and establish baseline for metric delta computation."""
self._metric_baseline.clear()
self._clear_impl()
def _snapshot_spans(self) -> tuple[Any, ...]: # pragma: no cover - interface hook
delay = self._get_baseline_stabilization_delay()
time.sleep(delay)
baseline_metrics = self._snapshot_metrics()
if baseline_metrics:
for metric in baseline_metrics:
metric_key = self._get_metric_key(metric)
self._metric_baseline[metric_key] = metric.value
def _snapshot_spans(self) -> tuple[SpanStub, ...]: # pragma: no cover - interface hook
raise NotImplementedError
def _snapshot_metrics(self) -> Any | None: # pragma: no cover - interface hook
def _snapshot_metrics(self) -> tuple[MetricStub, ...] | None: # pragma: no cover - interface hook
raise NotImplementedError
def _clear_impl(self) -> None: # pragma: no cover - interface hook

View file

@ -6,8 +6,6 @@
"""In-memory telemetry collector for library-client tests."""
from typing import Any
import opentelemetry.metrics as otel_metrics
import opentelemetry.trace as otel_trace
from opentelemetry import metrics, trace
@ -19,46 +17,42 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanE
import llama_stack.core.telemetry.telemetry as telemetry_module
from .base import BaseTelemetryCollector, SpanStub
from .base import BaseTelemetryCollector, MetricStub, SpanStub
class InMemoryTelemetryCollector(BaseTelemetryCollector):
"""In-memory telemetry collector for library-client tests.
Converts OpenTelemetry span objects to SpanStub objects to ensure
consistent interface with OTLP collector used in server mode.
"""
def __init__(self, span_exporter: InMemorySpanExporter, metric_reader: InMemoryMetricReader) -> None:
super().__init__()
self._span_exporter = span_exporter
self._metric_reader = metric_reader
def _snapshot_spans(self) -> tuple[Any, ...]:
def _snapshot_spans(self) -> tuple[SpanStub, ...]:
spans = []
for span in self._span_exporter.get_finished_spans():
trace_id = None
span_id = None
context = getattr(span, "context", None)
if context:
trace_id = f"{context.trace_id:032x}"
span_id = f"{context.span_id:016x}"
else:
trace_id = getattr(span, "trace_id", None)
span_id = getattr(span, "span_id", None)
stub = SpanStub(
span.name,
span.attributes,
getattr(span, "resource", None),
getattr(span, "events", None),
trace_id,
span_id,
)
spans.append(stub)
spans.append(self._create_span_stub_from_opentelemetry(span))
return tuple(spans)
def _snapshot_metrics(self) -> Any | None:
def _snapshot_metrics(self) -> tuple[MetricStub, ...] | None:
data = self._metric_reader.get_metrics_data()
if data and data.resource_metrics:
resource_metric = data.resource_metrics[0]
if not data or not data.resource_metrics:
return None
metric_stubs = []
for resource_metric in data.resource_metrics:
if resource_metric.scope_metrics:
return resource_metric.scope_metrics[0].metrics
return None
for scope_metric in resource_metric.scope_metrics:
for metric in scope_metric.metrics:
metric_stub = self._extract_metric_from_opentelemetry(metric)
if metric_stub:
metric_stubs.append(metric_stub)
return tuple(metric_stubs) if metric_stubs else None
def _clear_impl(self) -> None:
self._span_exporter.clear()

View file

@ -9,20 +9,21 @@
import gzip
import os
import threading
import time
from http.server import BaseHTTPRequestHandler, HTTPServer
from socketserver import ThreadingMixIn
from typing import Any
from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2 import ExportMetricsServiceRequest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceRequest
from .base import BaseTelemetryCollector, SpanStub, attributes_to_dict, events_to_list
from .base import BaseTelemetryCollector, MetricStub, SpanStub, attributes_to_dict
class OtlpHttpTestCollector(BaseTelemetryCollector):
def __init__(self) -> None:
super().__init__()
self._spans: list[SpanStub] = []
self._metrics: list[Any] = []
self._metrics: list[MetricStub] = []
self._lock = threading.Lock()
class _ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
@ -47,11 +48,7 @@ class OtlpHttpTestCollector(BaseTelemetryCollector):
for scope_spans in resource_spans.scope_spans:
for span in scope_spans.spans:
attributes = attributes_to_dict(span.attributes)
events = events_to_list(span.events) if span.events else None
trace_id = span.trace_id.hex() if span.trace_id else None
span_id = span.span_id.hex() if span.span_id else None
new_spans.append(SpanStub(span.name, attributes, resource_attrs or None, events, trace_id, span_id))
new_spans.append(self._create_span_stub_from_protobuf(span, resource_attrs or None))
if not new_spans:
return
@ -60,10 +57,13 @@ class OtlpHttpTestCollector(BaseTelemetryCollector):
self._spans.extend(new_spans)
def _handle_metrics(self, request: ExportMetricsServiceRequest) -> None:
new_metrics: list[Any] = []
new_metrics: list[MetricStub] = []
for resource_metrics in request.resource_metrics:
for scope_metrics in resource_metrics.scope_metrics:
new_metrics.extend(scope_metrics.metrics)
for metric in scope_metrics.metrics:
# Handle multiple data points per metric (e.g., different attribute sets)
metric_stubs = self._create_metric_stubs_from_protobuf(metric)
new_metrics.extend(metric_stubs)
if not new_metrics:
return
@ -75,11 +75,40 @@ class OtlpHttpTestCollector(BaseTelemetryCollector):
with self._lock:
return tuple(self._spans)
def _snapshot_metrics(self) -> Any | None:
def _snapshot_metrics(self) -> tuple[MetricStub, ...] | None:
with self._lock:
return list(self._metrics) if self._metrics else None
return tuple(self._metrics) if self._metrics else None
def _clear_impl(self) -> None:
"""Clear telemetry over a period of time to prevent race conditions between tests.

The span and metric buffers are emptied, then their sizes are polled every
50 ms until they have stayed unchanged for 4 consecutive polls or 2 seconds
have passed, and finally the buffers are emptied a second time.
"""
# First clear: drop everything collected so far.
with self._lock:
self._spans.clear()
self._metrics.clear()
# Prevent race conditions where telemetry arrives after clear() but before
# the test starts, causing contamination between tests
deadline = time.time() + 2.0 # Maximum wait time
last_span_count = 0
last_metric_count = 0
stable_iterations = 0
while time.time() < deadline:
# Read both buffer sizes under the lock.
with self._lock:
current_span_count = len(self._spans)
current_metric_count = len(self._metrics)
if current_span_count == last_span_count and current_metric_count == last_metric_count:
stable_iterations += 1
if stable_iterations >= 4: # 4 * 50ms = 200ms of stability
break
else:
# Something new arrived: restart the stability count from the new sizes.
stable_iterations = 0
last_span_count = current_span_count
last_metric_count = current_metric_count
time.sleep(0.05)
# Final clear to remove any telemetry that arrived during stabilization
with self._lock:
self._spans.clear()
self._metrics.clear()