feat(tests): metrics tests (#3966)

# What does this PR do?

1. Make telemetry tests as easy as possible for users by expanding the `SpanStub` dataclass and creating the `MetricStub` dataclass, which give a consistent way to marshal telemetry data in test fixtures and to unmarshal and handle it in tests.
2. Structure server and client tests to always follow the same standards, for a consistent testing experience, by using the `SpanStub` and `MetricStub` dataclass objects.
3. Enable metrics testing for the completions endpoint.
4. Correct token metrics to use histograms instead of counters, so that they capture tokens per request rather than a cumulative count of tokens over the lifetime of the server.

## Test Plan

These changes are tests.
2025-12-03 09:53:45 +00:00 · 2025-11-05 13:26:15 -05:00 · 2025-11-05 13:26:15 -05:00 · ba50790a28
commit ba50790a28
parent 2619f3552e
7 changed files with 647 additions and 130 deletions
--- a/src/llama_stack/core/telemetry/telemetry.py
+++ b/src/llama_stack/core/telemetry/telemetry.py
@@ -427,6 +427,7 @@ _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
    "counters": {},
    "gauges": {},
    "up_down_counters": {},
+    "histograms": {},
 }
 _global_lock = threading.Lock()
 _TRACER_PROVIDER = None
@@ -540,6 +541,16 @@ class Telemetry:
            )
        return cast(metrics.ObservableGauge, _GLOBAL_STORAGE["gauges"][name])

+    def _get_or_create_histogram(self, name: str, unit: str) -> metrics.Histogram:
+        assert self.meter is not None
+        if name not in _GLOBAL_STORAGE["histograms"]:
+            _GLOBAL_STORAGE["histograms"][name] = self.meter.create_histogram(
+                name=name,
+                unit=unit,
+                description=f"Histogram for {name}",
+            )
+        return cast(metrics.Histogram, _GLOBAL_STORAGE["histograms"][name])
+
    def _log_metric(self, event: MetricEvent) -> None:
        # Add metric as an event to the current span
        try:
@@ -571,7 +582,16 @@ class Telemetry:
        # Log to OpenTelemetry meter if available
        if self.meter is None:
            return
-        if isinstance(event.value, int):
+
+        # Use histograms for token-related metrics (per-request measurements)
+        # Use counters for other cumulative metrics
+        token_metrics = {"prompt_tokens", "completion_tokens", "total_tokens"}
+
+        if event.metric in token_metrics:
+            # Token metrics are per-request measurements, use histogram
+            histogram = self._get_or_create_histogram(event.metric, event.unit)
+            histogram.record(event.value, attributes=_clean_attributes(event.attributes))
+        elif isinstance(event.value, int):
            counter = self._get_or_create_counter(event.metric, event.unit)
            counter.add(event.value, attributes=_clean_attributes(event.attributes))
        elif isinstance(event.value, float):