diff --git a/tests/integration/telemetry/collectors/base.py b/tests/integration/telemetry/collectors/base.py
index a85e6cf3f..50168ebec 100644
--- a/tests/integration/telemetry/collectors/base.py
+++ b/tests/integration/telemetry/collectors/base.py
@@ -6,7 +6,7 @@
 
 """Shared helpers for telemetry test collectors."""
 
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
 from dataclasses import dataclass
 from typing import Any
 
@@ -14,7 +14,7 @@ from typing import Any
 @dataclass
 class SpanStub:
     name: str
-    attributes: dict[str, Any]
+    attributes: Mapping[str, Any] | None = None
     resource_attributes: dict[str, Any] | None = None
     events: list[dict[str, Any]] | None = None
     trace_id: str | None = None
diff --git a/tests/integration/telemetry/collectors/in_memory.py b/tests/integration/telemetry/collectors/in_memory.py
index 2cf320f7b..a93d1ac4c 100644
--- a/tests/integration/telemetry/collectors/in_memory.py
+++ b/tests/integration/telemetry/collectors/in_memory.py
@@ -55,9 +55,12 @@ class InMemoryTelemetryCollector(BaseTelemetryCollector):
     def _snapshot_metrics(self) -> Any | None:
         data = self._metric_reader.get_metrics_data()
         if data and data.resource_metrics:
-            resource_metric = data.resource_metrics[0]
-            if resource_metric.scope_metrics:
-                return resource_metric.scope_metrics[0].metrics
+            all_metrics = []
+            for resource_metric in data.resource_metrics:
+                if resource_metric.scope_metrics:
+                    for scope_metric in resource_metric.scope_metrics:
+                        all_metrics.extend(scope_metric.metrics)
+            return all_metrics if all_metrics else None
         return None
 
     def _clear_impl(self) -> None:
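Note on the in_memory.py change above: a minimal sketch of the new flattening behavior in isolation. The Metric, ScopeMetrics, and ResourceMetrics dataclasses are hypothetical stand-ins modeling only the fields _snapshot_metrics reads; they are not the real OpenTelemetry SDK containers.

from dataclasses import dataclass, field

# Hypothetical stand-ins for the OpenTelemetry SDK metric containers;
# only the fields read by _snapshot_metrics are modeled.
@dataclass
class Metric:
    name: str

@dataclass
class ScopeMetrics:
    metrics: list[Metric] = field(default_factory=list)

@dataclass
class ResourceMetrics:
    scope_metrics: list[ScopeMetrics] = field(default_factory=list)

def flatten_metrics(resource_metrics: list[ResourceMetrics]) -> list[Metric] | None:
    # Walk every resource and every scope, as the new code does, instead
    # of returning only resource_metrics[0].scope_metrics[0].metrics.
    all_metrics: list[Metric] = []
    for resource_metric in resource_metrics:
        for scope_metric in resource_metric.scope_metrics:
            all_metrics.extend(scope_metric.metrics)
    return all_metrics or None

# Two resources with one scope each: the old first-element-only logic
# would have dropped "completion_tokens"; the flattened version keeps both.
data = [
    ResourceMetrics([ScopeMetrics([Metric("prompt_tokens")])]),
    ResourceMetrics([ScopeMetrics([Metric("completion_tokens")])]),
]
assert [m.name for m in flatten_metrics(data)] == ["prompt_tokens", "completion_tokens"]

The previous code silently dropped metrics exported under any resource or scope other than the first, which is presumably why the token-usage assertions below could not be relied on before this change.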
diff --git a/tests/integration/telemetry/test_completions.py b/tests/integration/telemetry/test_completions.py
index 5322f021a..aa16ccdb6 100644
--- a/tests/integration/telemetry/test_completions.py
+++ b/tests/integration/telemetry/test_completions.py
@@ -140,21 +140,26 @@ def test_telemetry_format_completeness(mock_otlp_collector, llama_stack_client,
     # At least one span should capture the fully qualified model ID
     assert text_model_id in logged_model_ids, f"Expected to find {text_model_id} in spans, but got {logged_model_ids}"
 
-    # TODO: re-enable this once metrics get fixed
-    """
     # Verify token usage metrics in response
     metrics = mock_otlp_collector.get_metrics()
-    assert metrics
+    assert metrics, "Expected metrics to be generated"
+
+    # Convert metrics to a dictionary for easier lookup
+    metrics_dict = {}
     for metric in metrics:
-        assert metric.name in ["completion_tokens", "total_tokens", "prompt_tokens"]
-        assert metric.unit == "tokens"
-        assert metric.data.data_points and len(metric.data.data_points) == 1
-        match metric.name:
-            case "completion_tokens":
-                assert metric.data.data_points[0].value == usage["completion_tokens"]
-            case "total_tokens":
-                assert metric.data.data_points[0].value == usage["total_tokens"]
-            case "prompt_tokens":
-                assert metric.data.data_points[0].value == usage["prompt_tokens"
-    """
+        if hasattr(metric, "name") and hasattr(metric, "data") and hasattr(metric.data, "data_points"):
+            if metric.data.data_points and len(metric.data.data_points) > 0:
+                # Get the value from the first data point
+                value = metric.data.data_points[0].value
+                metrics_dict[metric.name] = value
+
+    # Verify expected metrics are present
+    expected_metrics = ["completion_tokens", "total_tokens", "prompt_tokens"]
+    for metric_name in expected_metrics:
+        assert metric_name in metrics_dict, f"Expected metric {metric_name} not found in {list(metrics_dict.keys())}"
+
+    # Verify metric values match usage data
+    assert metrics_dict["completion_tokens"] == usage["completion_tokens"]
+    assert metrics_dict["total_tokens"] == usage["total_tokens"]
+    assert metrics_dict["prompt_tokens"] == usage["prompt_tokens"]
 
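Note on the re-enabled assertions: a standalone sketch of the name-to-value lookup the test now builds before comparing against usage. DataPoint, MetricData, and FakeMetric are hypothetical stand-ins for the OTLP metric objects the mock collector returns, not actual collector types.

from dataclasses import dataclass

# Hypothetical stand-ins for the exported metric objects; only the
# attributes the test reads (name, data.data_points[0].value) are modeled.
@dataclass
class DataPoint:
    value: int

@dataclass
class MetricData:
    data_points: list[DataPoint]

@dataclass
class FakeMetric:
    name: str
    data: MetricData

usage = {"completion_tokens": 5, "total_tokens": 15, "prompt_tokens": 10}
metrics = [FakeMetric(n, MetricData([DataPoint(v)])) for n, v in usage.items()]

# Same name -> first-data-point-value mapping the test builds before asserting.
metrics_dict = {m.name: m.data.data_points[0].value for m in metrics if m.data.data_points}
assert metrics_dict == usage

Unlike the removed match-based version, which required exactly one data point per metric and checked metric.unit, the lookup indexes only the first data point and lets the membership assertion report any missing metric by name.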