mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
feat: add agent workflow metrics collection
Add comprehensive OpenTelemetry-based metrics for agent observability: - Workflow completion/failure tracking with duration measurements - Step execution counters for performance monitoring - Tool usage tracking with normalized tool names - Non-blocking telemetry emission with named async tasks - Comprehensive unit and integration test coverage - Graceful handling when telemetry is disabled
This commit is contained in:
parent
4c2fcb6b51
commit
69b692af91
13 changed files with 701 additions and 11 deletions
|
@ -24,6 +24,7 @@ from llama_stack.apis.telemetry import (
|
|||
MetricEvent,
|
||||
MetricLabelMatcher,
|
||||
MetricQueryType,
|
||||
MetricType,
|
||||
QueryCondition,
|
||||
QueryMetricsResponse,
|
||||
QuerySpanTreeResponse,
|
||||
|
@ -56,6 +57,7 @@ _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
|
|||
"counters": {},
|
||||
"gauges": {},
|
||||
"up_down_counters": {},
|
||||
"histograms": {},
|
||||
}
|
||||
_global_lock = threading.Lock()
|
||||
_TRACER_PROVIDER = None
|
||||
|
@ -258,12 +260,20 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
|
|||
# Log to OpenTelemetry meter if available
|
||||
if self.meter is None:
|
||||
return
|
||||
if isinstance(event.value, int):
|
||||
counter = self._get_or_create_counter(event.metric, event.unit)
|
||||
counter.add(event.value, attributes=event.attributes)
|
||||
elif isinstance(event.value, float):
|
||||
|
||||
if event.metric_type == MetricType.HISTOGRAM:
|
||||
histogram = self._get_or_create_histogram(
|
||||
event.metric,
|
||||
event.unit,
|
||||
[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0],
|
||||
)
|
||||
histogram.record(event.value, attributes=event.attributes)
|
||||
elif event.metric_type == MetricType.UP_DOWN_COUNTER:
|
||||
up_down_counter = self._get_or_create_up_down_counter(event.metric, event.unit)
|
||||
up_down_counter.add(event.value, attributes=event.attributes)
|
||||
else:
|
||||
counter = self._get_or_create_counter(event.metric, event.unit)
|
||||
counter.add(event.value, attributes=event.attributes)
|
||||
|
||||
def _get_or_create_up_down_counter(self, name: str, unit: str) -> metrics.UpDownCounter:
|
||||
assert self.meter is not None
|
||||
|
@ -275,6 +285,16 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
|
|||
)
|
||||
return _GLOBAL_STORAGE["up_down_counters"][name]
|
||||
|
||||
def _get_or_create_histogram(self, name: str, unit: str, buckets: list[float] | None = None) -> metrics.Histogram:
|
||||
assert self.meter is not None
|
||||
if name not in _GLOBAL_STORAGE["histograms"]:
|
||||
_GLOBAL_STORAGE["histograms"][name] = self.meter.create_histogram(
|
||||
name=name,
|
||||
unit=unit,
|
||||
description=f"Histogram for {name}",
|
||||
)
|
||||
return _GLOBAL_STORAGE["histograms"][name]
|
||||
|
||||
def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None:
|
||||
with self._lock:
|
||||
span_id = int(event.span_id, 16)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue