feat: add agent workflow metrics collection

Add comprehensive OpenTelemetry-based metrics for agent observability:

- Workflow completion/failure tracking with duration measurements
- Step execution counters for performance monitoring
- Tool usage tracking with normalized tool names
- Non-blocking telemetry emission via named fire-and-forget asyncio tasks (see the sketch after this list)
- Unit and integration test coverage
- Graceful no-op behavior when telemetry is disabled
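
A minimal, self-contained sketch of that emission pattern (the stub class and
free function below are illustrative stand-ins, not this commit's API; the
real logic lives in ChatAgent._emit_metric in the diff):

    import asyncio
    import time

    class StubTelemetry:
        # Illustrative stand-in; the real Telemetry API logs MetricEvent
        # objects that also carry OpenTelemetry trace/span IDs.
        async def log_event(self, event: dict) -> None:
            print("emitted:", event)

    def emit_metric(telemetry, agent_id: str, name: str, value: float, unit: str) -> None:
        if telemetry is None:
            return  # telemetry disabled: emission is a graceful no-op
        event = {
            "metric": name,
            "value": value,
            "unit": unit,
            "timestamp": time.time(),
            "attributes": {"agent_id": agent_id},
        }
        # Fire-and-forget: the agent loop never blocks on telemetry I/O.
        # Naming the task helps debugging and any later cleanup logic.
        asyncio.create_task(telemetry.log_event(event), name=f"metric-{name}-{agent_id}")

    async def main() -> None:
        emit_metric(StubTelemetry(), "agent-123", "llama_stack_agent_steps_total", 1, "1")
        await asyncio.sleep(0)  # yield once so the demo task gets to run

    asyncio.run(main())
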
Author: skamenan7
Date: 2025-08-06 17:08:03 -04:00
Commit: 69b692af91 (parent 4c2fcb6b51)
13 changed files with 701 additions and 11 deletions

--- file: ChatAgent implementation ---

@@ -4,17 +4,20 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
import copy
import json
import re
import secrets
import string
import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from datetime import UTC, datetime

import httpx
from opentelemetry.trace import get_current_span

from llama_stack.apis.agents import (
    AgentConfig,
@@ -60,6 +63,7 @@ from llama_stack.apis.inference import (
    UserMessage,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.telemetry import MetricEvent, Telemetry
from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import AccessRule
@@ -97,6 +101,7 @@ class ChatAgent(ShieldRunnerMixin):
        tool_runtime_api: ToolRuntime,
        tool_groups_api: ToolGroups,
        vector_io_api: VectorIO,
        telemetry_api: Telemetry | None,
        persistence_store: KVStore,
        created_at: str,
        policy: list[AccessRule],
@@ -106,6 +111,7 @@ class ChatAgent(ShieldRunnerMixin):
        self.inference_api = inference_api
        self.safety_api = safety_api
        self.vector_io_api = vector_io_api
        self.telemetry_api = telemetry_api
        self.storage = AgentPersistence(agent_id, persistence_store, policy)
        self.tool_runtime_api = tool_runtime_api
        self.tool_groups_api = tool_groups_api
@@ -167,6 +173,43 @@ class ChatAgent(ShieldRunnerMixin):
    async def create_session(self, name: str) -> str:
        return await self.storage.create_session(name)

    def _emit_metric(
        self, metric_name: str, value: int | float, unit: str, attributes: dict[str, str] | None = None
    ) -> None:
        """Emit a single metric event"""
        if not self.telemetry_api:
            return

        span = get_current_span()
        if not span:
            return

        context = span.get_span_context()
        metric = MetricEvent(
            trace_id=format(context.trace_id, "x"),
            span_id=format(context.span_id, "x"),
            metric=metric_name,
            value=value,
            timestamp=time.time(),
            unit=unit,
            attributes={"agent_id": self.agent_id, **(attributes or {})},
        )

        # Create task with name for better debugging and potential cleanup
        task_name = f"metric-{metric_name}-{self.agent_id}"
        asyncio.create_task(self.telemetry_api.log_event(metric), name=task_name)

    def _track_step(self):
        self._emit_metric("llama_stack_agent_steps_total", 1, "1")

    def _track_workflow(self, status: str, duration: float):
        self._emit_metric("llama_stack_agent_workflows_total", 1, "1", {"status": status})
        self._emit_metric("llama_stack_agent_workflow_duration_seconds", duration, "s")

    def _track_tool(self, tool_name: str):
        normalized_name = "rag" if tool_name == "knowledge_search" else tool_name
        self._emit_metric("llama_stack_agent_tool_calls_total", 1, "1", {"tool": normalized_name})

    async def get_messages_from_turns(self, turns: list[Turn]) -> list[Message]:
        messages = []
        if self.agent_config.instructions != "":
@@ -726,6 +769,9 @@
                )
            )

            # Track step execution metric
            self._track_step()

            # Add the result message to input_messages for the next iteration
            input_messages.append(result_message)
@@ -900,6 +946,7 @@
            },
        )

        logger.debug(f"tool call {tool_name_str} completed with result: {result}")
        return result
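
The call sites for _track_workflow are in hunks not shown in this capture; a
plausible timing pattern consistent with its (status, duration) signature
(this wrapper is an illustration, not the commit's code):

    import time

    def run_with_workflow_metrics(track_workflow, work):
        # Measure wall-clock duration and report terminal status, mirroring
        # the _track_workflow(status, duration) helper above.
        start = time.time()
        try:
            result = work()
            track_workflow("completed", time.time() - start)
            return result
        except Exception:
            track_workflow("failed", time.time() - start)
            raise

    # Demo with a print-based tracker:
    run_with_workflow_metrics(lambda status, dur: print(status, round(dur, 3)), lambda: "ok")
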

--- file: MetaReferenceAgentsImpl provider ---

@@ -38,6 +38,7 @@ from llama_stack.apis.inference import (
    UserMessage,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import AccessRule
@@ -64,6 +65,7 @@ class MetaReferenceAgentsImpl(Agents):
        tool_runtime_api: ToolRuntime,
        tool_groups_api: ToolGroups,
        policy: list[AccessRule],
        telemetry_api: Telemetry | None = None,
    ):
        self.config = config
        self.inference_api = inference_api
@@ -71,6 +73,7 @@ class MetaReferenceAgentsImpl(Agents):
        self.safety_api = safety_api
        self.tool_runtime_api = tool_runtime_api
        self.tool_groups_api = tool_groups_api
        self.telemetry_api = telemetry_api

        self.in_memory_store = InmemoryKVStoreImpl()
        self.openai_responses_impl: OpenAIResponsesImpl | None = None
@@ -130,6 +133,7 @@
            vector_io_api=self.vector_io_api,
            tool_runtime_api=self.tool_runtime_api,
            tool_groups_api=self.tool_groups_api,
            telemetry_api=self.telemetry_api,
            persistence_store=(
                self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store
            ),
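
A sketch of the disabled-telemetry unit coverage the commit message mentions
(the module path and test shape are assumptions, not the commit's actual
tests):

    from unittest.mock import MagicMock, patch

    # Assumed import path for ChatAgent; adjust to the real module layout.
    from llama_stack.providers.inline.agents.meta_reference.agent_instance import ChatAgent

    def test_emit_metric_noop_when_telemetry_disabled():
        agent = MagicMock()
        agent.telemetry_api = None  # telemetry disabled
        with patch("asyncio.create_task") as create_task:
            # The guard should return before reading the current span or
            # scheduling an emission task.
            ChatAgent._emit_metric(agent, "llama_stack_agent_steps_total", 1, "1")
        create_task.assert_not_called()
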