Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-06 20:44:58 +00:00
feat: add agent workflow metrics collection
Add comprehensive OpenTelemetry-based metrics for agent observability:

- Workflow completion/failure tracking with duration measurements
- Step execution counters for performance monitoring
- Tool usage tracking with normalized tool names
- Non-blocking telemetry emission with named async tasks
- Comprehensive unit and integration test coverage
- Graceful handling when telemetry is disabled
Parent: 4c2fcb6b51
Commit: 69b692af91

13 changed files with 701 additions and 11 deletions
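The headline behaviors are non-blocking emission and graceful degradation when telemetry is off. As a rough illustration of what the new test coverage plausibly exercises (the stub names below are hypothetical, not taken from this diff), an agent wired with `telemetry_api=None` should skip emission entirely, while a stub telemetry API should capture one event per metric:

```python
# Hypothetical sketch of the disabled-telemetry behavior this commit describes;
# StubTelemetry and emit() are illustrative stand-ins, not names from the diff.
import asyncio


class StubTelemetry:
    def __init__(self) -> None:
        self.events: list[dict] = []

    async def log_event(self, event: dict) -> None:
        self.events.append(event)


def emit(telemetry: StubTelemetry | None, metric: str, value: float) -> None:
    if not telemetry:  # graceful no-op when telemetry is disabled
        return
    asyncio.create_task(
        telemetry.log_event({"metric": metric, "value": value}),
        name=f"metric-{metric}",  # named task, easier to spot while debugging
    )


async def main() -> None:
    emit(None, "llama_stack_agent_steps_total", 1)  # telemetry disabled: no-op
    stub = StubTelemetry()
    emit(stub, "llama_stack_agent_steps_total", 1)
    await asyncio.sleep(0)  # yield once so the fire-and-forget task runs
    assert [e["metric"] for e in stub.events] == ["llama_stack_agent_steps_total"]


asyncio.run(main())
```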
```diff
@@ -4,17 +4,20 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

 import asyncio
 import copy
 import json
 import re
 import secrets
 import string
 import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from datetime import UTC, datetime

 import httpx
 from opentelemetry.trace import get_current_span

 from llama_stack.apis.agents import (
     AgentConfig,
```
```diff
@@ -60,6 +63,7 @@ from llama_stack.apis.inference import (
     UserMessage,
 )
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.telemetry import MetricEvent, Telemetry
 from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.datatypes import AccessRule
```
```diff
@@ -97,6 +101,7 @@ class ChatAgent(ShieldRunnerMixin):
         tool_runtime_api: ToolRuntime,
         tool_groups_api: ToolGroups,
         vector_io_api: VectorIO,
         telemetry_api: Telemetry | None,
         persistence_store: KVStore,
         created_at: str,
         policy: list[AccessRule],
```
```diff
@@ -106,6 +111,7 @@ class ChatAgent(ShieldRunnerMixin):
         self.inference_api = inference_api
         self.safety_api = safety_api
         self.vector_io_api = vector_io_api
         self.telemetry_api = telemetry_api
         self.storage = AgentPersistence(agent_id, persistence_store, policy)
         self.tool_runtime_api = tool_runtime_api
         self.tool_groups_api = tool_groups_api
```
```diff
@@ -167,6 +173,43 @@ class ChatAgent(ShieldRunnerMixin):
     async def create_session(self, name: str) -> str:
         return await self.storage.create_session(name)

     def _emit_metric(
         self, metric_name: str, value: int | float, unit: str, attributes: dict[str, str] | None = None
     ) -> None:
         """Emit a single metric event"""
         if not self.telemetry_api:
             return

         span = get_current_span()
         if not span:
             return

         context = span.get_span_context()
         metric = MetricEvent(
             trace_id=format(context.trace_id, "x"),
             span_id=format(context.span_id, "x"),
             metric=metric_name,
             value=value,
             timestamp=time.time(),
             unit=unit,
             attributes={"agent_id": self.agent_id, **(attributes or {})},
         )

         # Create task with name for better debugging and potential cleanup
         task_name = f"metric-{metric_name}-{self.agent_id}"
         asyncio.create_task(self.telemetry_api.log_event(metric), name=task_name)

     def _track_step(self):
         self._emit_metric("llama_stack_agent_steps_total", 1, "1")

     def _track_workflow(self, status: str, duration: float):
         self._emit_metric("llama_stack_agent_workflows_total", 1, "1", {"status": status})
         self._emit_metric("llama_stack_agent_workflow_duration_seconds", duration, "s")

     def _track_tool(self, tool_name: str):
         normalized_name = "rag" if tool_name == "knowledge_search" else tool_name
         self._emit_metric("llama_stack_agent_tool_calls_total", 1, "1", {"tool": normalized_name})

     async def get_messages_from_turns(self, turns: list[Turn]) -> list[Message]:
         messages = []
         if self.agent_config.instructions != "":
```
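`_emit_metric` stamps each event with the current trace and span IDs, then fires the telemetry call as a named `asyncio` task, so emission never blocks the agent loop. One nuance worth noting: in opentelemetry-python, `get_current_span()` conventionally returns a non-recording `INVALID_SPAN` rather than `None` when no span is active, so `span.get_span_context().is_valid` is the stricter, more common guard. The `metric-<name>-<agent_id>` task names make in-flight emissions easy to spot; a minimal sketch (the `log_event` stub and `agent_id` value are illustrative):

```python
# Minimal sketch of inspecting named fire-and-forget metric tasks.
import asyncio


async def log_event(metric: dict) -> None:
    await asyncio.sleep(0.01)  # stand-in for a real telemetry sink


async def main() -> None:
    agent_id = "agent-123"
    for name in ("llama_stack_agent_steps_total", "llama_stack_agent_tool_calls_total"):
        asyncio.create_task(log_event({"metric": name}), name=f"metric-{name}-{agent_id}")
    # Named tasks show up in asyncio introspection, which helps debugging.
    metric_tasks = [t for t in asyncio.all_tasks() if t.get_name().startswith("metric-")]
    print(sorted(t.get_name() for t in metric_tasks))
    await asyncio.gather(*metric_tasks)  # drain before the loop closes


asyncio.run(main())
```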
```diff
@@ -726,6 +769,9 @@ class ChatAgent(ShieldRunnerMixin):
                     )
                 )

                 # Track step execution metric
                 self._track_step()

                 # Add the result message to input_messages for the next iteration
                 input_messages.append(result_message)
```
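The hunk above adds the per-step counter at the end of each execution step. The call sites for `_track_workflow` are not visible in these hunks; a plausible shape (hypothetical, for illustration only, with status strings assumed) wraps the turn loop and reports status plus wall-clock duration:

```python
# Hypothetical sketch of a _track_workflow call site; run_turn_loop is a stand-in.
import time


async def run_with_workflow_metrics(agent, run_turn_loop) -> None:
    start = time.time()
    try:
        await run_turn_loop()
    except Exception:
        agent._track_workflow("failure", time.time() - start)  # assumed status label
        raise
    else:
        agent._track_workflow("completed", time.time() - start)  # assumed status label
```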
```diff
@@ -900,6 +946,7 @@ class ChatAgent(ShieldRunnerMixin):
             },
         )
         logger.debug(f"tool call {tool_name_str} completed with result: {result}")

         return result
```
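For tool metrics, `_track_tool` normalizes `knowledge_search` to `rag` so dashboards aggregate RAG traffic under one label. The rule is easy to verify in isolation; this is a standalone restatement of the one-liner from the diff above:

```python
# Standalone restatement of the normalization rule in _track_tool.
def normalize_tool_name(tool_name: str) -> str:
    return "rag" if tool_name == "knowledge_search" else tool_name


assert normalize_tool_name("knowledge_search") == "rag"
assert normalize_tool_name("web_search") == "web_search"  # everything else passes through
```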
```diff
@@ -38,6 +38,7 @@ from llama_stack.apis.inference import (
     UserMessage,
 )
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.telemetry import Telemetry
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.datatypes import AccessRule
```
```diff
@@ -64,6 +65,7 @@ class MetaReferenceAgentsImpl(Agents):
         tool_runtime_api: ToolRuntime,
         tool_groups_api: ToolGroups,
         policy: list[AccessRule],
         telemetry_api: Telemetry | None = None,
     ):
         self.config = config
         self.inference_api = inference_api
```
```diff
@@ -71,6 +73,7 @@ class MetaReferenceAgentsImpl(Agents):
         self.safety_api = safety_api
         self.tool_runtime_api = tool_runtime_api
         self.tool_groups_api = tool_groups_api
         self.telemetry_api = telemetry_api

         self.in_memory_store = InmemoryKVStoreImpl()
         self.openai_responses_impl: OpenAIResponsesImpl | None = None
```
```diff
@@ -130,6 +133,7 @@ class MetaReferenceAgentsImpl(Agents):
             vector_io_api=self.vector_io_api,
             tool_runtime_api=self.tool_runtime_api,
             tool_groups_api=self.tool_groups_api,
             telemetry_api=self.telemetry_api,
             persistence_store=(
                 self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store
             ),
```
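In the wiring above, `telemetry_api` defaults to `None` on `MetaReferenceAgentsImpl` and is threaded through to each `ChatAgent`, so existing call sites keep working unchanged. A reduced sketch of that optional-dependency pattern (class names below are illustrative, not the real constructors):

```python
# Reduced, hypothetical sketch of the optional telemetry wiring pattern.
class Telemetry:  # stand-in for llama_stack.apis.telemetry.Telemetry
    async def log_event(self, event) -> None: ...


class Agent:
    def __init__(self, telemetry_api: Telemetry | None) -> None:
        self.telemetry_api = telemetry_api  # checked before every metric emission


class AgentsImpl:
    def __init__(self, telemetry_api: Telemetry | None = None) -> None:
        # Defaulting to None keeps pre-existing constructor calls source-compatible.
        self.telemetry_api = telemetry_api

    def make_agent(self) -> Agent:
        # The same optional handle is passed straight through to each agent.
        return Agent(telemetry_api=self.telemetry_api)
```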