From d8177a2e4e28ee64acc81e16ca3458a11210b0c3 Mon Sep 17 00:00:00 2001 From: skamenan7 Date: Mon, 11 Aug 2025 18:58:50 -0400 Subject: [PATCH] improve metric task error handling --- .../agents/meta_reference/agent_instance.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 6ebbbcbc9..b597473bc 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -195,9 +195,22 @@ class ChatAgent(ShieldRunnerMixin): attributes={"agent_id": self.agent_id, **(attributes or {})}, ) - # Create task with name for better debugging and potential cleanup + # Create task with name for better debugging and capture any async errors task_name = f"metric-{metric_name}-{self.agent_id}" - asyncio.create_task(self.telemetry_api.log_event(metric), name=task_name) + task = asyncio.create_task(self.telemetry_api.log_event(metric), name=task_name) + + def _on_metric_task_done(t: asyncio.Task) -> None: + try: + exc = t.exception() + except asyncio.CancelledError: + logger.debug("Metric task %s was cancelled", task_name) + return + if exc is not None: + logger.warning("Metric task %s failed: %s", task_name, exc) + + # Only add callback if task creation succeeded (not None from mocking) + if task is not None: + task.add_done_callback(_on_metric_task_done) def _track_step(self): self._emit_metric("llama_stack_agent_steps_total", 1, "1")