From 7ca51640eb026bbc95fe8ed69e6a6f1cd29f3e2c Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 27 Jun 2025 11:43:39 -0400 Subject: [PATCH 1/3] fix: Gracefully handle errors when listing MCP tools When listing (and lazily indexing) tools, it's possible for an error to get thrown by individual toolgroups if for example an MCP toolgroup is unable to connect to its `mcp_endpoint`. This logs a warning in the server when that happens, logs a full stack trace of the error if debug logging is enabled, and just returns the list of tools from all working toolgroups instead of throwing an error to the client when a single toolgroup is temporarily or permanently misbehaving. Fixes #2540 A new unit test was added to test this exception handling, which is run as part of our regular test suite but also manually run to specifically verify this fix via: ``` uv run pytest -sv --asyncio-mode=auto \ tests/unit/distribution/routers/test_routing_tables.py ``` To verify the additional debug logging is printing properly: ``` LLAMA_STACK_LOGGING=core=debug \ uv run pytest -sv --asyncio-mode=auto \ tests/unit/distribution/routers/test_routing_tables.py ``` Signed-off-by: Ben Browning --- .../distribution/routing_tables/toolgroups.py | 11 ++++++--- .../routers/test_routing_tables.py | 24 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/llama_stack/distribution/routing_tables/toolgroups.py b/llama_stack/distribution/routing_tables/toolgroups.py index b86f057bd..ac5e34783 100644 --- a/llama_stack/distribution/routing_tables/toolgroups.py +++ b/llama_stack/distribution/routing_tables/toolgroups.py @@ -52,9 +52,14 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): all_tools = [] for toolgroup in toolgroups: - if toolgroup.identifier not in self.toolgroups_to_tools: - await self._index_tools(toolgroup) - all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier]) + try: + if toolgroup.identifier not in self.toolgroups_to_tools: + await self._index_tools(toolgroup) + all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier]) + except Exception as e: + logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}") + logger.debug(e, exc_info=True) + continue return ListToolsResponse(data=all_tools) diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py index 0eeb68167..ec158de63 100644 --- a/tests/unit/distribution/routers/test_routing_tables.py +++ b/tests/unit/distribution/routers/test_routing_tables.py @@ -10,6 +10,7 @@ from unittest.mock import AsyncMock import pytest +from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import NumberType from llama_stack.apis.datasets.datasets import Dataset, DatasetPurpose, URIDataSource from llama_stack.apis.datatypes import Api @@ -296,3 +297,26 @@ async def test_tool_groups_routing_table(cached_disk_dist_registry): await table.unregister_toolgroup(toolgroup_id="test-toolgroup") tool_groups = await table.list_tool_groups() assert len(tool_groups.data) == 0 + + +@pytest.mark.asyncio +async def test_tool_groups_routing_table_exception_handling(cached_disk_dist_registry): + """Test that the tool group routing table handles exceptions when listing tools, like if an MCP server is unreachable.""" + + exception_throwing_tool_groups_impl = ToolGroupsImpl() + exception_throwing_tool_groups_impl.list_runtime_tools = AsyncMock(side_effect=Exception("Test exception")) + + table = ToolGroupsRoutingTable( + {"test_provider": exception_throwing_tool_groups_impl}, cached_disk_dist_registry, {} + ) + await table.initialize() + + await table.register_tool_group( + toolgroup_id="test-toolgroup-exceptions", + provider_id="test_provider", + mcp_endpoint=URL(uri="http://localhost:8479/foo/bar"), + ) + + tools = await table.list_tools(toolgroup_id="test-toolgroup-exceptions") + + assert len(tools.data) == 0 From 4867ed21b58f1028bb41b770484a092e536272d0 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 27 Jun 2025 12:45:59 -0400 Subject: [PATCH 2/3] Ensure clients see MCP auth errors Signed-off-by: Ben Browning --- llama_stack/distribution/routing_tables/toolgroups.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/routing_tables/toolgroups.py b/llama_stack/distribution/routing_tables/toolgroups.py index ac5e34783..5934ced21 100644 --- a/llama_stack/distribution/routing_tables/toolgroups.py +++ b/llama_stack/distribution/routing_tables/toolgroups.py @@ -8,7 +8,7 @@ from typing import Any from llama_stack.apis.common.content_types import URL from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups -from llama_stack.distribution.datatypes import ToolGroupWithOwner +from llama_stack.distribution.datatypes import AuthenticationRequiredError, ToolGroupWithOwner from llama_stack.log import get_logger from .common import CommonRoutingTableImpl @@ -56,7 +56,13 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): if toolgroup.identifier not in self.toolgroups_to_tools: await self._index_tools(toolgroup) all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier]) + except AuthenticationRequiredError: + # Send authentication errors back to the client so it knows + # that it needs to supply credentials for remote MCP servers. + raise except Exception as e: + # Other errors that the client cannot fix are logged and + # those specific toolgroups are skipped. logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}") logger.debug(e, exc_info=True) continue From c5f65b08bba6cc58575ca9daf7aaf36f692bd733 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 10 Jul 2025 11:34:33 -0400 Subject: [PATCH 3/3] Narrow the scope of toolgroup exception catching Signed-off-by: Ben Browning --- .../distribution/routing_tables/toolgroups.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama_stack/distribution/routing_tables/toolgroups.py b/llama_stack/distribution/routing_tables/toolgroups.py index 5934ced21..6a2e5a860 100644 --- a/llama_stack/distribution/routing_tables/toolgroups.py +++ b/llama_stack/distribution/routing_tables/toolgroups.py @@ -52,20 +52,20 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): all_tools = [] for toolgroup in toolgroups: - try: - if toolgroup.identifier not in self.toolgroups_to_tools: + if toolgroup.identifier not in self.toolgroups_to_tools: + try: await self._index_tools(toolgroup) - all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier]) - except AuthenticationRequiredError: - # Send authentication errors back to the client so it knows - # that it needs to supply credentials for remote MCP servers. - raise - except Exception as e: - # Other errors that the client cannot fix are logged and - # those specific toolgroups are skipped. - logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}") - logger.debug(e, exc_info=True) - continue + except AuthenticationRequiredError: + # Send authentication errors back to the client so it knows + # that it needs to supply credentials for remote MCP servers. + raise + except Exception as e: + # Other errors that the client cannot fix are logged and + # those specific toolgroups are skipped. + logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}") + logger.debug(e, exc_info=True) + continue + all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier]) return ListToolsResponse(data=all_tools)