fix: Gracefully handle errors when listing MCP tools

When listing (and lazily indexing) tools, it's possible for an error
to get thrown by individual toolgroups if for example an MCP toolgroup
is unable to connect to its `mcp_endpoint`.

This logs a warning in the server when that happens, logs a full stack
trace of the error if debug logging is enabled, and just returns the
list of tools from all working toolgroups instead of throwing an error
to the client when a single toolgroup is temporarily or permanently
misbehaving.

Fixes #2540

A new unit test was added to test this exception handling, which is
run as part of our regular test suite but also manually run to
specifically verify this fix via:

```
uv run pytest -sv --asyncio-mode=auto \
tests/unit/distribution/routers/test_routing_tables.py
```

To verify the additional debug logging is printing properly:

```
LLAMA_STACK_LOGGING=core=debug \
uv run pytest -sv --asyncio-mode=auto \
tests/unit/distribution/routers/test_routing_tables.py
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
Ben Browning 2025-06-27 11:43:39 -04:00
parent b18f4d1ccf
commit 7ca51640eb
2 changed files with 32 additions and 3 deletions

View file

@ -52,9 +52,14 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
all_tools = []
for toolgroup in toolgroups:
if toolgroup.identifier not in self.toolgroups_to_tools:
await self._index_tools(toolgroup)
all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier])
try:
if toolgroup.identifier not in self.toolgroups_to_tools:
await self._index_tools(toolgroup)
all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier])
except Exception as e:
logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}")
logger.debug(e, exc_info=True)
continue
return ListToolsResponse(data=all_tools)

View file

@ -10,6 +10,7 @@ from unittest.mock import AsyncMock
import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.datasets.datasets import Dataset, DatasetPurpose, URIDataSource
from llama_stack.apis.datatypes import Api
@ -296,3 +297,26 @@ async def test_tool_groups_routing_table(cached_disk_dist_registry):
await table.unregister_toolgroup(toolgroup_id="test-toolgroup")
tool_groups = await table.list_tool_groups()
assert len(tool_groups.data) == 0
@pytest.mark.asyncio
async def test_tool_groups_routing_table_exception_handling(cached_disk_dist_registry):
"""Test that the tool group routing table handles exceptions when listing tools, like if an MCP server is unreachable."""
exception_throwing_tool_groups_impl = ToolGroupsImpl()
exception_throwing_tool_groups_impl.list_runtime_tools = AsyncMock(side_effect=Exception("Test exception"))
table = ToolGroupsRoutingTable(
{"test_provider": exception_throwing_tool_groups_impl}, cached_disk_dist_registry, {}
)
await table.initialize()
await table.register_tool_group(
toolgroup_id="test-toolgroup-exceptions",
provider_id="test_provider",
mcp_endpoint=URL(uri="http://localhost:8479/foo/bar"),
)
tools = await table.list_tools(toolgroup_id="test-toolgroup-exceptions")
assert len(tools.data) == 0