mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
fix: show descriptive MCP server connection errors instead of generic 500s (#3256)
Some checks failed
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Test External API and Providers / test-external (venv) (push) Failing after 4s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
UI Tests / ui-tests (22) (push) Successful in 1m20s
Pre-commit / pre-commit (push) Successful in 2m37s
Some checks failed
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Test External API and Providers / test-external (venv) (push) Failing after 4s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
UI Tests / ui-tests (22) (push) Successful in 1m20s
Pre-commit / pre-commit (push) Successful in 2m37s
What does this PR do? Fixes error handling when MCP server connections fail. Instead of returning generic 500 errors, now provides descriptive error messages with proper HTTP status codes. Closes #3107 Test Plan Before fix: curl -X GET "http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server" Returns: {"detail": "Internal server error: An unexpected error occurred."} (500) After fix: curl -X GET "http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server" Returns: {"error": {"detail": "Failed to connect to MCP server at http://localhost:9999/sse: Connection refused"}} (502) Tests: - Added unit test for ConnectionError → 502 translation - Manually tested with unreachable MCP servers (connection refused)
This commit is contained in:
parent
561d2fc6b8
commit
55a8c5f439
3 changed files with 43 additions and 0 deletions
|
@ -141,6 +141,8 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
|
|||
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
|
||||
elif isinstance(exc, PermissionError | AccessDeniedError):
|
||||
return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
|
||||
elif isinstance(exc, ConnectionError | httpx.ConnectError):
|
||||
return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
|
||||
elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
|
||||
return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
|
||||
elif isinstance(exc, NotImplementedError):
|
||||
|
|
|
@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
|
|||
raise AuthenticationRequiredError(exc) from exc
|
||||
if i == len(connection_strategies) - 1:
|
||||
raise
|
||||
except* httpx.ConnectError as eg:
|
||||
# Connection refused, server down, network unreachable
|
||||
if i == len(connection_strategies) - 1:
|
||||
error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
|
||||
logger.error(f"MCP connection error: {error_msg}")
|
||||
raise ConnectionError(error_msg) from eg
|
||||
else:
|
||||
logger.warning(
|
||||
f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
|
||||
)
|
||||
except* httpx.TimeoutException as eg:
|
||||
# Request timeout, server too slow
|
||||
if i == len(connection_strategies) - 1:
|
||||
error_msg = f"MCP server at {endpoint} timed out"
|
||||
logger.error(f"MCP timeout error: {error_msg}")
|
||||
raise TimeoutError(error_msg) from eg
|
||||
else:
|
||||
logger.warning(
|
||||
f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
|
||||
)
|
||||
except* httpx.RequestError as eg:
|
||||
# DNS resolution failures, network errors, invalid URLs
|
||||
if i == len(connection_strategies) - 1:
|
||||
# Get the first exception's message for the error string
|
||||
exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
|
||||
error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
|
||||
logger.error(f"MCP network error: {error_msg}")
|
||||
raise ConnectionError(error_msg) from eg
|
||||
else:
|
||||
logger.warning(
|
||||
f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
|
||||
)
|
||||
except* McpError:
|
||||
if i < len(connection_strategies) - 1:
|
||||
logger.warning(
|
||||
|
|
|
@ -113,6 +113,15 @@ class TestTranslateException:
|
|||
assert result.status_code == 504
|
||||
assert result.detail == "Operation timed out: "
|
||||
|
||||
def test_translate_connection_error(self):
|
||||
"""Test that ConnectionError is translated to 502 HTTP status."""
|
||||
exc = ConnectionError("Failed to connect to MCP server at http://localhost:9999/sse: Connection refused")
|
||||
result = translate_exception(exc)
|
||||
|
||||
assert isinstance(result, HTTPException)
|
||||
assert result.status_code == 502
|
||||
assert result.detail == "Failed to connect to MCP server at http://localhost:9999/sse: Connection refused"
|
||||
|
||||
def test_translate_not_implemented_error(self):
|
||||
"""Test that NotImplementedError is translated to 501 HTTP status."""
|
||||
exc = NotImplementedError("Not implemented")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue