fix: show descriptive MCP server connection errors instead of generic 500s (#3256)
Some checks failed
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Test External API and Providers / test-external (venv) (push) Failing after 4s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
UI Tests / ui-tests (22) (push) Successful in 1m20s
Pre-commit / pre-commit (push) Successful in 2m37s

What does this PR do?

Fixes error handling when MCP server connections fail. Instead of
returning generic 500 errors, now provides
   descriptive error messages with proper HTTP status codes.

  Closes #3107

  Test Plan

  Before fix:
curl -X GET
"http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server"
Returns: {"detail": "Internal server error: An unexpected error
occurred."} (500)

  After fix:
curl -X GET
"http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server"
Returns: {"error": {"detail": "Failed to connect to MCP server at
http://localhost:9999/sse: Connection
  refused"}} (502)

  Tests:
  - Added unit test for ConnectionError → 502 translation
  - Manually tested with unreachable MCP servers (connection refused)
This commit is contained in:
Sumanth Kamenani 2025-09-04 16:25:02 -04:00 committed by GitHub
parent 561d2fc6b8
commit 55a8c5f439
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 43 additions and 0 deletions

View file

@ -141,6 +141,8 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc)) return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
elif isinstance(exc, PermissionError | AccessDeniedError): elif isinstance(exc, PermissionError | AccessDeniedError):
return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}") return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
elif isinstance(exc, ConnectionError | httpx.ConnectError):
return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
elif isinstance(exc, asyncio.TimeoutError | TimeoutError): elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}") return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
elif isinstance(exc, NotImplementedError): elif isinstance(exc, NotImplementedError):

View file

@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
raise AuthenticationRequiredError(exc) from exc raise AuthenticationRequiredError(exc) from exc
if i == len(connection_strategies) - 1: if i == len(connection_strategies) - 1:
raise raise
except* httpx.ConnectError as eg:
# Connection refused, server down, network unreachable
if i == len(connection_strategies) - 1:
error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
logger.error(f"MCP connection error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.TimeoutException as eg:
# Request timeout, server too slow
if i == len(connection_strategies) - 1:
error_msg = f"MCP server at {endpoint} timed out"
logger.error(f"MCP timeout error: {error_msg}")
raise TimeoutError(error_msg) from eg
else:
logger.warning(
f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.RequestError as eg:
# DNS resolution failures, network errors, invalid URLs
if i == len(connection_strategies) - 1:
# Get the first exception's message for the error string
exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
logger.error(f"MCP network error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* McpError: except* McpError:
if i < len(connection_strategies) - 1: if i < len(connection_strategies) - 1:
logger.warning( logger.warning(

View file

@ -113,6 +113,15 @@ class TestTranslateException:
assert result.status_code == 504 assert result.status_code == 504
assert result.detail == "Operation timed out: " assert result.detail == "Operation timed out: "
def test_translate_connection_error(self):
"""Test that ConnectionError is translated to 502 HTTP status."""
exc = ConnectionError("Failed to connect to MCP server at http://localhost:9999/sse: Connection refused")
result = translate_exception(exc)
assert isinstance(result, HTTPException)
assert result.status_code == 502
assert result.detail == "Failed to connect to MCP server at http://localhost:9999/sse: Connection refused"
def test_translate_not_implemented_error(self): def test_translate_not_implemented_error(self):
"""Test that NotImplementedError is translated to 501 HTTP status.""" """Test that NotImplementedError is translated to 501 HTTP status."""
exc = NotImplementedError("Not implemented") exc = NotImplementedError("Not implemented")