feat(health_check.py): set upper bound for API when making health check call (#7865)

* feat(health_check.py): set upper bound for API when making health check call

prevent a bad model's health check from hanging and causing pod restarts (an illustrative sketch of the idea follows this list)

* fix(health_check.py): cleanup task once completed

* fix(constants.py): bump default health check timeout to 1min

* docs(health.md): add 'health_check_timeout' to health docs on litellm

* build(proxy_server_config.yaml): add bad model to health check
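
For illustration only (not part of this commit), a minimal sketch of the idea: bound each health-check coroutine with `asyncio.wait_for` so a hanging model cannot stall the whole health check. The names here (`slow_health_check`, `bounded_health_check`, `per_model_timeout`) are hypothetical, not LiteLLM APIs.

```python
import asyncio
from typing import Optional

DEFAULT_HEALTH_CHECK_TIMEOUT_SECONDS = 60  # mirrors the default added in constants.py


async def slow_health_check() -> dict:
    # Hypothetical stand-in for a misbehaving model endpoint that never answers in time.
    await asyncio.sleep(600)
    return {"status": "healthy"}


async def bounded_health_check(per_model_timeout: Optional[float] = None) -> dict:
    # Bound the health check so one hanging model cannot block the whole call.
    timeout = per_model_timeout or DEFAULT_HEALTH_CHECK_TIMEOUT_SECONDS
    try:
        return await asyncio.wait_for(slow_health_check(), timeout)
    except asyncio.TimeoutError:
        return {"error": "Timeout exceeded"}


if __name__ == "__main__":
    # With a 1 second bound, this returns {'error': 'Timeout exceeded'} almost immediately.
    print(asyncio.run(bounded_health_check(per_model_timeout=1)))
```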
Krish Dholakia 2025-01-18 19:47:43 -08:00 committed by GitHub
parent e67f18b153
commit 3a7b13efa2
9 changed files with 111 additions and 12 deletions


@@ -245,6 +245,22 @@ general_settings:
health_check_details: False
```
## Health Check Timeout
The health check timeout defaults to 60 seconds and is defined in `litellm/constants.py` (`HEALTH_CHECK_TIMEOUT_SECONDS`).
You can override it per model by setting `health_check_timeout` in the model's `model_info` section of your config.yaml:
```yaml
model_list:
  - model_name: openai/gpt-4o
    litellm_params:
      model: openai/gpt-4o
      api_key: os.environ/OPENAI_API_KEY
    model_info:
      health_check_timeout: 10 # 👈 OVERRIDE HEALTH CHECK TIMEOUT
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
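
For illustration only (not part of this commit's diff), a minimal sketch of probing the readiness endpoint from Python; the base URL and port (4000) are assumptions about your deployment:

```python
import requests

# Assumes a locally running LiteLLM proxy; adjust the base URL for your deployment.
response = requests.get("http://localhost:4000/health/readiness")

print(response.status_code)  # expected to be 200 once the proxy is ready to accept requests
print(response.json())       # readiness details returned by the proxy
```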


@@ -140,3 +140,5 @@ BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
BATCH_STATUS_POLL_INTERVAL_SECONDS = 3600 # 1 hour
BATCH_STATUS_POLL_MAX_ATTEMPTS = 24 # for 24 hours
HEALTH_CHECK_TIMEOUT_SECONDS = 60 # 60 seconds

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -6,8 +6,8 @@ model_list:
      api_base: https://exampleopenaiendpoint-production.up.railway.app
  - model_name: openai-o1
    litellm_params:
      model: openai/o1
      model: openai/random_sleep
      api_key: sk-1234
      api_base: https://exampleopenaiendpoint-production.up.railway.app
      mock_timeout: true
      timeout: 3
      api_base: http://0.0.0.0:8090
    model_info:
      health_check_timeout: 1


@@ -8,7 +8,7 @@ from typing import List, Optional
import litellm
logger = logging.getLogger(__name__)
from litellm.constants import HEALTH_CHECK_TIMEOUT_SECONDS
ILLEGAL_DISPLAY_PARAMS = [
    "messages",
@@ -62,10 +62,28 @@ def filter_deployments_by_id(
    return filtered_deployments


async def run_with_timeout(task, timeout):
    try:
        return await asyncio.wait_for(task, timeout)
    except asyncio.TimeoutError:
        task.cancel()
        # Only cancel child tasks of the current task
        current_task = asyncio.current_task()
        for t in asyncio.all_tasks():
            if t != current_task:
                t.cancel()
        try:
            await asyncio.wait_for(task, 0.1)  # Give 100ms for cleanup
        except (asyncio.TimeoutError, asyncio.CancelledError, Exception):
            pass
        return {"error": "Timeout exceeded"}
async def _perform_health_check(model_list: list, details: Optional[bool] = True):
    """
    Perform a health check for each model in the list.
    """
    tasks = []
    for model in model_list:
        litellm_params = model["litellm_params"]
@@ -74,16 +92,21 @@ async def _perform_health_check(model_list: list, details: Optional[bool] = True
        litellm_params = _update_litellm_params_for_health_check(
            model_info, litellm_params
        )
        tasks.append(
        timeout = model_info.get("health_check_timeout") or HEALTH_CHECK_TIMEOUT_SECONDS
        task = run_with_timeout(
            litellm.ahealth_check(
                litellm_params,
                model["litellm_params"],
                mode=mode,
                prompt="test from litellm",
                input=["test from litellm"],
            )
            ),
            timeout,
        )
    results = await asyncio.gather(*tasks)
        tasks.append(task)

    results = await asyncio.gather(*tasks, return_exceptions=True)

    healthy_endpoints = []
    unhealthy_endpoints = []


@@ -81,6 +81,16 @@ model_list:
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  - model_name: bad-model
    litellm_params:
      model: openai/bad-model
      api_key: os.environ/OPENAI_API_KEY
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      mock_timeout: True
      timeout: 60
      rpm: 1000
    model_info:
      health_check_timeout: 1
  - model_name: "*"
    litellm_params:
      model: openai/*


@@ -297,3 +297,54 @@ async def test_perform_health_check_with_health_check_model():
    assert healthy_endpoints[0]["model"] == "openai/gpt-4o-mini"
    assert len(healthy_endpoints) == 1
    assert len(unhealthy_endpoints) == 0


@pytest.mark.asyncio
async def test_health_check_bad_model():
    from litellm.proxy.health_check import _perform_health_check
    import time

    model_list = [
        {
            "model_name": "openai-gpt-4o",
            "litellm_params": {
                "api_key": "sk-1234",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app",
                "model": "openai/my-fake-openai-endpoint",
                "mock_timeout": True,
                "timeout": 60,
            },
            "model_info": {
                "id": "ca27ca2eeea2f9e38bb274ead831948a26621a3738d06f1797253f0e6c4278c0",
                "db_model": False,
                "health_check_timeout": 1,
            },
        },
    ]
    details = None
    healthy_endpoints, unhealthy_endpoints = await _perform_health_check(
        model_list, details
    )
    print(f"healthy_endpoints: {healthy_endpoints}")
    print(f"unhealthy_endpoints: {unhealthy_endpoints}")

    # Track which model is actually used in the health check
    health_check_calls = []

    async def mock_health_check(litellm_params, **kwargs):
        health_check_calls.append(litellm_params["model"])
        await asyncio.sleep(10)
        return {"status": "healthy"}

    with patch(
        "litellm.ahealth_check", side_effect=mock_health_check
    ) as mock_health_check:
        start_time = time.time()
        healthy_endpoints, unhealthy_endpoints = await _perform_health_check(model_list)
        end_time = time.time()

        print("health check calls: ", health_check_calls)

        assert len(healthy_endpoints) == 0
        assert len(unhealthy_endpoints) == 1
        assert (
            end_time - start_time < 2
        ), "Health check took longer than health_check_timeout"