Mirror of https://github.com/BerriAI/litellm.git · synced 2025-04-27 11:43:54 +00:00
feat(health_check.py): set upperbound for api when making health check call (#7865)
All checks were successful
Read Version from pyproject.toml / read-version (push) Successful in 10s
* feat(health_check.py): set upper bound for API call duration when making a health check call, preventing a bad model's health check from hanging and causing pod restarts
* fix(health_check.py): clean up task once completed
* fix(constants.py): bump default health check timeout to 1 min
* docs(health.md): add 'health_check_timeout' to health docs on litellm
* build(proxy_server_config.yaml): add bad model to health check
parent e67f18b153 · commit 3a7b13efa2

9 changed files with 111 additions and 12 deletions
health.md:

@@ -245,6 +245,22 @@ general_settings:
   health_check_details: False
 ```
 
+## Health Check Timeout
+
+The health check timeout is set in `litellm/constants.py` and defaults to 60 seconds.
+
+This can be overridden in the config.yaml by setting `health_check_timeout` in the model_info section.
+
+```yaml
+model_list:
+  - model_name: openai/gpt-4o
+    litellm_params:
+      model: openai/gpt-4o
+      api_key: os.environ/OPENAI_API_KEY
+    model_info:
+      health_check_timeout: 10 # 👈 OVERRIDE HEALTH CHECK TIMEOUT
+```
+
 ## `/health/readiness`
 
 Unprotected endpoint for checking if proxy is ready to accept requests
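For an end-to-end check of the behavior these docs describe, here is a minimal sketch of querying the proxy's `/health` endpoint from Python. The local URL, the `sk-1234` master key, and the exact response keys are assumptions for a default local setup, not guaranteed by this commit:

```python
import requests

# Assumption: a litellm proxy running locally on port 4000 with master key sk-1234.
resp = requests.get(
    "http://0.0.0.0:4000/health",
    headers={"Authorization": "Bearer sk-1234"},
)
report = resp.json()

# A model whose check exceeds its health_check_timeout should appear among the
# unhealthy endpoints with a timeout error, instead of hanging the endpoint.
print(report.get("healthy_endpoints"))
print(report.get("unhealthy_endpoints"))
```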
litellm/constants.py:

@@ -140,3 +140,5 @@ BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
 
 BATCH_STATUS_POLL_INTERVAL_SECONDS = 3600 # 1 hour
 BATCH_STATUS_POLL_MAX_ATTEMPTS = 24 # for 24 hours
+
+HEALTH_CHECK_TIMEOUT_SECONDS = 60 # 60 seconds
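The global constant is only a fallback: the per-model `health_check_timeout` set under `model_info` wins, via the `model_info.get(...) or HEALTH_CHECK_TIMEOUT_SECONDS` expression added in health_check.py below. A minimal sketch of that precedence rule (the `resolve_health_check_timeout` helper is hypothetical, for illustration only):

```python
from litellm.constants import HEALTH_CHECK_TIMEOUT_SECONDS  # 60 by default


def resolve_health_check_timeout(model_info: dict):
    # Hypothetical helper: the per-model override wins, else the global default.
    return model_info.get("health_check_timeout") or HEALTH_CHECK_TIMEOUT_SECONDS


print(resolve_health_check_timeout({"health_check_timeout": 1}))  # -> 1
print(resolve_health_check_timeout({}))                           # -> 60
```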
File diff suppressed because one or more lines are too long
@@ -6,8 +6,8 @@ model_list:
       api_base: https://exampleopenaiendpoint-production.up.railway.app
   - model_name: openai-o1
     litellm_params:
-      model: openai/o1
+      model: openai/random_sleep
       api_key: sk-1234
-      api_base: https://exampleopenaiendpoint-production.up.railway.app
-      mock_timeout: true
-      timeout: 3
+      api_base: http://0.0.0.0:8090
+    model_info:
+      health_check_timeout: 1
litellm/proxy/health_check.py:

@@ -8,7 +8,7 @@ from typing import List, Optional
 import litellm
 
 logger = logging.getLogger(__name__)
-
+from litellm.constants import HEALTH_CHECK_TIMEOUT_SECONDS
 
 ILLEGAL_DISPLAY_PARAMS = [
     "messages",
@@ -62,10 +62,28 @@ def filter_deployments_by_id(
     return filtered_deployments
 
 
+async def run_with_timeout(task, timeout):
+    try:
+        return await asyncio.wait_for(task, timeout)
+    except asyncio.TimeoutError:
+        task.cancel()
+        # Only cancel child tasks of the current task
+        current_task = asyncio.current_task()
+        for t in asyncio.all_tasks():
+            if t != current_task:
+                t.cancel()
+        try:
+            await asyncio.wait_for(task, 0.1)  # Give 100ms for cleanup
+        except (asyncio.TimeoutError, asyncio.CancelledError, Exception):
+            pass
+        return {"error": "Timeout exceeded"}
+
+
 async def _perform_health_check(model_list: list, details: Optional[bool] = True):
     """
     Perform a health check for each model in the list.
     """
 
     tasks = []
     for model in model_list:
         litellm_params = model["litellm_params"]
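Before the next hunk wires it in, here is a self-contained sketch of the pattern `run_with_timeout` is built on: bound an awaitable with `asyncio.wait_for` and turn a timeout into an error result rather than a hang. The `slow_health_check` coroutine is an illustrative stand-in for `litellm.ahealth_check`, and this simplified wrapper omits the commit's extra task-cleanup logic:

```python
import asyncio


async def slow_health_check():
    # Stand-in for a health check against a bad model that never answers.
    await asyncio.sleep(10)
    return {"status": "healthy"}


async def run_with_timeout(task, timeout):
    # Simplified shape of the helper above: bound the await and convert
    # a timeout into an error dict instead of letting the caller hang.
    try:
        return await asyncio.wait_for(task, timeout)
    except asyncio.TimeoutError:
        return {"error": "Timeout exceeded"}


async def main():
    result = await run_with_timeout(slow_health_check(), 1)
    print(result)  # {'error': 'Timeout exceeded'} after ~1s, not 10s


asyncio.run(main())
```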
@@ -74,16 +92,21 @@ async def _perform_health_check(model_list: list, details: Optional[bool] = True
         litellm_params = _update_litellm_params_for_health_check(
             model_info, litellm_params
         )
-        tasks.append(
+        timeout = model_info.get("health_check_timeout") or HEALTH_CHECK_TIMEOUT_SECONDS
+
+        task = run_with_timeout(
             litellm.ahealth_check(
-                litellm_params,
+                model["litellm_params"],
                 mode=mode,
                 prompt="test from litellm",
                 input=["test from litellm"],
-            )
+            ),
+            timeout,
         )
+        tasks.append(task)
 
-    results = await asyncio.gather(*tasks)
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
     healthy_endpoints = []
     unhealthy_endpoints = []
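The switch from `asyncio.gather(*tasks)` to `asyncio.gather(*tasks, return_exceptions=True)` is what keeps one failing deployment from aborting the whole report: exceptions are returned in the results list instead of being raised. A minimal, self-contained sketch of that behavior (the `ok`/`boom` coroutines are illustrative stand-ins):

```python
import asyncio


async def ok():
    return {"status": "healthy"}


async def boom():
    raise RuntimeError("connection refused")


async def main():
    # Without return_exceptions=True, the RuntimeError would propagate and the
    # healthy result would be lost; with it, the exception is returned in place.
    results = await asyncio.gather(ok(), boom(), return_exceptions=True)
    print(results)  # [{'status': 'healthy'}, RuntimeError('connection refused')]


asyncio.run(main())
```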
proxy_server_config.yaml:

@@ -81,6 +81,16 @@ model_list:
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
       stream_timeout: 0.001
       rpm: 1000
+  - model_name: bad-model
+    litellm_params:
+      model: openai/bad-model
+      api_key: os.environ/OPENAI_API_KEY
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      mock_timeout: True
+      timeout: 60
+      rpm: 1000
+    model_info:
+      health_check_timeout: 1
   - model_name: "*"
     litellm_params:
       model: openai/*
@@ -297,3 +297,54 @@ async def test_perform_health_check_with_health_check_model():
     assert healthy_endpoints[0]["model"] == "openai/gpt-4o-mini"
     assert len(healthy_endpoints) == 1
     assert len(unhealthy_endpoints) == 0
+
+
+@pytest.mark.asyncio
+async def test_health_check_bad_model():
+    from litellm.proxy.health_check import _perform_health_check
+    import time
+
+    model_list = [
+        {
+            "model_name": "openai-gpt-4o",
+            "litellm_params": {
+                "api_key": "sk-1234",
+                "api_base": "https://exampleopenaiendpoint-production.up.railway.app",
+                "model": "openai/my-fake-openai-endpoint",
+                "mock_timeout": True,
+                "timeout": 60,
+            },
+            "model_info": {
+                "id": "ca27ca2eeea2f9e38bb274ead831948a26621a3738d06f1797253f0e6c4278c0",
+                "db_model": False,
+                "health_check_timeout": 1,
+            },
+        },
+    ]
+    details = None
+    healthy_endpoints, unhealthy_endpoints = await _perform_health_check(
+        model_list, details
+    )
+    print(f"healthy_endpoints: {healthy_endpoints}")
+    print(f"unhealthy_endpoints: {unhealthy_endpoints}")
+
+    # Track which model is actually used in the health check
+    health_check_calls = []
+
+    async def mock_health_check(litellm_params, **kwargs):
+        health_check_calls.append(litellm_params["model"])
+        await asyncio.sleep(10)
+        return {"status": "healthy"}
+
+    with patch(
+        "litellm.ahealth_check", side_effect=mock_health_check
+    ) as mock_health_check:
+        start_time = time.time()
+        healthy_endpoints, unhealthy_endpoints = await _perform_health_check(model_list)
+        end_time = time.time()
+        print("health check calls: ", health_check_calls)
+        assert len(healthy_endpoints) == 0
+        assert len(unhealthy_endpoints) == 1
+        assert (
+            end_time - start_time < 2
+        ), "Health check took longer than health_check_timeout"
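To run just this test, something like `pytest -k test_health_check_bad_model` will select it. The `end_time - start_time < 2` assertion leaves headroom above the 1-second `health_check_timeout` while still proving the call returned long before the mock's 10-second sleep, i.e. that the timeout, not the model, ended the check.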