feat(health_check.py): set upper bound for API when making health check call (#7865)

* feat(health_check.py): set upper bound for API when making health check call

prevent a bad model's health check from hanging and causing pod restarts (an illustrative sketch of the idea follows this list)

* fix(health_check.py): cleanup task once completed

* fix(constants.py): bump default health check timeout to 1min

* docs(health.md): add 'health_check_timeout' to health docs on litellm

* build(proxy_server_config.yaml): add bad model to health check
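
For illustration only (not part of this commit), a minimal sketch of the idea: bound each health-check coroutine with `asyncio.wait_for` so a hanging model cannot stall the whole health check. The names here (`slow_health_check`, `bounded_health_check`, `per_model_timeout`) are hypothetical, not LiteLLM APIs.

```python
import asyncio
from typing import Optional

DEFAULT_HEALTH_CHECK_TIMEOUT_SECONDS = 60  # mirrors the default added in constants.py


async def slow_health_check() -> dict:
    # Hypothetical stand-in for a misbehaving model endpoint that never answers in time.
    await asyncio.sleep(600)
    return {"status": "healthy"}


async def bounded_health_check(per_model_timeout: Optional[float] = None) -> dict:
    # Bound the health check so one hanging model cannot block the whole call.
    timeout = per_model_timeout or DEFAULT_HEALTH_CHECK_TIMEOUT_SECONDS
    try:
        return await asyncio.wait_for(slow_health_check(), timeout)
    except asyncio.TimeoutError:
        return {"error": "Timeout exceeded"}


if __name__ == "__main__":
    # With a 1 second bound, this returns {'error': 'Timeout exceeded'} almost immediately.
    print(asyncio.run(bounded_health_check(per_model_timeout=1)))
```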
Krish Dholakia 2025-01-18 19:47:43 -08:00 committed by GitHub
parent e67f18b153
commit 3a7b13efa2
9 changed files with 111 additions and 12 deletions


@@ -245,6 +245,22 @@ general_settings:
health_check_details: False
```
## Health Check Timeout
The health check timeout defaults to 60 seconds and is defined in `litellm/constants.py` (`HEALTH_CHECK_TIMEOUT_SECONDS`).
You can override it per model by setting `health_check_timeout` in the model's `model_info` section of your config.yaml:
```yaml
model_list:
  - model_name: openai/gpt-4o
    litellm_params:
      model: openai/gpt-4o
      api_key: os.environ/OPENAI_API_KEY
    model_info:
      health_check_timeout: 10 # 👈 OVERRIDE HEALTH CHECK TIMEOUT
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
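
For illustration only (not part of this commit's diff), a minimal sketch of probing the readiness endpoint from Python; the base URL and port (4000) are assumptions about your deployment:

```python
import requests

# Assumes a locally running LiteLLM proxy; adjust the base URL for your deployment.
response = requests.get("http://localhost:4000/health/readiness")

print(response.status_code)  # expected to be 200 once the proxy is ready to accept requests
print(response.json())       # readiness details returned by the proxy
```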


@@ -140,3 +140,5 @@ BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
BATCH_STATUS_POLL_INTERVAL_SECONDS = 3600 # 1 hour
BATCH_STATUS_POLL_MAX_ATTEMPTS = 24 # for 24 hours
HEALTH_CHECK_TIMEOUT_SECONDS = 60 # 60 seconds

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -6,8 +6,8 @@ model_list:
      api_base: https://exampleopenaiendpoint-production.up.railway.app
  - model_name: openai-o1
    litellm_params:
      model: openai/o1
      model: openai/random_sleep
      api_key: sk-1234
      api_base: https://exampleopenaiendpoint-production.up.railway.app
      mock_timeout: true
      timeout: 3
      api_base: http://0.0.0.0:8090
    model_info:
      health_check_timeout: 1


@@ -8,7 +8,7 @@ from typing import List, Optional
import litellm
logger = logging.getLogger(__name__)
from litellm.constants import HEALTH_CHECK_TIMEOUT_SECONDS
ILLEGAL_DISPLAY_PARAMS = [
    "messages",
@@ -62,10 +62,28 @@ def filter_deployments_by_id(
    return filtered_deployments


async def run_with_timeout(task, timeout):
    try:
        return await asyncio.wait_for(task, timeout)
    except asyncio.TimeoutError:
        task.cancel()
        # Only cancel child tasks of the current task
        current_task = asyncio.current_task()
        for t in asyncio.all_tasks():
            if t != current_task:
                t.cancel()
        try:
            await asyncio.wait_for(task, 0.1)  # Give 100ms for cleanup
        except (asyncio.TimeoutError, asyncio.CancelledError, Exception):
            pass
        return {"error": "Timeout exceeded"}
async def _perform_health_check(model_list: list, details: Optional[bool] = True):
    """
    Perform a health check for each model in the list.
    """
    tasks = []
    for model in model_list:
        litellm_params = model["litellm_params"]
@@ -74,16 +92,21 @@ async def _perform_health_check(model_list: list, details: Optional[bool] = True
        litellm_params = _update_litellm_params_for_health_check(
            model_info, litellm_params
        )
        tasks.append(
        timeout = model_info.get("health_check_timeout") or HEALTH_CHECK_TIMEOUT_SECONDS
        task = run_with_timeout(
            litellm.ahealth_check(
                litellm_params,
                model["litellm_params"],
                mode=mode,
                prompt="test from litellm",
                input=["test from litellm"],
            )
            ),
            timeout,
        )
    results = await asyncio.gather(*tasks)
        tasks.append(task)

    results = await asyncio.gather(*tasks, return_exceptions=True)

    healthy_endpoints = []
    unhealthy_endpoints = []


@@ -81,6 +81,16 @@ model_list:
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  - model_name: bad-model
    litellm_params:
      model: openai/bad-model
      api_key: os.environ/OPENAI_API_KEY
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      mock_timeout: True
      timeout: 60
      rpm: 1000
    model_info:
      health_check_timeout: 1
  - model_name: "*"
    litellm_params:
      model: openai/*


@@ -297,3 +297,54 @@ async def test_perform_health_check_with_health_check_model():
    assert healthy_endpoints[0]["model"] == "openai/gpt-4o-mini"
    assert len(healthy_endpoints) == 1
    assert len(unhealthy_endpoints) == 0


@pytest.mark.asyncio
async def test_health_check_bad_model():
    from litellm.proxy.health_check import _perform_health_check
    import time

    model_list = [
        {
            "model_name": "openai-gpt-4o",
            "litellm_params": {
                "api_key": "sk-1234",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app",
                "model": "openai/my-fake-openai-endpoint",
                "mock_timeout": True,
                "timeout": 60,
            },
            "model_info": {
                "id": "ca27ca2eeea2f9e38bb274ead831948a26621a3738d06f1797253f0e6c4278c0",
                "db_model": False,
                "health_check_timeout": 1,
            },
        },
    ]
    details = None
    healthy_endpoints, unhealthy_endpoints = await _perform_health_check(
        model_list, details
    )
    print(f"healthy_endpoints: {healthy_endpoints}")
    print(f"unhealthy_endpoints: {unhealthy_endpoints}")

    # Track which model is actually used in the health check
    health_check_calls = []

    async def mock_health_check(litellm_params, **kwargs):
        health_check_calls.append(litellm_params["model"])
        await asyncio.sleep(10)
        return {"status": "healthy"}

    with patch(
        "litellm.ahealth_check", side_effect=mock_health_check
    ) as mock_health_check:
        start_time = time.time()
        healthy_endpoints, unhealthy_endpoints = await _perform_health_check(model_list)
        end_time = time.time()

        print("health check calls: ", health_check_calls)

        assert len(healthy_endpoints) == 0
        assert len(unhealthy_endpoints) == 1
        assert (
            end_time - start_time < 2
        ), "Health check took longer than health_check_timeout"