[Fix] Router cooldown logic - use % thresholds instead of allowed fails to cooldown deployments (#5698)

* move cooldown logic to its own helper

* add new folder for tracking deployment metrics

* increment success/fail counts for deployment in current minute

* fix cooldown logic

* fix test_aaarouter_dynamic_cooldown_message_retry_time

* fix test_single_deployment_no_cooldowns_test_prod_mock_completion_calls

* clean up get from deployment test

* fix _async_get_healthy_deployments

* add mock InternalServerError

* test deployment failing 25% of requests

* add test_high_traffic_cooldowns_one_bad_deployment

* fix vertex load test

* add test for rate limit error models in cool down

* change default cooldown time

* fix cooldown message time

* fix cooldown on 429 error

* fix docstring for _should_cooldown_deployment

* fix sync cooldown logic in router
Ishaan Jaff, 2024-09-14 18:01:19 -07:00 (committed by GitHub)
commit 8f155327f6 (parent fc0dd3e3c2)
11 changed files with 836 additions and 175 deletions
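Taken together, the commits replace the router's fixed "allowed fails" counter with a failure-rate check over each deployment's traffic in the current minute: successes and failures are tracked per minute, and a deployment is cooled down only when its failure percentage crosses a threshold. A minimal sketch of that idea, assuming a 50% threshold and simplified per-minute counters (the real helper is `_should_cooldown_deployment`; everything below is an illustrative stand-in, not the actual litellm implementation):

```python
# Illustrative sketch only: the names, counters, and 50% threshold are
# assumptions; litellm's real logic lives in _should_cooldown_deployment.
DEFAULT_FAILURE_THRESHOLD = 0.5  # assumed: cool down at >= 50% failed requests


def should_cooldown_deployment(
    successes_this_minute: int,
    failures_this_minute: int,
    threshold: float = DEFAULT_FAILURE_THRESHOLD,
) -> bool:
    """Decide cooldown from the failure *rate*, not a fixed fail count."""
    total = successes_this_minute + failures_this_minute
    if total == 0:
        return False  # no traffic this minute, nothing to judge
    return (failures_this_minute / total) >= threshold
```

This is why the new `test_high_traffic_cooldowns_one_bad_deployment` test matters: under high traffic, a deployment failing 25% of requests would quickly exhaust any small fixed fail allowance, while a percentage threshold only removes it when its failure rate is genuinely high.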


@@ -28,6 +28,10 @@ from pydantic import BaseModel
 import litellm
 from litellm import Router
 from litellm.router import Deployment, LiteLLM_Params, ModelInfo
+from litellm.router_utils.cooldown_handlers import (
+    _async_get_cooldown_deployments,
+    _get_cooldown_deployments,
+)
 from litellm.types.router import DeploymentTypedDict

 load_dotenv()
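The helpers imported in the hunk above, `_get_cooldown_deployments` and `_async_get_cooldown_deployments`, are now module-level functions in `litellm.router_utils.cooldown_handlers` that take the router instance explicitly; the later hunks in this test replace the old private `Router` methods with them. The call pattern, assuming a configured `Router` instance named `router`:

```python
from litellm.router_utils.cooldown_handlers import (
    _async_get_cooldown_deployments,
    _get_cooldown_deployments,
)

# Sync call site (replaces router._get_cooldown_deployments()):
cooldown_deployments = _get_cooldown_deployments(litellm_router_instance=router)

# Async call site, inside an async function
# (replaces await router._async_get_cooldown_deployments()):
# cooldown_deployments = await _async_get_cooldown_deployments(
#     litellm_router_instance=router
# )
```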
@@ -2265,6 +2269,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
     {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
     ```
     """
+    litellm.set_verbose = True
     router = Router(
         model_list=[
             {
@@ -2279,7 +2284,9 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
                     "model": "openai/text-embedding-ada-002",
                 },
             },
-        ]
+        ],
+        set_verbose=True,
+        debug_level="DEBUG",
     )

     openai_client = openai.OpenAI(api_key="")
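Two notes on the hunks that follow: the mocked-failure loop drops from two iterations to one, which is consistent with the "fix cooldown on 429 error" commit above (a single rate-limit error appears sufficient to trigger cooldown under the new logic), and the cooldown-list lookups switch from the removed `Router` methods to the module-level helpers shown earlier.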
@@ -2300,7 +2307,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
         "create",
         side_effect=_return_exception,
     ):
-        for _ in range(2):
+        for _ in range(1):
             try:
                 if sync_mode:
                     router.embedding(
@@ -2318,9 +2325,13 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
                 pass

         if sync_mode:
-            cooldown_deployments = router._get_cooldown_deployments()
+            cooldown_deployments = _get_cooldown_deployments(
+                litellm_router_instance=router
+            )
         else:
-            cooldown_deployments = await router._async_get_cooldown_deployments()
+            cooldown_deployments = await _async_get_cooldown_deployments(
+                litellm_router_instance=router
+            )
         print(
             "Cooldown deployments - {}\n{}".format(
                 cooldown_deployments, len(cooldown_deployments)