mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-27 11:43:54 +00:00
[Fix] Router cooldown logic - use % thresholds instead of allowed fails to cooldown deployments (#5698)
* move cooldown logic to its own helper * add new track deployment metrics folder * increment success, fails for deployment in current minute * fix cooldown logic * fix test_aaarouter_dynamic_cooldown_message_retry_time * fix test_single_deployment_no_cooldowns_test_prod_mock_completion_calls * clean up get from deployment test * fix _async_get_healthy_deployments * add mock InternalServerError * test deployment failing 25% requests * add test_high_traffic_cooldowns_one_bad_deployment * fix vertex load test * add test for rate limit error models in cool down * change default cooldown time * fix cooldown message time * fix cooldown on 429 error * fix doc string for _should_cooldown_deployment * fix sync cooldown logic router
This commit is contained in:
parent
fc0dd3e3c2
commit
8f155327f6
11 changed files with 836 additions and 175 deletions
|
@ -28,6 +28,10 @@ from pydantic import BaseModel
|
|||
import litellm
|
||||
from litellm import Router
|
||||
from litellm.router import Deployment, LiteLLM_Params, ModelInfo
|
||||
from litellm.router_utils.cooldown_handlers import (
|
||||
_async_get_cooldown_deployments,
|
||||
_get_cooldown_deployments,
|
||||
)
|
||||
from litellm.types.router import DeploymentTypedDict
|
||||
|
||||
load_dotenv()
|
||||
|
@ -2265,6 +2269,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
|||
{"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
|
||||
```
|
||||
"""
|
||||
litellm.set_verbose = True
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
|
@ -2279,7 +2284,9 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
|||
"model": "openai/text-embedding-ada-002",
|
||||
},
|
||||
},
|
||||
]
|
||||
],
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG",
|
||||
)
|
||||
|
||||
openai_client = openai.OpenAI(api_key="")
|
||||
|
@ -2300,7 +2307,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
|||
"create",
|
||||
side_effect=_return_exception,
|
||||
):
|
||||
for _ in range(2):
|
||||
for _ in range(1):
|
||||
try:
|
||||
if sync_mode:
|
||||
router.embedding(
|
||||
|
@ -2318,9 +2325,13 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
|||
pass
|
||||
|
||||
if sync_mode:
|
||||
cooldown_deployments = router._get_cooldown_deployments()
|
||||
cooldown_deployments = _get_cooldown_deployments(
|
||||
litellm_router_instance=router
|
||||
)
|
||||
else:
|
||||
cooldown_deployments = await router._async_get_cooldown_deployments()
|
||||
cooldown_deployments = await _async_get_cooldown_deployments(
|
||||
litellm_router_instance=router
|
||||
)
|
||||
print(
|
||||
"Cooldown deployments - {}\n{}".format(
|
||||
cooldown_deployments, len(cooldown_deployments)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue