[Fix] Router cooldown logic - use % thresholds instead of allowed fails to cooldown deployments (#5698)

* move cooldown logic to it's own helper

* add new track deployment metrics folder

* increment success, fails for deployment in current minute

* fix cooldown logic

* fix test_aaarouter_dynamic_cooldown_message_retry_time

* fix test_single_deployment_no_cooldowns_test_prod_mock_completion_calls

* clean up get from deployment test

* fix _async_get_healthy_deployments

* add mock InternalServerError

* test deployment failing 25% requests

* add test_high_traffic_cooldowns_one_bad_deployment

* fix vertex load test

* add test for rate limit error models in cool down

* change default cooldown time

* fix cooldown message time

* fix cooldown on 429 error

* fix doc string for _should_cooldown_deployment

* fix sync cooldown logic router
This commit is contained in:
Ishaan Jaff 2024-09-14 18:01:19 -07:00 committed by GitHub
parent 7c2ddba6c6
commit c8d15544c8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 836 additions and 175 deletions

View file

@ -0,0 +1,91 @@
"""
Helper functions to get/set num success and num failures per deployment
set_deployment_failures_for_current_minute
set_deployment_successes_for_current_minute
get_deployment_failures_for_current_minute
get_deployment_successes_for_current_minute
"""
from typing import TYPE_CHECKING, Any, Callable, Optional
from litellm.utils import get_utc_datetime
if TYPE_CHECKING:
from litellm.router import Router as _Router
LitellmRouter = _Router
else:
LitellmRouter = Any
def increment_deployment_successes_for_current_minute(
litellm_router_instance: LitellmRouter,
deployment_id: str,
):
"""
In-Memory: Increments the number of successes for the current minute for a deployment_id
"""
key = f"{deployment_id}:successes"
litellm_router_instance.cache.increment_cache(
local_only=True,
key=key,
value=1,
ttl=60,
)
def increment_deployment_failures_for_current_minute(
litellm_router_instance: LitellmRouter,
deployment_id: str,
):
"""
In-Memory: Increments the number of failures for the current minute for a deployment_id
"""
key = f"{deployment_id}:fails"
litellm_router_instance.cache.increment_cache(
local_only=True,
key=key,
value=1,
ttl=60,
)
def get_deployment_successes_for_current_minute(
litellm_router_instance: LitellmRouter,
deployment_id: str,
) -> int:
"""
Returns the number of successes for the current minute for a deployment_id
Returns 0 if no value found
"""
key = f"{deployment_id}:successes"
return (
litellm_router_instance.cache.get_cache(
local_only=True,
key=key,
)
or 0
)
def get_deployment_failures_for_current_minute(
litellm_router_instance: LitellmRouter,
deployment_id: str,
) -> int:
"""
Returns the number of fails for the current minute for a deployment_id
Returns 0 if no value found
"""
key = f"{deployment_id}:fails"
return (
litellm_router_instance.cache.get_cache(
local_only=True,
key=key,
)
or 0
)