[Fix] Router cooldown logic - use % thresholds instead of allowed fails to cooldown deployments (#5698)

* move cooldown logic to its own helper

* add new track deployment metrics folder

* increment success, fails for deployment in current minute

* fix cooldown logic

* fix test_aaarouter_dynamic_cooldown_message_retry_time

* fix test_single_deployment_no_cooldowns_test_prod_mock_completion_calls

* clean up get from deployment test

* fix _async_get_healthy_deployments

* add mock InternalServerError

* test deployment failing 25% requests

* add test_high_traffic_cooldowns_one_bad_deployment

* fix vertex load test

* add test for rate limit error models in cooldown

* change default cooldown time

* fix cooldown message time

* fix cooldown on 429 error

* fix docstring for _should_cooldown_deployment

* fix sync cooldown logic router
This commit is contained in:
Ishaan Jaff 2024-09-14 18:01:19 -07:00 committed by GitHub
parent fc0dd3e3c2
commit 8f155327f6
11 changed files with 836 additions and 175 deletions

View file

@ -83,7 +83,7 @@ class CooldownCache:
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
# Retrieve the values for the keys using mget
results = await self.cache.async_batch_get_cache(keys=keys)
results = await self.cache.async_batch_get_cache(keys=keys) or []
active_cooldowns = []
# Process the results
@ -101,7 +101,7 @@ class CooldownCache:
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
# Retrieve the values for the keys using mget
results = self.cache.batch_get_cache(keys=keys)
results = self.cache.batch_get_cache(keys=keys) or []
active_cooldowns = []
# Process the results
@ -119,17 +119,19 @@ class CooldownCache:
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
# Retrieve the values for the keys using mget
results = self.cache.batch_get_cache(keys=keys)
results = self.cache.batch_get_cache(keys=keys) or []
min_cooldown_time = self.default_cooldown_time
min_cooldown_time: Optional[float] = None
# Process the results
for model_id, result in zip(model_ids, results):
if result and isinstance(result, dict):
cooldown_cache_value = CooldownCacheValue(**result) # type: ignore
if cooldown_cache_value["cooldown_time"] < min_cooldown_time:
if min_cooldown_time is None:
min_cooldown_time = cooldown_cache_value["cooldown_time"]
elif cooldown_cache_value["cooldown_time"] < min_cooldown_time:
min_cooldown_time = cooldown_cache_value["cooldown_time"]
return min_cooldown_time
return min_cooldown_time or self.default_cooldown_time
# Usage example: