fix(router.py): cooldown on 404 errors

https://github.com/BerriAI/litellm/issues/3884
Krrish Dholakia 2024-05-30 10:57:38 -07:00
parent 3167bee25a
commit 32bfb685f5
2 changed files with 13 additions and 33 deletions


@@ -1,41 +1,15 @@
-general_settings:
-  alert_to_webhook_url:
-    budget_alerts: https://hooks.slack.com/services/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
-    daily_reports: https://hooks.slack.com/services/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
-    db_exceptions: https://hooks.slack.com/services/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
-    llm_exceptions: https://hooks.slack.com/services/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
-    llm_requests_hanging: https://hooks.slack.com/services/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
-    llm_too_slow: https://hooks.slack.com/services/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
-    outage_alerts: https://hooks.slack.com/services/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
-  alert_types:
-  - llm_exceptions
-  - llm_too_slow
-  - llm_requests_hanging
-  - budget_alerts
-  - db_exceptions
-  - daily_reports
-  - spend_reports
-  - cooldown_deployment
-  - new_model_added
-  - outage_alerts
-  alerting:
-  - slack
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  ui_access_mode: all
-# litellm_settings:
-#   json_logs: true
 model_list:
 - litellm_params:
     api_base: http://0.0.0.0:8080
     api_key: ''
     model: openai/my-fake-model
-    rpm: 800
   model_name: gpt-3.5-turbo-fake-model
 - litellm_params:
     api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
     api_key: os.environ/AZURE_EUROPE_API_KEY
     model: azure/gpt-35-turbo
-    rpm: 10
   model_name: gpt-3.5-turbo-fake-model
 - litellm_params:
     api_base: https://openai-gpt-4-test-v-1.openai.azure.com/


@@ -103,7 +103,9 @@ class Router:
         allowed_fails: Optional[
             int
         ] = None,  # Number of times a deployment can failbefore being added to cooldown
-        cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
+        cooldown_time: Optional[
+            float
+        ] = None,  # (seconds) time to cooldown a deployment after failure
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
@@ -248,7 +250,7 @@ class Router:
         )  # initialize an empty list - to allow _add_deployment and delete_deployment to work
         self.allowed_fails = allowed_fails or litellm.allowed_fails
-        self.cooldown_time = cooldown_time or 1
+        self.cooldown_time = cooldown_time or 60
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
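In practice, a Router built without an explicit cooldown_time now keeps a failing deployment out of rotation for 60 seconds instead of 1. A minimal sketch, assuming the standard litellm.Router constructor and reusing the fake OpenAI-compatible deployment from the config above; the values are illustrative, not taken from the commit:

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo-fake-model",
                "litellm_params": {
                    "model": "openai/my-fake-model",
                    "api_base": "http://0.0.0.0:8080",
                    "api_key": "",
                },
            },
        ],
        allowed_fails=1,   # failures tolerated per deployment within one minute
        cooldown_time=60,  # seconds; matches the new default when left unset
    )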
@@ -1850,7 +1852,8 @@ class Router:
                     )
                     await asyncio.sleep(_timeout)
             try:
-                original_exception.message += f"\nNumber Retries = {current_attempt}"
+                cooldown_deployments = await self._async_get_cooldown_deployments()
+                original_exception.message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
             except:
                 pass
             raise original_exception
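Once the retry loop is exhausted, the re-raised exception now also reports how many attempts were made, the configured maximum, and which deployments are currently cooling down. A rough illustration of the appended text, with hypothetical values; only the f-string mirrors the code above:

    current_attempt, num_retries = 2, 3
    cooldown_deployments = ["my-deployment-id"]  # hypothetical deployment id
    message = "ServiceUnavailableError: deployment unreachable"
    message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
    print(message)
    # ServiceUnavailableError: deployment unreachable
    # Number Retries = 3, Max Retries=3
    # Cooldown Deployments=['my-deployment-id']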
@@ -2143,7 +2146,7 @@ class Router:
                     )
                 )
-                if _time_to_cooldown < 0:
+                if _time_to_cooldown is None or _time_to_cooldown < 0:
                     # if the response headers did not read it -> set to default cooldown time
                     _time_to_cooldown = self.cooldown_time
@@ -2239,6 +2242,9 @@ class Router:
             elif exception_status == 408:
                 return True
+            elif exception_status == 404:
+                return True
             else:
                 # Do NOT cool down all other 4XX Errors
                 return False
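Most 4xx responses are treated as caller errors and deliberately do not cool a deployment down, but a 408 (request timeout) and, with this commit, a 404 (the routed deployment or model does not exist) point at the deployment itself, so it gets pulled from rotation. A sketch of just the branch shown above as a standalone predicate; statuses the real method handles earlier (e.g. rate limits) are out of scope, and the non-4xx fallthrough is an assumption made for this illustration:

    def should_cooldown_for_status(exception_status: int) -> bool:
        if 400 <= exception_status < 500:
            if exception_status == 408:  # request timeout
                return True
            if exception_status == 404:  # deployment/model not found (this commit)
                return True
            # do NOT cool down all other 4XX errors
            return False
        # assumed for this sketch: non-4xx errors keep their existing behaviour
        return True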
@@ -2264,6 +2270,7 @@ class Router:
         the exception is not one that should be immediately retried (e.g. 401)
         """
+        args = locals()
         if deployment is None:
             return
@@ -2296,7 +2303,6 @@ class Router:
             )
             exception_status = 500
         _should_retry = litellm._should_retry(status_code=exception_status)
-
         if updated_fails > self.allowed_fails or _should_retry == False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
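The surrounding callback counts failures per deployment within the current minute; once a deployment exceeds allowed_fails, or the error is one that should not be retried, it is appended to a per-minute cooldown key, so a single cache entry (and a single Redis call) covers every deployment cooled down in that minute. A minimal sketch of that bookkeeping, assuming a cache object exposing get_cache/set_cache like the router's in-memory/dual cache; apart from the cooldown key format taken from the hunk above, the names here are hypothetical:

    from datetime import datetime

    def add_to_cooldown_if_needed(cache, deployment_id, updated_fails,
                                  allowed_fails, should_retry, cooldown_seconds):
        current_minute = datetime.now().strftime("%H-%M")
        if updated_fails > allowed_fails or should_retry is False:
            # group cooldown models by minute to reduce the number of Redis calls
            cooldown_key = f"{current_minute}:cooldown_models"
            cooled = cache.get_cache(key=cooldown_key) or []
            if deployment_id not in cooled:
                cooled.append(deployment_id)
                cache.set_cache(key=cooldown_key, value=cooled, ttl=cooldown_seconds)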