mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-27 11:43:54 +00:00
fix(router.py): cooldown on 404 errors
https://github.com/BerriAI/litellm/issues/3884
This commit is contained in:
parent
3167bee25a
commit
32bfb685f5
2 changed files with 13 additions and 33 deletions
|
@@ -1,41 +1,15 @@
|
||||||
general_settings:
|
|
||||||
alert_to_webhook_url:
|
|
||||||
budget_alerts: https://hooks.slack.com/services/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
|
|
||||||
daily_reports: https://hooks.slack.com/services/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
|
|
||||||
db_exceptions: https://hooks.slack.com/services/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
|
|
||||||
llm_exceptions: https://hooks.slack.com/services/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
|
|
||||||
llm_requests_hanging: https://hooks.slack.com/services/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
|
|
||||||
llm_too_slow: https://hooks.slack.com/services/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
|
|
||||||
outage_alerts: https://hooks.slack.com/services/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
|
|
||||||
alert_types:
|
|
||||||
- llm_exceptions
|
|
||||||
- llm_too_slow
|
|
||||||
- llm_requests_hanging
|
|
||||||
- budget_alerts
|
|
||||||
- db_exceptions
|
|
||||||
- daily_reports
|
|
||||||
- spend_reports
|
|
||||||
- cooldown_deployment
|
|
||||||
- new_model_added
|
|
||||||
- outage_alerts
|
|
||||||
alerting:
|
|
||||||
- slack
|
|
||||||
database_connection_pool_limit: 100
|
|
||||||
database_connection_timeout: 60
|
|
||||||
health_check_interval: 300
|
|
||||||
ui_access_mode: all
|
|
||||||
# litellm_settings:
|
|
||||||
# json_logs: true
|
|
||||||
model_list:
|
model_list:
|
||||||
- litellm_params:
|
- litellm_params:
|
||||||
api_base: http://0.0.0.0:8080
|
api_base: http://0.0.0.0:8080
|
||||||
api_key: ''
|
api_key: ''
|
||||||
model: openai/my-fake-model
|
model: openai/my-fake-model
|
||||||
|
rpm: 800
|
||||||
model_name: gpt-3.5-turbo-fake-model
|
model_name: gpt-3.5-turbo-fake-model
|
||||||
- litellm_params:
|
- litellm_params:
|
||||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||||
api_key: os.environ/AZURE_EUROPE_API_KEY
|
api_key: os.environ/AZURE_EUROPE_API_KEY
|
||||||
model: azure/gpt-35-turbo
|
model: azure/gpt-35-turbo
|
||||||
|
rpm: 10
|
||||||
model_name: gpt-3.5-turbo-fake-model
|
model_name: gpt-3.5-turbo-fake-model
|
||||||
- litellm_params:
|
- litellm_params:
|
||||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||||
|
|
|
@@ -103,7 +103,9 @@ class Router:
|
||||||
allowed_fails: Optional[
|
allowed_fails: Optional[
|
||||||
int
|
int
|
||||||
] = None, # Number of times a deployment can fail before being added to cooldown
|
] = None, # Number of times a deployment can fail before being added to cooldown
|
||||||
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
|
cooldown_time: Optional[
|
||||||
|
float
|
||||||
|
] = None, # (seconds) time to cooldown a deployment after failure
|
||||||
routing_strategy: Literal[
|
routing_strategy: Literal[
|
||||||
"simple-shuffle",
|
"simple-shuffle",
|
||||||
"least-busy",
|
"least-busy",
|
||||||
|
@@ -248,7 +250,7 @@ class Router:
|
||||||
) # initialize an empty list - to allow _add_deployment and delete_deployment to work
|
) # initialize an empty list - to allow _add_deployment and delete_deployment to work
|
||||||
|
|
||||||
self.allowed_fails = allowed_fails or litellm.allowed_fails
|
self.allowed_fails = allowed_fails or litellm.allowed_fails
|
||||||
self.cooldown_time = cooldown_time or 1
|
self.cooldown_time = cooldown_time or 60
|
||||||
self.failed_calls = (
|
self.failed_calls = (
|
||||||
InMemoryCache()
|
InMemoryCache()
|
||||||
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
|
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
|
||||||
|
@@ -1850,7 +1852,8 @@ class Router:
|
||||||
)
|
)
|
||||||
await asyncio.sleep(_timeout)
|
await asyncio.sleep(_timeout)
|
||||||
try:
|
try:
|
||||||
original_exception.message += f"\nNumber Retries = {current_attempt}"
|
cooldown_deployments = await self._async_get_cooldown_deployments()
|
||||||
|
original_exception.message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
raise original_exception
|
raise original_exception
|
||||||
|
@@ -2143,7 +2146,7 @@
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if _time_to_cooldown < 0:
|
if _time_to_cooldown is None or _time_to_cooldown < 0:
|
||||||
# if the response headers did not read it -> set to default cooldown time
|
# if the response headers did not read it -> set to default cooldown time
|
||||||
_time_to_cooldown = self.cooldown_time
|
_time_to_cooldown = self.cooldown_time
|
||||||
|
|
||||||
|
@@ -2239,6 +2242,9 @@
|
||||||
elif exception_status == 408:
|
elif exception_status == 408:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
elif exception_status == 404:
|
||||||
|
return True
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Do NOT cool down all other 4XX Errors
|
# Do NOT cool down all other 4XX Errors
|
||||||
return False
|
return False
|
||||||
|
@@ -2264,6 +2270,7 @@
|
||||||
|
|
||||||
the exception is not one that should be immediately retried (e.g. 401)
|
the exception is not one that should be immediately retried (e.g. 401)
|
||||||
"""
|
"""
|
||||||
|
args = locals()
|
||||||
if deployment is None:
|
if deployment is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@@ -2296,7 +2303,6 @@
|
||||||
)
|
)
|
||||||
exception_status = 500
|
exception_status = 500
|
||||||
_should_retry = litellm._should_retry(status_code=exception_status)
|
_should_retry = litellm._should_retry(status_code=exception_status)
|
||||||
|
|
||||||
if updated_fails > self.allowed_fails or _should_retry == False:
|
if updated_fails > self.allowed_fails or _should_retry == False:
|
||||||
# get the current cooldown list for that minute
|
# get the current cooldown list for that minute
|
||||||
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
|
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue