fix(router.py): cooldown on 404 errors

https://github.com/BerriAI/litellm/issues/3884
Krrish Dholakia 2024-05-30 10:57:38 -07:00
parent 3167bee25a
commit 32bfb685f5
2 changed files with 13 additions and 33 deletions


@@ -1,41 +1,15 @@
-general_settings:
-  alert_to_webhook_url:
-    budget_alerts: https://hooks.slack.com/services/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
-    daily_reports: https://hooks.slack.com/services/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
-    db_exceptions: https://hooks.slack.com/services/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
-    llm_exceptions: https://hooks.slack.com/services/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
-    llm_requests_hanging: https://hooks.slack.com/services/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
-    llm_too_slow: https://hooks.slack.com/services/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
-    outage_alerts: https://hooks.slack.com/services/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
-  alert_types:
-  - llm_exceptions
-  - llm_too_slow
-  - llm_requests_hanging
-  - budget_alerts
-  - db_exceptions
-  - daily_reports
-  - spend_reports
-  - cooldown_deployment
-  - new_model_added
-  - outage_alerts
-  alerting:
-  - slack
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  ui_access_mode: all
-# litellm_settings:
-#   json_logs: true
 model_list:
 - litellm_params:
     api_base: http://0.0.0.0:8080
     api_key: ''
     model: openai/my-fake-model
-    rpm: 800
   model_name: gpt-3.5-turbo-fake-model
 - litellm_params:
     api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
     api_key: os.environ/AZURE_EUROPE_API_KEY
     model: azure/gpt-35-turbo
-    rpm: 10
   model_name: gpt-3.5-turbo-fake-model
 - litellm_params:
     api_base: https://openai-gpt-4-test-v-1.openai.azure.com/


@@ -103,7 +103,9 @@ class Router:
         allowed_fails: Optional[
             int
         ] = None,  # Number of times a deployment can failbefore being added to cooldown
-        cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
+        cooldown_time: Optional[
+            float
+        ] = None,  # (seconds) time to cooldown a deployment after failure
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
@@ -248,7 +250,7 @@ class Router:
         )  # initialize an empty list - to allow _add_deployment and delete_deployment to work
         self.allowed_fails = allowed_fails or litellm.allowed_fails
-        self.cooldown_time = cooldown_time or 1
+        self.cooldown_time = cooldown_time or 60
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
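In practice, a Router built without an explicit cooldown_time now keeps a failing deployment out of rotation for 60 seconds instead of 1. A minimal sketch, assuming the standard litellm.Router constructor and reusing the fake OpenAI-compatible deployment from the config above; the values are illustrative, not taken from the commit:

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo-fake-model",
                "litellm_params": {
                    "model": "openai/my-fake-model",
                    "api_base": "http://0.0.0.0:8080",
                    "api_key": "",
                },
            },
        ],
        allowed_fails=1,   # failures tolerated per deployment within one minute
        cooldown_time=60,  # seconds; matches the new default when left unset
    )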
@@ -1850,7 +1852,8 @@ class Router:
                     )
                     await asyncio.sleep(_timeout)
             try:
-                original_exception.message += f"\nNumber Retries = {current_attempt}"
+                cooldown_deployments = await self._async_get_cooldown_deployments()
+                original_exception.message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
             except:
                 pass
             raise original_exception
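Once the retry loop is exhausted, the re-raised exception now also reports how many attempts were made, the configured maximum, and which deployments are currently cooling down. A rough illustration of the appended text, with hypothetical values; only the f-string mirrors the code above:

    current_attempt, num_retries = 2, 3
    cooldown_deployments = ["my-deployment-id"]  # hypothetical deployment id
    message = "ServiceUnavailableError: deployment unreachable"
    message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
    print(message)
    # ServiceUnavailableError: deployment unreachable
    # Number Retries = 3, Max Retries=3
    # Cooldown Deployments=['my-deployment-id']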
@@ -2143,7 +2146,7 @@ class Router:
                     )
                 )
-                if _time_to_cooldown < 0:
+                if _time_to_cooldown is None or _time_to_cooldown < 0:
                     # if the response headers did not read it -> set to default cooldown time
                     _time_to_cooldown = self.cooldown_time
@@ -2239,6 +2242,9 @@ class Router:
             elif exception_status == 408:
                 return True
+            elif exception_status == 404:
+                return True
             else:
                 # Do NOT cool down all other 4XX Errors
                 return False
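Most 4xx responses are treated as caller errors and deliberately do not cool a deployment down, but a 408 (request timeout) and, with this commit, a 404 (the routed deployment or model does not exist) point at the deployment itself, so it gets pulled from rotation. A sketch of just the branch shown above as a standalone predicate; statuses the real method handles earlier (e.g. rate limits) are out of scope, and the non-4xx fallthrough is an assumption made for this illustration:

    def should_cooldown_for_status(exception_status: int) -> bool:
        if 400 <= exception_status < 500:
            if exception_status == 408:  # request timeout
                return True
            if exception_status == 404:  # deployment/model not found (this commit)
                return True
            # do NOT cool down all other 4XX errors
            return False
        # assumed for this sketch: non-4xx errors keep their existing behaviour
        return True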
@@ -2264,6 +2270,7 @@ class Router:
         the exception is not one that should be immediately retried (e.g. 401)
         """
+        args = locals()
         if deployment is None:
             return
@@ -2296,7 +2303,6 @@ class Router:
             )
             exception_status = 500
         _should_retry = litellm._should_retry(status_code=exception_status)
-
         if updated_fails > self.allowed_fails or _should_retry == False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
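The surrounding callback counts failures per deployment within the current minute; once a deployment exceeds allowed_fails, or the error is one that should not be retried, it is appended to a per-minute cooldown key, so a single cache entry (and a single Redis call) covers every deployment cooled down in that minute. A minimal sketch of that bookkeeping, assuming a cache object exposing get_cache/set_cache like the router's in-memory/dual cache; apart from the cooldown key format taken from the hunk above, the names here are hypothetical:

    from datetime import datetime

    def add_to_cooldown_if_needed(cache, deployment_id, updated_fails,
                                  allowed_fails, should_retry, cooldown_seconds):
        current_minute = datetime.now().strftime("%H-%M")
        if updated_fails > allowed_fails or should_retry is False:
            # group cooldown models by minute to reduce the number of Redis calls
            cooldown_key = f"{current_minute}:cooldown_models"
            cooled = cache.get_cache(key=cooldown_key) or []
            if deployment_id not in cooled:
                cooled.append(deployment_id)
                cache.set_cache(key=cooldown_key, value=cooled, ttl=cooldown_seconds)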