forked from phoenix/litellm-mirror

fix(slack_alerting.py): don't fire spam alerts when backend api call fails

parent b063ef7a47
commit 13e1577753

4 changed files with 69 additions and 21 deletions
@@ -76,16 +76,14 @@ class SlackAlerting(CustomLogger):
         internal_usage_cache: Optional[DualCache] = None,
         alerting_threshold: float = 300,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: Optional[
-            List[
-                Literal[
-                    "llm_exceptions",
-                    "llm_too_slow",
-                    "llm_requests_hanging",
-                    "budget_alerts",
-                    "db_exceptions",
-                    "daily_reports",
-                ]
+        alert_types: List[
+            Literal[
+                "llm_exceptions",
+                "llm_too_slow",
+                "llm_requests_hanging",
+                "budget_alerts",
+                "db_exceptions",
+                "daily_reports",
+            ]
+        ] = [
+            "llm_exceptions",

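The effect of dropping Optional here is that alert_types is always a concrete list, so membership checks like the one removed further down never have to handle None. Below is a minimal sketch of constructing the alerter with an explicit subset of alert types; the constructor arguments are taken from the signature above, while both import paths are assumptions about where these classes live.

from litellm.caching import DualCache  # assumed location of DualCache
from litellm.integrations.slack_alerting import SlackAlerting  # assumed module path

slack_alerting = SlackAlerting(
    internal_usage_cache=DualCache(),
    alerting_threshold=300,  # seconds before a request counts as slow / hanging
    alerting=["slack"],
    # alert_types is now a plain List (no longer Optional), so passing an
    # explicit subset leaves every alert type not named here switched off:
    alert_types=["llm_too_slow", "budget_alerts", "daily_reports"],
)
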
@@ -812,14 +810,6 @@ Model Info:
                     updated_at=litellm.utils.get_utc_datetime(),
                 )
             )
-        if "llm_exceptions" in self.alert_types:
-            original_exception = kwargs.get("exception", None)
-
-            await self.send_alert(
-                message="LLM API Failure - " + str(original_exception),
-                level="High",
-                alert_type="llm_exceptions",
-            )
 
     async def _run_scheduler_helper(self, llm_router) -> bool:
         """

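This is the core of the fix: the failure handler no longer pushes a Slack message for every failed backend API call, which is what produced the spam alerts named in the commit title. If an exception alert is still wanted at some call site, it can be sent explicitly with the same send_alert signature shown in the deleted lines. The helper below is a hypothetical illustration, with slack_alerting standing in for an already configured SlackAlerting instance.

async def report_llm_failure(slack_alerting, original_exception: Exception) -> None:
    # Same gate the removed code used: only fire when the operator opted in.
    if "llm_exceptions" in slack_alerting.alert_types:
        await slack_alerting.send_alert(
            message="LLM API Failure - " + str(original_exception),
            level="High",
            alert_type="llm_exceptions",
        )
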
@@ -8,6 +8,16 @@ model_list:
       base_model: text-embedding-ada-002
       mode: embedding
     model_name: text-embedding-ada-002
+  - model_name: gpt-3.5-turbo-012
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_base: http://0.0.0.0:8080
+      api_key: ""
+  - model_name: gpt-3.5-turbo-0125-preview
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_key: os.environ/AZURE_API_KEY
+      api_base: os.environ/AZURE_API_BASE
 
 router_settings:
   redis_host: redis

@@ -17,6 +27,7 @@ router_settings:
 
 litellm_settings:
   set_verbose: True
+  fallbacks: [{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}]
   # service_callback: ["prometheus_system"]
   # success_callback: ["prometheus"]
   # failure_callback: ["prometheus"]

@@ -26,3 +37,4 @@ general_settings:
   disable_reset_budget: True
   proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
   routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  alerting: ["slack"]

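The new gpt-3.5-turbo-012 entry points its api_base at http://0.0.0.0:8080, a locally running stub backend rather than a real provider. Nothing in this commit defines that stub; below is a minimal sketch of one, assuming all it has to do is answer every request with HTTP 503 so that calls to this deployment fail and the configured fallback to gpt-3.5-turbo-0125-preview is exercised.

# Hypothetical helper, not part of the commit: a local stub on 0.0.0.0:8080
# that answers every POST with 503 so the routed request fails over.
from http.server import BaseHTTPRequestHandler, HTTPServer

class AlwaysUnavailable(BaseHTTPRequestHandler):
    def do_POST(self):
        body = b'{"error": {"message": "service unavailable", "type": "server_error"}}'
        self.send_response(503)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8080), AlwaysUnavailable).serve_forever()
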
@@ -1413,7 +1413,7 @@ class Router:
                 verbose_router_logger.debug(f"Trying to fallback b/w models")
                 if (
                     hasattr(e, "status_code")
-                    and e.status_code == 400
+                    and e.status_code == 400  # type: ignore
                     and not isinstance(e, litellm.ContextWindowExceededError)
                 ):  # don't retry a malformed request
                     raise e

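The only change here is the "# type: ignore" on the status-code comparison; the guard itself is unchanged. Restated as a standalone predicate (a hypothetical helper, purely for illustration): a 400 means the request itself is malformed, so falling back to another deployment would just repeat the same failure, except for context-window errors, where a model with a larger context might still succeed.

import litellm

def should_reraise_without_fallback(e: Exception) -> bool:
    # Mirrors the condition in the hunk above.
    return (
        getattr(e, "status_code", None) == 400
        and not isinstance(e, litellm.ContextWindowExceededError)
    )
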
@@ -3648,7 +3648,7 @@ class Router:
                     )
                     asyncio.create_task(
                         proxy_logging_obj.slack_alerting_instance.send_alert(
-                            message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
+                            message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_failes' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                             alert_type="cooldown_deployment",
                             level="Low",
                         )

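The cooldown alert now tells the operator which settings to tune instead of only reporting the exception. Structurally it stays a fire-and-forget notification: scheduling send_alert with asyncio.create_task keeps Slack I/O off the request path, so a slow or failed alert can never fail the request. A condensed sketch of that pattern, with slack_alerting standing in for the configured alerting instance (hypothetical names; the keyword arguments mirror the hunk above):

import asyncio

async def cool_down_deployment(slack_alerting, api_base: str, cooldown_time: float, exception_status) -> None:
    # ... mark the deployment as cooling down here ...
    # Fire-and-forget: uses the event loop this coroutine is already running in.
    asyncio.create_task(
        slack_alerting.send_alert(
            message=(
                f"Router: Cooling down deployment: {api_base}, for {cooldown_time} seconds. "
                f"Got exception: {exception_status}"
            ),
            alert_type="cooldown_deployment",
            level="Low",
        )
    )
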
@@ -961,3 +961,49 @@ def test_custom_cooldown_times():
 
     except Exception as e:
         print(e)
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_service_unavailable_fallbacks(sync_mode):
+    """
+    Initial model - openai
+    Fallback - azure
+
+    Error - 503, service unavailable
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo-012",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": "anything",
+                    "api_base": "http://0.0.0.0:8080",
+                },
+            },
+            {
+                "model_name": "gpt-3.5-turbo-0125-preview",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            },
+        ],
+        fallbacks=[{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}],
+    )
+
+    if sync_mode:
+        response = router.completion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+    else:
+        response = await router.acompletion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+    assert response.model == "gpt-35-turbo"

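To run this test outside CI, something has to be listening on http://0.0.0.0:8080 and failing. Below is a hypothetical bit of glue, not part of the commit, that runs a stub handler such as the 503 stub sketched earlier in a background thread before the router call is made.

import threading
from http.server import HTTPServer

def start_stub_backend(handler_cls, host: str = "0.0.0.0", port: int = 8080) -> HTTPServer:
    # handler_cls is a BaseHTTPRequestHandler subclass, e.g. the AlwaysUnavailable stub above.
    server = HTTPServer((host, port), handler_cls)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    return server

The closing assertion checks response.model == "gpt-35-turbo" because Azure reports the deployment's underlying model name without dots, which is how the test confirms that the azure/chatgpt-v-2 fallback, and not the unavailable primary deployment, actually served the request.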