diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index d03922bc1..b06a22920 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -76,16 +76,14 @@ class SlackAlerting(CustomLogger):
         internal_usage_cache: Optional[DualCache] = None,
         alerting_threshold: float = 300,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: Optional[
-            List[
-                Literal[
-                    "llm_exceptions",
-                    "llm_too_slow",
-                    "llm_requests_hanging",
-                    "budget_alerts",
-                    "db_exceptions",
-                    "daily_reports",
-                ]
+        alert_types: List[
+            Literal[
+                "llm_exceptions",
+                "llm_too_slow",
+                "llm_requests_hanging",
+                "budget_alerts",
+                "db_exceptions",
+                "daily_reports",
             ]
         ] = [
             "llm_exceptions",
@@ -812,14 +810,6 @@ Model Info:
                     updated_at=litellm.utils.get_utc_datetime(),
                 )
             )
-        if "llm_exceptions" in self.alert_types:
-            original_exception = kwargs.get("exception", None)
-
-            await self.send_alert(
-                message="LLM API Failure - " + str(original_exception),
-                level="High",
-                alert_type="llm_exceptions",
-            )
 
     async def _run_scheduler_helper(self, llm_router) -> bool:
         """
diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 86037caf7..b83883beb 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -8,6 +8,16 @@ model_list:
     base_model: text-embedding-ada-002
     mode: embedding
   model_name: text-embedding-ada-002
+- model_name: gpt-3.5-turbo-012
+  litellm_params:
+    model: gpt-3.5-turbo
+    api_base: http://0.0.0.0:8080
+    api_key: ""
+- model_name: gpt-3.5-turbo-0125-preview
+  litellm_params:
+    model: azure/chatgpt-v-2
+    api_key: os.environ/AZURE_API_KEY
+    api_base: os.environ/AZURE_API_BASE
 
 router_settings:
   redis_host: redis
@@ -17,6 +27,7 @@
 
 litellm_settings:
   set_verbose: True
+  fallbacks: [{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}]
  # service_callback: ["prometheus_system"]
  # success_callback: ["prometheus"]
  # failure_callback: ["prometheus"]
@@ -25,4 +36,5 @@ general_settings:
   enable_jwt_auth: True
   disable_reset_budget: True
   proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
-  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
\ No newline at end of file
+  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  alerting: ["slack"]
diff --git a/litellm/router.py b/litellm/router.py
index ad11dc98e..b345a2f25 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1413,7 +1413,7 @@ class Router:
             verbose_router_logger.debug(f"Trying to fallback b/w models")
             if (
                 hasattr(e, "status_code")
-                and e.status_code == 400
+                and e.status_code == 400  # type: ignore
                 and not isinstance(e, litellm.ContextWindowExceededError)
             ):  # don't retry a malformed request
                 raise e
@@ -3648,7 +3648,7 @@ class Router:
                 )
                 asyncio.create_task(
                     proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
+                        message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                         alert_type="cooldown_deployment",
                         level="Low",
                     )
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index c1035e3e0..ce2b014e9 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -961,3 +961,49 @@ def test_custom_cooldown_times():
 
     except Exception as e:
         print(e)
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_service_unavailable_fallbacks(sync_mode):
+    """
+    Initial model - openai
+    Fallback - azure
+
+    Error - 503, service unavailable
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo-012",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": "anything",
+                    "api_base": "http://0.0.0.0:8080",
+                },
+            },
+            {
+                "model_name": "gpt-3.5-turbo-0125-preview",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            },
+        ],
+        fallbacks=[{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}],
+    )
+
+    if sync_mode:
+        response = router.completion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+    else:
+        response = await router.acompletion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+    assert response.model == "gpt-35-turbo"
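
A minimal usage sketch (not part of the patch): it exercises the new `fallbacks` entry in _super_secret_config.yaml through the proxy, assuming the proxy was started with that config (e.g. `litellm --config litellm/proxy/_super_secret_config.yaml`) and is listening on the default port 4000; the base_url and api_key values below are placeholders.

```python
# Sketch only, not part of the patch. Assumes a litellm proxy started with the
# config above and reachable at http://0.0.0.0:4000 with a placeholder key.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# "gpt-3.5-turbo-012" points at http://0.0.0.0:8080, which is expected to fail
# (e.g. nothing listening / 503), so litellm_settings.fallbacks should route the
# request to the "gpt-3.5-turbo-0125-preview" Azure deployment instead.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-012",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.model)
```

This mirrors what test_service_unavailable_fallbacks checks at the Router level: the response should come back from the Azure deployment (model reported as gpt-35-turbo).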