fix(slack_alerting.py): don't fire spam alerts when backend api call fails

Krrish Dholakia 2024-05-13 10:04:43 -07:00
parent b063ef7a47
commit 13e1577753
4 changed files with 69 additions and 21 deletions

@@ -76,16 +76,14 @@ class SlackAlerting(CustomLogger):
         internal_usage_cache: Optional[DualCache] = None,
         alerting_threshold: float = 300,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: Optional[
-            List[
-                Literal[
-                    "llm_exceptions",
-                    "llm_too_slow",
-                    "llm_requests_hanging",
-                    "budget_alerts",
-                    "db_exceptions",
-                    "daily_reports",
-                ]
+        alert_types: List[
+            Literal[
+                "llm_exceptions",
+                "llm_too_slow",
+                "llm_requests_hanging",
+                "budget_alerts",
+                "db_exceptions",
+                "daily_reports",
             ]
         ] = [
             "llm_exceptions",
@@ -812,14 +810,6 @@ Model Info:
                     updated_at=litellm.utils.get_utc_datetime(),
                 )
             )
-        if "llm_exceptions" in self.alert_types:
-            original_exception = kwargs.get("exception", None)
-
-            await self.send_alert(
-                message="LLM API Failure - " + str(original_exception),
-                level="High",
-                alert_type="llm_exceptions",
-            )

     async def _run_scheduler_helper(self, llm_router) -> bool:
         """

@@ -8,6 +8,16 @@ model_list:
       base_model: text-embedding-ada-002
       mode: embedding
     model_name: text-embedding-ada-002
+  - model_name: gpt-3.5-turbo-012
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_base: http://0.0.0.0:8080
+      api_key: ""
+  - model_name: gpt-3.5-turbo-0125-preview
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_key: os.environ/AZURE_API_KEY
+      api_base: os.environ/AZURE_API_BASE
 
 router_settings:
   redis_host: redis
@@ -17,6 +27,7 @@ router_settings:
 
 litellm_settings:
   set_verbose: True
+  fallbacks: [{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}]
   # service_callback: ["prometheus_system"]
   # success_callback: ["prometheus"]
   # failure_callback: ["prometheus"]
@@ -25,4 +36,5 @@ general_settings:
   enable_jwt_auth: True
   disable_reset_budget: True
   proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
   routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  alerting: ["slack"]
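
With the two new deployments and the fallbacks mapping above, a request to the proxy for the gpt-3.5-turbo-012 alias should be retried on gpt-3.5-turbo-0125-preview when the local backend at port 8080 fails. A rough client-side sketch, assuming the proxy listens on its default port 4000 and uses a placeholder key (both assumptions, not part of this commit):

# Sketch: exercising the fallback route through the proxy.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo-012",  # primary alias; proxy falls back to gpt-3.5-turbo-0125-preview on failure
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.model)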

@@ -1413,7 +1413,7 @@ class Router:
                 verbose_router_logger.debug(f"Trying to fallback b/w models")
                 if (
                     hasattr(e, "status_code")
-                    and e.status_code == 400
+                    and e.status_code == 400  # type: ignore
                     and not isinstance(e, litellm.ContextWindowExceededError)
                 ):  # don't retry a malformed request
                     raise e
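
The added "# type: ignore" silences the type checker, which cannot see that hasattr(e, "status_code") guarantees the attribute exists. A checker-friendly variant (a sketch only, assuming e and litellm are in scope as in the surrounding Router code) would read the attribute defensively:

# Sketch of an alternative to the "# type: ignore" hint.
status_code = getattr(e, "status_code", None)
if status_code == 400 and not isinstance(e, litellm.ContextWindowExceededError):
    raise e  # don't retry a malformed request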
@@ -3648,7 +3648,7 @@ class Router:
                 )
                 asyncio.create_task(
                     proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
+                        message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                         alert_type="cooldown_deployment",
                         level="Low",
                     )
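
The longer alert message now points operators at cooldown_time and allowed_fails. As a sketch (values are illustrative assumptions, not part of this commit), the same knobs can be set directly on the Router:

# Illustrative only: tuning the settings the cooldown alert message references.
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo"},
        }
    ],
    allowed_fails=3,  # failures tolerated per deployment before it is cooled down
    cooldown_time=30,  # seconds a cooled-down deployment is skipped
)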

@@ -961,3 +961,49 @@ def test_custom_cooldown_times():
     except Exception as e:
         print(e)
 
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_service_unavailable_fallbacks(sync_mode):
+    """
+    Initial model - openai
+    Fallback - azure
+
+    Error - 503, service unavailable
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo-012",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": "anything",
+                    "api_base": "http://0.0.0.0:8080",
+                },
+            },
+            {
+                "model_name": "gpt-3.5-turbo-0125-preview",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            },
+        ],
+        fallbacks=[{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}],
+    )
+
+    if sync_mode:
+        response = router.completion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+    else:
+        response = await router.acompletion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+    assert response.model == "gpt-35-turbo"
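
The test points the primary deployment at http://0.0.0.0:8080 and expects a 503 from it, then asserts response.model == "gpt-35-turbo", i.e. that the answer actually came from the Azure fallback. The stub backend itself is not part of this diff; a minimal sketch of what it could look like (framework, route, and port are assumptions):

# Hypothetical local stub for the test above: an OpenAI-compatible endpoint that
# always returns 503 so every call to "gpt-3.5-turbo-012" triggers the fallback.
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import uvicorn

app = FastAPI()


@app.post("/chat/completions")
async def chat_completions():
    # Unconditional 503 - the router should retry on the azure fallback deployment.
    return JSONResponse(status_code=503, content={"error": "service unavailable"})


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)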