forked from phoenix/litellm-mirror
fix(slack_alerting.py): don't fire spam alerts when backend api call fails
This commit is contained in:
parent b063ef7a47
commit 13e1577753
4 changed files with 69 additions and 21 deletions
slack_alerting.py
@@ -76,16 +76,14 @@ class SlackAlerting(CustomLogger):
         internal_usage_cache: Optional[DualCache] = None,
         alerting_threshold: float = 300, # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: Optional[
-            List[
-                Literal[
-                    "llm_exceptions",
-                    "llm_too_slow",
-                    "llm_requests_hanging",
-                    "budget_alerts",
-                    "db_exceptions",
-                    "daily_reports",
-                ]
-            ]
+        alert_types: List[
+            Literal[
+                "llm_exceptions",
+                "llm_too_slow",
+                "llm_requests_hanging",
+                "budget_alerts",
+                "db_exceptions",
+                "daily_reports",
+            ]
         ] = [
             "llm_exceptions",
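The hunk above narrows the annotation from Optional[List[Literal[...]]] to List[Literal[...]], so callers always pass a concrete list instead of None. A minimal sketch of building such a list; the helper name and the default selection below are illustrative, not from the commit:

from typing import List, Literal

AlertType = Literal[
    "llm_exceptions",
    "llm_too_slow",
    "llm_requests_hanging",
    "budget_alerts",
    "db_exceptions",
    "daily_reports",
]

def pick_alert_types(include_daily_reports: bool = False) -> List[AlertType]:
    # With the Optional wrapper gone, a concrete list is always supplied.
    selected: List[AlertType] = ["llm_exceptions", "db_exceptions", "budget_alerts"]
    if include_daily_reports:
        selected.append("daily_reports")
    return selected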
@@ -812,14 +810,6 @@ Model Info:
                 updated_at=litellm.utils.get_utc_datetime(),
             )
         )
-        if "llm_exceptions" in self.alert_types:
-            original_exception = kwargs.get("exception", None)
-
-            await self.send_alert(
-                message="LLM API Failure - " + str(original_exception),
-                level="High",
-                alert_type="llm_exceptions",
-            )
 
     async def _run_scheduler_helper(self, llm_router) -> bool:
         """
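The deleted block is what fired a Slack message on every backend API failure, which is the spam this commit removes. For code that still wants that notification from its own hook, a hedged sketch reusing only the parameters visible in the removed lines; the wrapper name is illustrative and slack_alerting stands for a SlackAlerting instance:

async def alert_on_llm_exception(slack_alerting, original_exception: Exception) -> None:
    # Mirrors the deleted guard: only fire if the alert type is enabled.
    if "llm_exceptions" in slack_alerting.alert_types:
        await slack_alerting.send_alert(
            message="LLM API Failure - " + str(original_exception),
            level="High",
            alert_type="llm_exceptions",
        )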
@@ -8,6 +8,16 @@ model_list:
     base_model: text-embedding-ada-002
     mode: embedding
   model_name: text-embedding-ada-002
+- model_name: gpt-3.5-turbo-012
+  litellm_params:
+    model: gpt-3.5-turbo
+    api_base: http://0.0.0.0:8080
+    api_key: ""
+- model_name: gpt-3.5-turbo-0125-preview
+  litellm_params:
+    model: azure/chatgpt-v-2
+    api_key: os.environ/AZURE_API_KEY
+    api_base: os.environ/AZURE_API_BASE
 
 router_settings:
   redis_host: redis
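Once the proxy is running with these entries, the new alias can be called through it. A minimal usage sketch with the OpenAI client; the base_url, port, and API key are placeholders for a local litellm proxy, not values from this commit:

import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
response = client.chat.completions.create(
    model="gpt-3.5-turbo-012",  # alias added above; its deployment points at http://0.0.0.0:8080
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.model)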
@@ -17,6 +27,7 @@ router_settings:
 
 litellm_settings:
   set_verbose: True
+  fallbacks: [{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}]
   # service_callback: ["prometheus_system"]
   # success_callback: ["prometheus"]
   # failure_callback: ["prometheus"]
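The new fallbacks entry is a list of {primary_model: [fallback_models, ...]} mappings, consulted when the primary deployment errors. A small illustration of how that shape reads; the helper below is illustrative, not litellm's implementation:

fallbacks = [{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}]

def fallback_candidates(model: str) -> list:
    # Return the ordered fallback models declared for the given primary model.
    for mapping in fallbacks:
        if model in mapping:
            return mapping[model]
    return []

assert fallback_candidates("gpt-3.5-turbo-012") == ["gpt-3.5-turbo-0125-preview"]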
@@ -25,4 +36,5 @@ general_settings:
   enable_jwt_auth: True
   disable_reset_budget: True
   proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
   routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  alerting: ["slack"]
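Enabling alerting: ["slack"] assumes a Slack webhook is available to the proxy process. A minimal environment sketch; SLACK_WEBHOOK_URL is the variable litellm's alerting docs reference, and the URL below is a placeholder:

import os

# Placeholder webhook; slack alerting reads it from the environment.
os.environ["SLACK_WEBHOOK_URL"] = "https://hooks.slack.com/services/XXX/YYY/ZZZ"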
@@ -1413,7 +1413,7 @@ class Router:
             verbose_router_logger.debug(f"Trying to fallback b/w models")
             if (
                 hasattr(e, "status_code")
-                and e.status_code == 400
+                and e.status_code == 400  # type: ignore
                 and not isinstance(e, litellm.ContextWindowExceededError)
             ):  # don't retry a malformed request
                 raise e
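The added "# type: ignore" is needed because hasattr() does not narrow the type for static checkers, so accessing e.status_code is still flagged on a bare Exception even though the runtime guard is safe. A hedged standalone illustration of the same pattern, not the Router method itself:

import litellm

def is_malformed_request(e: Exception) -> bool:
    # Runtime-safe thanks to hasattr, but the attribute access still needs the ignore.
    return (
        hasattr(e, "status_code")
        and e.status_code == 400  # type: ignore
        and not isinstance(e, litellm.ContextWindowExceededError)
    )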
@@ -3648,7 +3648,7 @@ class Router:
             )
             asyncio.create_task(
                 proxy_logging_obj.slack_alerting_instance.send_alert(
-                    message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
+                    message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_failes' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                     alert_type="cooldown_deployment",
                     level="Low",
                 )
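The expanded alert message points users at the cooldown knobs. A hedged sketch of setting them, assuming the Router constructor accepts them as keyword arguments; they can equally live under router_settings in the proxy config, and the values below are illustrative:

from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo-012",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "anything",
            "api_base": "http://0.0.0.0:8080",
        },
    }
]

router = Router(
    model_list=model_list,
    allowed_fails=3,   # failures tolerated per deployment before it is cooled down
    cooldown_time=30,  # seconds a cooled-down deployment is skipped
)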
@@ -961,3 +961,49 @@ def test_custom_cooldown_times():
     except Exception as e:
         print(e)
 
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_service_unavailable_fallbacks(sync_mode):
+    """
+    Initial model - openai
+    Fallback - azure
+
+    Error - 503, service unavailable
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo-012",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": "anything",
+                    "api_base": "http://0.0.0.0:8080",
+                },
+            },
+            {
+                "model_name": "gpt-3.5-turbo-0125-preview",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            },
+        ],
+        fallbacks=[{"gpt-3.5-turbo-012": ["gpt-3.5-turbo-0125-preview"]}],
+    )
+
+    if sync_mode:
+        response = router.completion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+    else:
+        response = await router.acompletion(
+            model="gpt-3.5-turbo-012",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+    assert response.model == "gpt-35-turbo"
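The new test points the primary deployment at http://0.0.0.0:8080 and, per its docstring, expects a 503 so the Azure fallback is used. A hedged sketch (not part of the commit) of the kind of local endpoint that api_base would need to hit for the fallback path to trigger:

from http.server import BaseHTTPRequestHandler, HTTPServer

class AlwaysUnavailable(BaseHTTPRequestHandler):
    def do_POST(self):
        # Always answer 503 Service Unavailable so the router falls back.
        self.send_response(503)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(b'{"error": {"message": "service unavailable"}}')

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8080), AlwaysUnavailable).serve_forever()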