diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md
index 558426756..08030f478 100644
--- a/docs/my-website/docs/proxy/alerting.md
+++ b/docs/my-website/docs/proxy/alerting.md
@@ -32,41 +32,33 @@ Get a slack webhook url from https://api.slack.com/messaging/webhooks

 You can also use Discord Webhooks, see [here](#using-discord-webhooks)

-### Step 2: Update config.yaml
-- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
-- Just for testing purposes, let's save a bad key to our proxy.
+Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
+
+```bash
+export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
+```
+
+### Step 2: Setup Proxy

 ```yaml
-model_list:
-    model_name: "azure-model"
-    litellm_params:
-        model: "azure/gpt-35-turbo"
-        api_key: "my-bad-key" # 👈 bad key
-
 general_settings:
   alerting: ["slack"]
   alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
-
-environment_variables:
-    SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
-    SLACK_DAILY_REPORT_FREQUENCY: "86400" # 24 hours; Optional: defaults to 12 hours
 ```

-### Step 3: Start proxy
-
+Start proxy
 ```bash
 $ litellm --config /path/to/config.yaml
 ```

-## Testing Alerting is Setup Correctly
-Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
+### Step 3: Test it!

-```shell
-curl -X GET 'http://localhost:4000/health/services?service=slack' \
- -H 'Authorization: Bearer sk-1234'
+
+```bash
+curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
+-H 'Authorization: Bearer sk-1234'
 ```

 ## Advanced - Redacting Messages from Alerts
@@ -84,7 +76,34 @@ litellm_settings:
 ```


+## Advanced - Add Metadata to alerts
+
+Add alerting metadata to proxy calls for debugging.
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [],
+    extra_body={
+        "metadata": {
+            "alerting_metadata": {
+                "hello": "world"
+            }
+        }
+    }
+)
+```
+
+**Expected Response**
+
+
 ## Advanced - Opting into specific alert types

diff --git a/docs/my-website/img/alerting_metadata.png b/docs/my-website/img/alerting_metadata.png
new file mode 100644
index 000000000..e75f0c72b
Binary files /dev/null and b/docs/my-website/img/alerting_metadata.png differ
diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index f37f5070f..79e1dc1ee 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -330,6 +330,7 @@ class SlackAlerting(CustomLogger):
             messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
         request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
         slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
+        alerting_metadata: dict = {}
         if time_difference_float > self.alerting_threshold:
             # add deployment latencies to alert
             if (
@@ -337,7 +338,7 @@ class SlackAlerting(CustomLogger):
                 and "litellm_params" in kwargs
                 and "metadata" in kwargs["litellm_params"]
             ):
-                _metadata = kwargs["litellm_params"]["metadata"]
+                _metadata: dict = kwargs["litellm_params"]["metadata"]
                 request_info = litellm.utils._add_key_name_and_team_to_alert(
                     request_info=request_info, metadata=_metadata
                 )
@@ -349,10 +350,14 @@ class SlackAlerting(CustomLogger):
                    request_info += (
                        f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
                    )
+
+                if "alerting_metadata" in _metadata:
+                    alerting_metadata = _metadata["alerting_metadata"]
             await self.send_alert(
                 message=slow_message + request_info,
                 level="Low",
                 alert_type="llm_too_slow",
+                alerting_metadata=alerting_metadata,
             )

     async def async_update_daily_reports(
@@ -540,7 +545,12 @@ class SlackAlerting(CustomLogger):
         message += f"\n\nNext Run is at: `{time.time() + self.alerting_args.daily_report_frequency}`s"

         # send alert
-        await self.send_alert(message=message, level="Low", alert_type="daily_reports")
+        await self.send_alert(
+            message=message,
+            level="Low",
+            alert_type="daily_reports",
+            alerting_metadata={},
+        )

         return True

@@ -582,6 +592,7 @@ class SlackAlerting(CustomLogger):
         await asyncio.sleep(
             self.alerting_threshold
         )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
+        alerting_metadata: dict = {}
         if (
             request_data is not None
             and request_data.get("litellm_status", "") != "success"
@@ -606,7 +617,7 @@ class SlackAlerting(CustomLogger):
             ):
                 # In hanging requests sometime it has not made it to the point where the deployment is passed to the `request_data``
                 # in that case we fallback to the api base set in the request metadata
-                _metadata = request_data["metadata"]
+                _metadata: dict = request_data["metadata"]
                 _api_base = _metadata.get("api_base", "")

                 request_info = litellm.utils._add_key_name_and_team_to_alert(
@@ -615,6 +626,9 @@ class SlackAlerting(CustomLogger):

                 if _api_base is None:
                     _api_base = ""
+
+                if "alerting_metadata" in _metadata:
+                    alerting_metadata = _metadata["alerting_metadata"]
                 request_info += f"\nAPI Base: `{_api_base}`"
             # only alert hanging responses if they have not been marked as success
             alerting_message = (
@@ -640,6 +654,7 @@ class SlackAlerting(CustomLogger):
                     message=alerting_message + request_info,
                     level="Medium",
                     alert_type="llm_requests_hanging",
+                    alerting_metadata=alerting_metadata,
                 )

     async def failed_tracking_alert(self, error_message: str):
@@ -650,7 +665,10 @@ class SlackAlerting(CustomLogger):
         result = await _cache.async_get_cache(key=_cache_key)
         if result is None:
             await self.send_alert(
-                message=message, level="High", alert_type="budget_alerts"
+                message=message,
+                level="High",
+                alert_type="budget_alerts",
+                alerting_metadata={},
             )
             await _cache.async_set_cache(
                 key=_cache_key,
@@ -751,6 +769,7 @@ class SlackAlerting(CustomLogger):
                     level="High",
                     alert_type="budget_alerts",
                     user_info=webhook_event,
+                    alerting_metadata={},
                 )
                 await _cache.async_set_cache(
                     key=_cache_key,
@@ -941,7 +960,10 @@ class SlackAlerting(CustomLogger):
                         )
                         # send minor alert
                         await self.send_alert(
-                            message=msg, level="Medium", alert_type="outage_alerts"
+                            message=msg,
+                            level="Medium",
+                            alert_type="outage_alerts",
+                            alerting_metadata={},
                         )
                         # set to true
                         outage_value["minor_alert_sent"] = True
@@ -963,7 +985,12 @@ class SlackAlerting(CustomLogger):
                         )

                         # send minor alert
-                        await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
+                        await self.send_alert(
+                            message=msg,
+                            level="High",
+                            alert_type="outage_alerts",
+                            alerting_metadata={},
+                        )

                         # set to true
                         outage_value["major_alert_sent"] = True
@@ -1062,7 +1089,10 @@ class SlackAlerting(CustomLogger):
                     )
                     # send minor alert
                     await self.send_alert(
-                        message=msg, level="Medium", alert_type="outage_alerts"
+                        message=msg,
+                        level="Medium",
+                        alert_type="outage_alerts",
+                        alerting_metadata={},
                     )
                     # set to true
                     outage_value["minor_alert_sent"] = True
@@ -1081,7 +1111,10 @@ class SlackAlerting(CustomLogger):
                     )
                     # send minor alert
                     await self.send_alert(
-                        message=msg, level="High", alert_type="outage_alerts"
+                        message=msg,
+                        level="High",
+                        alert_type="outage_alerts",
+                        alerting_metadata={},
                     )
                     # set to true
                     outage_value["major_alert_sent"] = True
@@ -1143,7 +1176,10 @@ Model Info:
 """

         alert_val = self.send_alert(
-            message=message, level="Low", alert_type="new_model_added"
+            message=message,
+            level="Low",
+            alert_type="new_model_added",
+            alerting_metadata={},
         )

         if alert_val is not None and asyncio.iscoroutine(alert_val):
@@ -1368,6 +1404,7 @@ Model Info:
         message: str,
         level: Literal["Low", "Medium", "High"],
         alert_type: Literal[AlertType],
+        alerting_metadata: dict,
         user_info: Optional[WebhookEvent] = None,
         **kwargs,
     ):
@@ -1425,6 +1462,9 @@ Model Info:
         if kwargs:
             for key, value in kwargs.items():
                 formatted_message += f"\n\n{key}: `{value}`\n\n"
+        if alerting_metadata:
+            for key, value in alerting_metadata.items():
+                formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"

         if _proxy_base_url is not None:
             formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
@@ -1622,6 +1662,7 @@ Model Info:
                     message=_weekly_spend_message,
                     level="Low",
                     alert_type="spend_reports",
+                    alerting_metadata={},
                 )
             except Exception as e:
                 verbose_proxy_logger.error("Error sending weekly spend report", e)
@@ -1673,6 +1714,7 @@ Model Info:
                     message=_spend_message,
                     level="Low",
                     alert_type="spend_reports",
+                    alerting_metadata={},
                 )
             except Exception as e:
                 verbose_proxy_logger.error("Error sending weekly spend report", e)
diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 5504d7a61..15b8bc93e 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -79,8 +79,8 @@ litellm_settings:
   failure_callback: ["langfuse"]
   cache: true

-# general_settings:
-#   alerting: ["email"]
+general_settings:
+  alerting: ["slack"]
 #   key_management_system: "aws_kms"
 #   key_management_settings:
 #     hosted_keys: ["LITELLM_MASTER_KEY"]
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index ebe30789d..7741e5382 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -455,6 +455,7 @@ class ProxyLogging:
             formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
         extra_kwargs = {}
+        alerting_metadata = {}
         if request_data is not None:
             _url = self.slack_alerting_instance._add_langfuse_trace_id_to_alert(
                 request_data=request_data
             )
@@ -462,7 +463,12 @@ class ProxyLogging:
             if _url is not None:
                 extra_kwargs["🪢 Langfuse Trace"] = _url
                 formatted_message += "\n\n🪢 Langfuse Trace: {}".format(_url)
-
+            if (
+                "metadata" in request_data
+                and request_data["metadata"].get("alerting_metadata", None) is not None
+                and isinstance(request_data["metadata"]["alerting_metadata"], dict)
+            ):
+                alerting_metadata = request_data["metadata"]["alerting_metadata"]
         for client in self.alerting:
             if client == "slack":
                 await self.slack_alerting_instance.send_alert(
@@ -470,6 +476,7 @@ class ProxyLogging:
                     level=level,
                     alert_type=alert_type,
                     user_info=None,
+                    alerting_metadata=alerting_metadata,
                     **extra_kwargs,
                 )
             elif client == "sentry":
@@ -510,7 +517,7 @@ class ProxyLogging:
             )
             if hasattr(self, "service_logging_obj"):
-                self.service_logging_obj.async_service_failure_hook(
+                await self.service_logging_obj.async_service_failure_hook(
                     service=ServiceTypes.DB,
                     duration=duration,
                     error=error_message,
diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py
index 0cbf6cb6d..643b59cc1 100644
--- a/litellm/tests/test_alerting.py
+++ b/litellm/tests/test_alerting.py
@@ -162,6 +162,29 @@ async def test_response_taking_too_long_callback(slack_alerting):
         mock_send_alert.assert_awaited_once()


+@pytest.mark.asyncio
+async def test_alerting_metadata(slack_alerting):
+    """
+    Test alerting_metadata is propagated correctly for response taking too long
+    """
+    start_time = datetime.now()
+    end_time = start_time + timedelta(seconds=301)
+    kwargs = {
+        "model": "test_model",
+        "messages": "test_messages",
+        "litellm_params": {"metadata": {"alerting_metadata": {"hello": "world"}}},
+    }
+    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
+
+        ## RESPONSE TAKING TOO LONG
+        await slack_alerting.response_taking_too_long_callback(
+            kwargs, None, start_time, end_time
+        )
+        mock_send_alert.assert_awaited_once()
+
+        assert "hello" in mock_send_alert.call_args[1]["alerting_metadata"]
+
+
 # Test for budget crossed
 @pytest.mark.asyncio
 async def test_budget_alerts_crossed(slack_alerting):
@@ -207,7 +230,9 @@ async def test_send_alert(slack_alerting):
         slack_alerting.async_http_handler, "post", new=AsyncMock()
     ) as mock_post:
         mock_post.return_value.status_code = 200
-        await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
+        await slack_alerting.send_alert(
+            "Test message", "Low", "budget_alerts", alerting_metadata={}
+        )

         mock_post.assert_awaited_once()

@@ -266,7 +291,7 @@ async def test_daily_reports_completion(slack_alerting):
         await asyncio.sleep(3)
         response_val = await slack_alerting.send_daily_reports(router=router)

-        assert response_val == True
+        assert response_val is True

         mock_send_alert.assert_awaited_once()

@@ -291,7 +316,7 @@ async def test_daily_reports_completion(slack_alerting):
         await asyncio.sleep(3)
         response_val = await slack_alerting.send_daily_reports(router=router)

-        assert response_val == True
+        assert response_val is True

         mock_send_alert.assert_awaited()
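The core behavior change in this diff is that `SlackAlerting.send_alert` now requires an `alerting_metadata: dict` argument and appends its key/value pairs to the Slack message body. Below is a minimal, self-contained sketch of that formatting step, assuming the loop added to `send_alert` above; `format_alert` is a hypothetical helper written only for illustration and is not a litellm API.

```python
# Sketch of the formatting loop this diff adds to SlackAlerting.send_alert.
# `format_alert` is a hypothetical standalone helper, not part of litellm.
def format_alert(message: str, alerting_metadata: dict) -> str:
    formatted_message = message
    if alerting_metadata:
        for key, value in alerting_metadata.items():
            formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"
    return formatted_message


if __name__ == "__main__":
    # With the docs example above ({"hello": "world"} passed via extra_body.metadata),
    # the alert body gains an "*Alerting Metadata*" section containing: hello: `world`
    print(format_alert("`Responses are slow - 301.0s response time > Alerting threshold: 300s`",
                       {"hello": "world"}))
```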