From a0ecc6f414cfda5d748fc3f97d6d426d63057ca8 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Fri, 14 Jun 2024 08:41:12 -0700
Subject: [PATCH 1/4] fix - send alert on router level exceptions

---
 litellm/integrations/slack_alerting.py |  2 +-
 litellm/router.py                      | 11 ++++++
 litellm/router_utils/handle_error.py   | 47 ++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 litellm/router_utils/handle_error.py

diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index 21415fb6d..f37f5070f 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -1453,7 +1453,7 @@ Model Info:
             pass
         else:
             verbose_proxy_logger.debug(
-                "Error sending slack alert. Error=", response.text
+                "Error sending slack alert. Error={}".format(response.text)
             )
 
     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
diff --git a/litellm/router.py b/litellm/router.py
index 4d7a36a38..f07a82d8b 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -66,6 +66,7 @@ from litellm.types.llms.openai import (
 )
 from litellm.scheduler import Scheduler, FlowItem
 from typing import Iterable
+from litellm.router_utils.handle_error import send_llm_exception_alert
 
 
 class Router:
@@ -576,6 +577,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _acompletion(
@@ -4570,6 +4579,8 @@ class Router:
             default_webhook_url=router_alerting_config.webhook_url,
         )
 
+        self.slack_alerting_logger = _slack_alerting_logger
+
         litellm.callbacks.append(_slack_alerting_logger)
         litellm.success_callback.append(
             _slack_alerting_logger.response_taking_too_long_callback
diff --git a/litellm/router_utils/handle_error.py b/litellm/router_utils/handle_error.py
new file mode 100644
index 000000000..e9fb2d390
--- /dev/null
+++ b/litellm/router_utils/handle_error.py
@@ -0,0 +1,47 @@
+import asyncio
+import traceback
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:  # import for type checkers only: router.py itself imports this module
+    from litellm.router import Router as _Router
+
+    LitellmRouter = _Router
+else:
+    LitellmRouter = Any  # runtime placeholder so the annotation below still resolves
+
+
+async def send_llm_exception_alert(
+    litellm_router_instance: LitellmRouter,
+    request_kwargs: dict,
+    error_traceback_str: str,
+    original_exception,
+):
+    """Send a Slack / MS Teams alert for a failed LLM API call.
+
+    Parameters:
+        litellm_router_instance (_Router): Router whose `slack_alerting_logger` sends the alert.
+        request_kwargs (dict): Call kwargs; no alert is sent if "proxy_server_request" is present.
+        error_traceback_str (str): Traceback text; only the first 2000 characters are included.
+        original_exception (Any): The raised error; its `litellm_debug_info`, if set, is appended.
+    Returns:
+        None
+    """
+    if litellm_router_instance.slack_alerting_logger is None:
+        return
+
+    if "proxy_server_request" in request_kwargs:
+        # Skip alerting for requests that came through the litellm proxy server:
+        # the proxy is already instrumented to send LLM API call failures
+        return
+
+    litellm_debug_info = getattr(original_exception, "litellm_debug_info", None)
+    exception_str = str(original_exception)
+    if litellm_debug_info is not None:
+        exception_str += str(litellm_debug_info)  # str(): attribute is not guaranteed to be a string
+    exception_str += f"\n\n{error_traceback_str[:2000]}"
+
+    await litellm_router_instance.slack_alerting_logger.send_alert(
+        message=f"LLM API call failed: `{exception_str}`",
+        level="High",
+        alert_type="llm_exceptions",
+    )

From 63ddc3d0128db7bc0d4d1fd53fa68b48e802a660 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Fri, 14 Jun 2024 08:45:04 -0700
Subject: [PATCH 2/4] test_alerting

---
 litellm/tests/test_alerting.py | 37 ++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py
index 9dfec3dcf..0cbf6cb6d 100644
--- a/litellm/tests/test_alerting.py
+++ b/litellm/tests/test_alerting.py
@@ -25,6 +25,9 @@ import pytest
 from litellm.router import AlertingConfig, Router
 from litellm.proxy._types import CallInfo
 from openai import APIError
+from litellm.router import AlertingConfig
+import litellm
+import os
 
 
 @pytest.mark.parametrize(
@@ -743,3 +746,37 @@ async def test_region_outage_alerting_called(
             mock_send_alert.assert_called_once()
         else:
             mock_send_alert.assert_not_called()
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip(reason="test only needs to run locally ")
+async def test_alerting():
+    """Manual check (no asserts): fail a router call so an alert goes to SLACK_WEBHOOK_URL."""
+    router = litellm.Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": "bad_key",
+                },
+            }
+        ],
+        debug_level="DEBUG",
+        set_verbose=True,
+        alerting_config=AlertingConfig(
+            alerting_threshold=10,  # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
+            # webhook you want to send alerts to
+            webhook_url=os.getenv("SLACK_WEBHOOK_URL"),
+        ),
+    )
+    try:
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+        # presumably "bad_key" makes the call above fail; that failure is what triggers the alert
+    except Exception:
+        pass
+    finally:
+        await asyncio.sleep(3)  # give the router's background alert task time to run

From bd5d1be1f6f53a703e8d4c0911b4a1d87a3cb47d Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Fri, 14 Jun 2024 10:11:24 -0700
Subject: [PATCH 3/4] feat - send llm exception alert on acompletion,
 aembedding etc

---
 litellm/router.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/litellm/router.py b/litellm/router.py
index f07a82d8b..491a34d1f 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1106,6 +1106,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _aimage_generation(self, prompt: str, model: str, **kwargs):
@@ -1230,6 +1238,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _atranscription(self, file: BinaryIO, model: str, **kwargs):
@@ -1396,6 +1412,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def amoderation(self, model: str, input: str, **kwargs):
@@ -1411,6 +1435,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _amoderation(self, model: str, input: str, **kwargs):
@@ -1555,6 +1587,14 @@ class Router:
 
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _atext_completion(self, model: str, prompt: str, **kwargs):
@@ -1750,6 +1790,14 @@ class Router:
             response = await self.async_function_with_fallbacks(**kwargs)
             return response
         except Exception as e:
+            asyncio.create_task(
+                send_llm_exception_alert(
+                    litellm_router_instance=self,
+                    request_kwargs=kwargs,
+                    error_traceback_str=traceback.format_exc(),
+                    original_exception=e,
+                )
+            )
             raise e
 
     async def _aembedding(self, input: Union[str, List], model: str, **kwargs):

From 2d2650a2b64fc348845108170df9169d24ead097 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Fri, 14 Jun 2024 15:17:32 -0700
Subject: [PATCH 4/4] fix use safe access for router alerting

---
 litellm/router_utils/handle_error.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/litellm/router_utils/handle_error.py b/litellm/router_utils/handle_error.py
index e9fb2d390..d848fd82b 100644
--- a/litellm/router_utils/handle_error.py
+++ b/litellm/router_utils/handle_error.py
@@ -26,6 +26,12 @@ async def send_llm_exception_alert(
     Returns:
         None
     """
+    if litellm_router_instance is None:
+        return
+
+    if not hasattr(litellm_router_instance, "slack_alerting_logger"):
+        return
+
     if litellm_router_instance.slack_alerting_logger is None:
         return