feat(slack_alerting.py): support sending daily reports on deployments
allow admins to easily identify slow and failing deployments

Closes https://github.com/BerriAI/litellm/issues/3483
parent 863f9c60a2
commit 718f423d7d

7 changed files with 400 additions and 25 deletions
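At a glance: the router's success/failure callbacks write per-deployment failure counts and latency-per-output-token into the shared DualCache, and send_daily_reports reads them back, formats a top-5 summary, resets the counters, and posts to Slack. A minimal usage sketch under those assumptions (the model list is illustrative, and an actual send requires a SLACK_WEBHOOK_URL to be configured):

```python
# Sketch only: wires together the classes introduced in this diff.
# Assumes SLACK_WEBHOOK_URL is set; model list values are made up.
import asyncio
import litellm
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting


async def main():
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "gpt-3.5-turbo"},
                "model_info": {"id": "1234"},
            }
        ]
    )
    slack_alerting = SlackAlerting(
        internal_usage_cache=DualCache(), alerting=["slack"]
    )
    # success/failure callbacks feed the per-deployment metrics cache
    litellm.callbacks = [slack_alerting]

    # ... traffic flows through the router; metrics accumulate ...

    # flush the accumulated metrics as a Slack report (every 12h by
    # default, per SlackArgs.daily_report_frequency)
    await slack_alerting.send_daily_reports(router=router)


asyncio.run(main())
```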
@@ -106,7 +106,7 @@ class InMemoryCache(BaseCache):
                 return_val.append(val)
         return return_val
 
-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
         # get the value
         init_value = await self.async_get_cache(key=key) or 0
         value = init_value + value
@@ -423,12 +423,12 @@ class RedisCache(BaseCache):
             if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
                 await self.flush_cache_buffer()  # logging done in here
 
-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
         _redis_client = self.init_async_client()
         start_time = time.time()
         try:
             async with _redis_client as redis_client:
-                result = await redis_client.incr(name=key, amount=value)
+                result = await redis_client.incrbyfloat(name=key, amount=value)
                 ## LOGGING ##
                 end_time = time.time()
                 _duration = end_time - start_time
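Why the signature changes from int to float: the daily-report metrics accumulate fractional values (latency per output token), and Redis's INCR rejects non-integer amounts while INCRBYFLOAT accepts them. A small illustration (assumes a local Redis and redis-py's asyncio client; the key name mirrors the metric keys defined further below):

```python
# Illustration only, not part of the diff.
import asyncio
import redis.asyncio as redis  # redis-py's asyncio client


async def demo():
    r = redis.Redis()
    # fractional increments work with INCRBYFLOAT ...
    await r.incrbyfloat("1234:latency_daily_metrics", 0.0042)
    # ... but would fail with INCR, which is integer-only:
    # await r.incr("1234:latency_daily_metrics", 0.0042)  # -> redis error
    await r.aclose()  # redis-py >= 5; use close() on older versions


asyncio.run(demo())
```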
@@ -1382,18 +1382,41 @@ class DualCache(BaseCache):
             print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
             traceback.print_exc()
 
+    async def async_batch_set_cache(
+        self, cache_list: list, local_only: bool = False, **kwargs
+    ):
+        """
+        Batch write values to the cache
+        """
+        print_verbose(
+            f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
+        )
+        try:
+            if self.in_memory_cache is not None:
+                await self.in_memory_cache.async_set_cache_pipeline(
+                    cache_list=cache_list, **kwargs
+                )
+
+            if self.redis_cache is not None and local_only == False:
+                await self.redis_cache.async_set_cache_pipeline(
+                    cache_list=cache_list, ttl=kwargs.get("ttl", None)
+                )
+        except Exception as e:
+            print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
+            traceback.print_exc()
+
     async def async_increment_cache(
-        self, key, value: int, local_only: bool = False, **kwargs
-    ) -> int:
+        self, key, value: float, local_only: bool = False, **kwargs
+    ) -> float:
         """
         Key - the key in cache
 
-        Value - int - the value you want to increment by
+        Value - float - the value you want to increment by
 
-        Returns - int - the incremented value
+        Returns - float - the incremented value
         """
         try:
-            result: int = value
+            result: float = value
             if self.in_memory_cache is not None:
                 result = await self.in_memory_cache.async_increment(
                     key, value, **kwargs
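The new DualCache.async_batch_set_cache takes a list of (key, value) pairs, writes them to the in-memory cache, and mirrors them to Redis via the pipeline writer unless local_only=True. A hedged usage sketch (key names and ttl are illustrative):

```python
# Sketch: batch-resetting metric counters, as send_daily_reports does below.
from litellm.caching import DualCache


async def reset_counters(cache: DualCache, keys: list):
    cache_list = [(key, 0) for key in keys]  # list of (key, value) pairs
    await cache.async_batch_set_cache(cache_list=cache_list, ttl=120)
```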
@@ -2,23 +2,74 @@
 # Class for sending Slack Alerts #
 import dotenv, os
+
 from litellm.proxy._types import UserAPIKeyAuth
+
 dotenv.load_dotenv()  # Loading env variables using dotenv
 import copy
 import traceback
 from litellm._logging import verbose_logger, verbose_proxy_logger
-import litellm
+import litellm, threading
 from typing import List, Literal, Any, Union, Optional, Dict
 from litellm.caching import DualCache
 import asyncio
 import aiohttp
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
 import datetime
+from pydantic import BaseModel
+from enum import Enum
+from datetime import datetime as dt, timedelta
+from litellm.integrations.custom_logger import CustomLogger
 
 
-class SlackAlerting:
+class LiteLLMBase(BaseModel):
+    """
+    Implements default functions, all pydantic objects should have.
+    """
+
+    def json(self, **kwargs):
+        try:
+            return self.model_dump()  # noqa
+        except:
+            # if using pydantic v1
+            return self.dict()
+
+
+class SlackArgs(LiteLLMBase):
+    daily_report_frequency: int = 12 * 60 * 60  # 12 hours
+
+
+class DeploymentMetrics(LiteLLMBase):
+    """
+    Metrics per deployment, stored in cache
+
+    Used for daily reporting
+    """
+
+    id: str
+    """id of deployment in router model list"""
+
+    failed_request: bool
+    """did it fail the request?"""
+
+    latency_per_output_token: Optional[float]
+    """latency/output token of deployment"""
+
+    updated_at: dt
+    """Current time of deployment being updated"""
+
+
+class SlackAlertingCacheKeys(Enum):
+    """
+    Enum for deployment daily metrics keys - {deployment_id}:{enum}
+    """
+
+    failed_requests_key = "failed_requests_daily_metrics"
+    latency_key = "latency_daily_metrics"
+
+
+class SlackAlerting(CustomLogger):
     # Class variables or attributes
     def __init__(
         self,
+        internal_usage_cache: DualCache,
         alerting_threshold: float = 300,
         alerting: Optional[List] = [],
         alert_types: Optional[
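The cache-key scheme is `{deployment_id}:{enum value}`, so each deployment gets one failure counter and one latency accumulator:

```python
# Key construction, exactly as the alerting code formats it below.
from litellm.integrations.slack_alerting import SlackAlertingCacheKeys

deployment_id = "1234"  # illustrative router model id
failed_key = "{}:{}".format(
    deployment_id, SlackAlertingCacheKeys.failed_requests_key.value
)
latency_key = "{}:{}".format(deployment_id, SlackAlertingCacheKeys.latency_key.value)

assert failed_key == "1234:failed_requests_daily_metrics"
assert latency_key == "1234:latency_daily_metrics"
```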
@@ -29,6 +80,7 @@ class SlackAlerting:
                     "llm_requests_hanging",
                     "budget_alerts",
                     "db_exceptions",
+                    "daily_reports",
                 ]
             ]
         ] = [
@@ -37,6 +89,7 @@ class SlackAlerting:
             "llm_requests_hanging",
             "budget_alerts",
             "db_exceptions",
+            "daily_reports",
         ],
         alert_to_webhook_url: Optional[
             Dict
@@ -45,10 +98,10 @@ class SlackAlerting:
         self.alerting_threshold = alerting_threshold
         self.alerting = alerting
         self.alert_types = alert_types
-        self.internal_usage_cache = DualCache()
+        self.internal_usage_cache = internal_usage_cache
         self.async_http_handler = AsyncHTTPHandler()
         self.alert_to_webhook_url = alert_to_webhook_url
-        pass
+        self.is_running = False
 
     def update_values(
         self,
@@ -196,8 +249,178 @@ class SlackAlerting:
                 alert_type="llm_too_slow",
             )
 
-    async def log_failure_event(self, original_exception: Exception):
-        pass
+    async def async_update_daily_reports(
+        self, deployment_metrics: DeploymentMetrics
+    ) -> int:
+        """
+        Store the perf by deployment in cache
+        - Number of failed requests per deployment
+        - Latency / output tokens per deployment
+
+        'deployment_id:daily_metrics:failed_requests'
+        'deployment_id:daily_metrics:latency_per_output_token'
+
+        Returns
+            int - count of metrics set (1 - if just latency, 2 - if failed + latency)
+        """
+
+        return_val = 0
+        try:
+            ## FAILED REQUESTS ##
+            if deployment_metrics.failed_request:
+                await self.internal_usage_cache.async_increment_cache(
+                    key="{}:{}".format(
+                        deployment_metrics.id,
+                        SlackAlertingCacheKeys.failed_requests_key.value,
+                    ),
+                    value=1,
+                )
+
+                return_val += 1
+
+            ## LATENCY ##
+            if deployment_metrics.latency_per_output_token is not None:
+                await self.internal_usage_cache.async_increment_cache(
+                    key="{}:{}".format(
+                        deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
+                    ),
+                    value=deployment_metrics.latency_per_output_token,
+                )
+
+                return_val += 1
+
+            return return_val
+        except Exception as e:
+            return 0
+
+    async def send_daily_reports(self, router: litellm.Router) -> bool:
+        """
+        Send a daily report on:
+        - Top 5 deployments with most failed requests
+        - Top 5 slowest deployments (normalized by latency/output tokens)
+
+        Get the value from redis cache (if available) or in-memory and send it
+
+        Cleanup:
+        - reset values in cache -> prevent memory leak
+
+        Returns:
+            True -> if successfuly sent
+            False -> if not sent
+        """
+
+        ids = router.get_model_ids()
+
+        # get keys
+        failed_request_keys = [
+            "{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
+            for id in ids
+        ]
+        latency_keys = [
+            "{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
+        ]
+
+        combined_metrics_keys = failed_request_keys + latency_keys  # reduce cache calls
+
+        combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
+            keys=combined_metrics_keys
+        )  # [1, 2, None, ..]
+
+        all_none = True
+        for val in combined_metrics_values:
+            if val is not None:
+                all_none = False
+
+        if all_none:
+            return False
+
+        failed_request_values = combined_metrics_values[
+            : len(failed_request_keys)
+        ]  # # [1, 2, None, ..]
+        latency_values = combined_metrics_values[len(failed_request_keys) :]
+
+        # find top 5 failed
+        ## Replace None values with a placeholder value (-1 in this case)
+        placeholder_value = 0
+        replaced_failed_values = [
+            value if value is not None else placeholder_value
+            for value in failed_request_values
+        ]
+
+        ## Get the indices of top 5 keys with the highest numerical values (ignoring None values)
+        top_5_failed = sorted(
+            range(len(replaced_failed_values)),
+            key=lambda i: replaced_failed_values[i],
+            reverse=True,
+        )[:5]
+
+        # find top 5 slowest
+        # Replace None values with a placeholder value (-1 in this case)
+        placeholder_value = -1
+        replaced_slowest_values = [
+            value if value is not None else placeholder_value
+            for value in latency_values
+        ]
+
+        # Get the indices of top 5 values with the highest numerical values (ignoring None values)
+        top_5_slowest = sorted(
+            range(len(replaced_slowest_values)),
+            key=lambda i: replaced_slowest_values[i],
+            reverse=True,
+        )[:5]
+
+        # format alert -> return the litellm model name + api base
+        message = f"\n\nHere are today's key metrics 📈: \n\n"
+
+        message += "\n\n*❗️ Top 5 Deployments with Most Failed Requests:*\n\n"
+        for i in range(len(top_5_failed)):
+            key = failed_request_keys[top_5_failed[i]].split(":")[0]
+            _deployment = router.get_model_info(key)
+            if isinstance(_deployment, dict):
+                deployment_name = _deployment["litellm_params"].get("model", "")
+            else:
+                return False
+
+            api_base = litellm.get_api_base(
+                model=deployment_name,
+                optional_params=(
+                    _deployment["litellm_params"] if _deployment is not None else {}
+                ),
+            )
+            if api_base is None:
+                api_base = ""
+            value = replaced_failed_values[top_5_failed[i]]
+            message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
+
+        message += "\n\n*😅 Top 5 Slowest Deployments:*\n\n"
+        for i in range(len(top_5_slowest)):
+            key = latency_keys[top_5_slowest[i]].split(":")[0]
+            _deployment = router.get_model_info(key)
+            if _deployment is not None:
+                deployment_name = _deployment["litellm_params"].get("model", "")
+            else:
+                deployment_name = ""
+            api_base = litellm.get_api_base(
+                model=deployment_name,
+                optional_params=(
+                    _deployment["litellm_params"] if _deployment is not None else {}
+                ),
+            )
+            value = replaced_slowest_values[top_5_slowest[i]]
+            message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency: `{value}`, API Base: `{api_base}`\n\n"
+
+        # cache cleanup -> reset values to 0
+        latency_cache_keys = [(key, 0) for key in latency_keys]
+        failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
+        combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
+        await self.internal_usage_cache.async_batch_set_cache(
+            cache_list=combined_metrics_cache_keys
+        )
+
+        # send alert
+        await self.send_alert(message=message, level="Low", alert_type="daily_reports")
+
+        return True
 
     async def response_taking_too_long(
         self,
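The top-5 selection above is an argsort: sort the indices by their metric value, descending, and keep the first five; ties keep their original order because Python's sort is stable. Note the placeholder for missing failure counts is 0, while missing latencies use -1 so deployments with no recorded latency sort last. Toy example:

```python
# Worked example of the index-sorting used in send_daily_reports (toy numbers).
replaced_failed_values = [3, 0, 7, 0, 1, 5]

top_5_failed = sorted(
    range(len(replaced_failed_values)),
    key=lambda i: replaced_failed_values[i],
    reverse=True,
)[:5]

assert top_5_failed == [2, 5, 0, 4, 1]  # indices into the key lists, biggest first
```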
@@ -414,6 +637,7 @@ class SlackAlerting:
             "llm_requests_hanging",
             "budget_alerts",
             "db_exceptions",
+            "daily_reports",
         ],
     ):
         """
@@ -439,9 +663,12 @@ class SlackAlerting:
         # Get the current timestamp
         current_time = datetime.now().strftime("%H:%M:%S")
         _proxy_base_url = os.getenv("PROXY_BASE_URL", None)
-        formatted_message = (
-            f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
-        )
+        if alert_type == "daily_reports":
+            formatted_message = message
+        else:
+            formatted_message = (
+                f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
+            )
         if _proxy_base_url is not None:
             formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
 
@@ -468,3 +695,36 @@ class SlackAlerting:
                 pass
             else:
                 print("Error sending slack alert. Error=", response.text)  # noqa
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        """Log deployment latency"""
+        model_id = kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
+        response_ms: timedelta = end_time - start_time
+
+        final_value = response_ms
+        total_tokens = 0
+
+        if isinstance(response_obj, litellm.ModelResponse):
+            completion_tokens = response_obj.usage.completion_tokens
+            final_value = float(response_ms.total_seconds() / completion_tokens)
+
+        await self.async_update_daily_reports(
+            DeploymentMetrics(
+                id=model_id,
+                failed_request=False,
+                latency_per_output_token=final_value,
+                updated_at=litellm.utils.get_utc_datetime(),
+            )
+        )
+
+    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
+        """Log failure + deployment latency"""
+        model_id = kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
+        await self.async_update_daily_reports(
+            DeploymentMetrics(
+                id=model_id,
+                failed_request=True,
+                latency_per_output_token=None,
+                updated_at=litellm.utils.get_utc_datetime(),
+            )
+        )
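The latency metric recorded by async_log_success_event is seconds per output token: total response time divided by completion tokens. A worked example with made-up numbers (note the division assumes completion_tokens > 0; a zero-token response would raise ZeroDivisionError inside the callback):

```python
# Worked example of the normalization in async_log_success_event above.
from datetime import timedelta

response_ms = timedelta(seconds=2.4)  # end_time - start_time
completion_tokens = 120

final_value = float(response_ms.total_seconds() / completion_tokens)
assert abs(final_value - 0.02) < 1e-12  # 2.4 s / 120 tokens = 0.02 s/token
```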
@@ -19,4 +19,4 @@ litellm_settings:
 
 general_settings:
   alerting: ["slack"]
-  alert_types: ["llm_exceptions"]
+  alert_types: ["llm_exceptions", "daily_reports"]
@@ -73,6 +73,7 @@ class ProxyLogging:
                 "llm_requests_hanging",
                 "budget_alerts",
                 "db_exceptions",
+                "daily_reports",
             ]
         ] = [
             "llm_exceptions",
@@ -80,11 +81,13 @@ class ProxyLogging:
             "llm_requests_hanging",
             "budget_alerts",
             "db_exceptions",
+            "daily_reports",
         ]
         self.slack_alerting_instance = SlackAlerting(
             alerting_threshold=self.alerting_threshold,
             alerting=self.alerting,
             alert_types=self.alert_types,
+            internal_usage_cache=self.internal_usage_cache,
         )
 
     def update_values(
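The key wiring change here: ProxyLogging hands its own internal_usage_cache to SlackAlerting instead of letting the alerter create a private DualCache (the old behavior removed in SlackAlerting.__init__ above), so the router callbacks and the report sender read and write the same store. A sketch of that invariant (the constructor argument is an assumption, not shown in this diff):

```python
# Hypothetical check of the shared-cache invariant after this change.
from litellm.caching import DualCache
from litellm.proxy.utils import ProxyLogging

proxy_logging = ProxyLogging(user_api_key_cache=DualCache())  # assumed signature
assert (
    proxy_logging.slack_alerting_instance.internal_usage_cache
    is proxy_logging.internal_usage_cache
)
```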
@@ -100,6 +103,7 @@ class ProxyLogging:
                     "llm_requests_hanging",
                     "budget_alerts",
                     "db_exceptions",
+                    "daily_reports",
                 ]
             ]
         ] = None,
@@ -2597,7 +2597,10 @@ class Router:
                 return model
         return None
 
-    def get_model_ids(self):
+    def get_model_ids(self) -> List[str]:
+        """
+        Returns list of model id's.
+        """
         ids = []
         for model in self.model_list:
             if "model_info" in model and "id" in model["model_info"]:
@@ -2605,7 +2608,7 @@ class Router:
                 ids.append(id)
         return ids
 
-    def get_model_names(self):
+    def get_model_names(self) -> List[str]:
         return self.model_names
 
     def get_model_list(self):
@@ -17,7 +17,7 @@ import asyncio
 from unittest.mock import patch, MagicMock
 from litellm.utils import get_api_base
 from litellm.caching import DualCache
-from litellm.integrations.slack_alerting import SlackAlerting
+from litellm.integrations.slack_alerting import SlackAlerting, DeploymentMetrics
 
 
 @pytest.mark.parametrize(
@@ -116,7 +116,7 @@ from datetime import datetime, timedelta
 
 @pytest.fixture
 def slack_alerting():
-    return SlackAlerting(alerting_threshold=1)
+    return SlackAlerting(alerting_threshold=1, internal_usage_cache=DualCache())
 
 
 # Test for hanging LLM responses
@@ -185,3 +185,88 @@ async def test_send_alert(slack_alerting):
         mock_post.return_value.status_code = 200
         await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
         mock_post.assert_awaited_once()
+
+
+@pytest.mark.asyncio
+async def test_daily_reports_unit_test(slack_alerting):
+    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
+        router = litellm.Router(
+            model_list=[
+                {
+                    "model_name": "test-gpt",
+                    "litellm_params": {"model": "gpt-3.5-turbo"},
+                    "model_info": {"id": "1234"},
+                }
+            ]
+        )
+        deployment_metrics = DeploymentMetrics(
+            id="1234",
+            failed_request=False,
+            latency_per_output_token=20.3,
+            updated_at=litellm.utils.get_utc_datetime(),
+        )
+
+        updated_val = await slack_alerting.async_update_daily_reports(
+            deployment_metrics=deployment_metrics
+        )
+
+        assert updated_val == 1
+
+        await slack_alerting.send_daily_reports(router=router)
+
+        mock_send_alert.assert_awaited_once()
+
+
+@pytest.mark.asyncio
+async def test_daily_reports_completion(slack_alerting):
+    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
+        litellm.callbacks = [slack_alerting]
+
+        # on async success
+        router = litellm.Router(
+            model_list=[
+                {
+                    "model_name": "gpt-5",
+                    "litellm_params": {
+                        "model": "gpt-3.5-turbo",
+                    },
+                }
+            ]
+        )
+
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+        await asyncio.sleep(3)
+        response_val = await slack_alerting.send_daily_reports(router=router)
+
+        assert response_val == True
+
+        mock_send_alert.assert_awaited_once()
+
+        # on async failure
+        router = litellm.Router(
+            model_list=[
+                {
+                    "model_name": "gpt-5",
+                    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad_key"},
+                }
+            ]
+        )
+
+        try:
+            await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            )
+        except Exception as e:
+            pass
+
+        await asyncio.sleep(3)
+        response_val = await slack_alerting.send_daily_reports(router=router)
+
+        assert response_val == True
+
+        mock_send_alert.assert_awaited()
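The new tests cover both layers: test_daily_reports_unit_test drives async_update_daily_reports and send_daily_reports directly against a stub router, while test_daily_reports_completion registers the alerter as a litellm callback and exercises real success and failure completions (the `bad_key` router). The `await asyncio.sleep(3)` calls give the async success/failure callbacks time to write metrics into the cache before the report is sent. They should run the usual way, e.g. `pytest -s -k daily_reports` against this test file (presumably litellm/tests/test_alerting.py, judging by the fixture).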
@@ -5825,7 +5825,7 @@ def get_api_base(model: str, optional_params: dict) -> Optional[str]:
 
     Parameters:
     - model: str - the model passed to litellm.completion()
-    - optional_params - the additional params passed to litellm.completion - eg. api_base, api_key, etc. See `LiteLLM_Params` - https://github.com/BerriAI/litellm/blob/f09e6ba98d65e035a79f73bc069145002ceafd36/litellm/router.py#L67
+    - optional_params - the 'litellm_params' in router.completion *OR* additional params passed to litellm.completion - eg. api_base, api_key, etc. See `LiteLLM_Params` - https://github.com/BerriAI/litellm/blob/f09e6ba98d65e035a79f73bc069145002ceafd36/litellm/router.py#L67
 
     Returns:
     - string (api_base) or None
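The docstring fix reflects how the daily report calls this helper: it now passes a deployment's litellm_params rather than user-supplied completion kwargs. A hedged example (model and URL are illustrative):

```python
# Sketch of get_api_base with router-style litellm_params.
import litellm

api_base = litellm.get_api_base(
    model="azure/chatgpt-v-2",
    optional_params={"api_base": "https://my-endpoint.openai.azure.com/"},
)
assert api_base == "https://my-endpoint.openai.azure.com/"
```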