(Feat) - Add PagerDuty Alerting Integration (#7478)

* define basic types

* fix verbose_logger.exception statement

* fix basic alerting

* test pager duty alerting

* test_pagerduty_alerting_high_failure_rate

* PagerDutyAlerting

* async_log_failure_event

* use pre_call_hook

* add _request_is_completed helper util

* update AlertingConfig

* rename PagerDutyInternalEvent

* _send_alert_if_thresholds_crossed

* use pagerduty as _custom_logger_compatible_callbacks_literal

* fix slack alerting imports

* fix imports in slack alerting

* PagerDutyAlerting

* fix _load_alerting_settings

* test_pagerduty_hanging_request_alerting

* working pager duty alerting

* fix linting

* doc pager duty alerting

* update hanging_response_handler

* fix import location

* update failure_threshold

* update async_pre_call_hook

* docs pagerduty

* test - callback_class_str_to_classType

* fix linting errors

* fix linting + testing error

* PagerDutyAlerting

* test_pagerduty_hanging_request_alerting

* fix unused imports

* docs pager duty

* @pytest.mark.flaky(retries=6, delay=2)

* test_model_info_bedrock_converse_enforcement
This commit is contained in:
Ishaan Jaff 2025-01-01 07:12:51 -08:00 committed by GitHub
parent 9af6ba0a02
commit a39cac313c
15 changed files with 691 additions and 28 deletions

View file

@ -452,6 +452,7 @@ router_settings:
| OTEL_HEADERS | Headers for OpenTelemetry requests | OTEL_HEADERS | Headers for OpenTelemetry requests
| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry | OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing | OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
| PAGERDUTY_API_KEY | API key for PagerDuty Alerting
| POD_NAME | Pod name for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) as `POD_NAME` | POD_NAME | Pod name for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) as `POD_NAME`
| PREDIBASE_API_BASE | Base URL for Predibase API | PREDIBASE_API_BASE | Base URL for Predibase API
| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service | PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service

View file

@ -0,0 +1,106 @@
import Image from '@theme/IdealImage';
# PagerDuty Alerting
:::info
✨ PagerDuty Alerting is a LiteLLM Enterprise feature
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Get free 7-day trial key](https://www.litellm.ai/#trial)
:::
Handles two types of alerts:
- High LLM API Failure Rate. Configure X failures in Y seconds to trigger an alert.
- High Number of Hanging LLM Requests. Configure X hanging requests in Y seconds to trigger an alert.
## Quick Start
1. Set `PAGERDUTY_API_KEY="d8bxxxxx"` in your environment variables.
```
PAGERDUTY_API_KEY="d8bxxxxx"
```
2. Set PagerDuty Alerting in your config file.
```yaml
model_list:
- model_name: "openai/*"
litellm_params:
model: "openai/*"
api_key: os.environ/OPENAI_API_KEY
general_settings:
alerting: ["pagerduty"]
alerting_args:
failure_threshold: 1 # Number of requests failing in a window
failure_threshold_window_seconds: 10 # Window in seconds
# Requests hanging threshold
hanging_threshold_seconds: 0.0000001 # Number of seconds of waiting for a response before a request is considered hanging
hanging_threshold_window_seconds: 10 # Window in seconds
```
3. Test it
Start LiteLLM Proxy
```shell
litellm --config config.yaml
```
### LLM API Failure Alert
Try sending a bad request to proxy
```shell
curl -i --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-4o",
"user": "hi",
"messages": [
{
"role": "user",
"bad_param": "i like coffee"
}
]
}
'
```
<Image img={require('../../img/pagerduty_fail.png')} />
### LLM Hanging Alert
Try sending a hanging request to proxy
Since our hanging threshold is 0.0000001 seconds, you should see an alert.
```shell
curl -i --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-4o",
"user": "hi",
"messages": [
{
"role": "user",
"content": "i like coffee"
}
]
}
'
```
<Image img={require('../../img/pagerduty_hanging.png')} />

Binary file not shown.

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 219 KiB

View file

@ -118,7 +118,13 @@ const sidebars = {
{ {
type: "category", type: "category",
label: "Logging, Alerting, Metrics", label: "Logging, Alerting, Metrics",
items: ["proxy/logging", "proxy/logging_spec", "proxy/team_logging","proxy/alerting", "proxy/prometheus"], items: [
"proxy/logging",
"proxy/logging_spec",
"proxy/team_logging",
"proxy/prometheus",
"proxy/alerting",
"proxy/pagerduty"],
}, },
{ {
type: "category", type: "category",

View file

@ -74,6 +74,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"argilla", "argilla",
"mlflow", "mlflow",
"langfuse", "langfuse",
"pagerduty",
"humanloop", "humanloop",
] ]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None

View file

@ -6,7 +6,7 @@ import os
import random import random
import time import time
from datetime import timedelta from datetime import timedelta
from typing import Any, Dict, List, Literal, Optional, Union from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
from openai import APIError from openai import APIError
@ -25,13 +25,19 @@ from litellm.llms.custom_httpx.http_handler import (
httpxSpecialProvider, httpxSpecialProvider,
) )
from litellm.proxy._types import AlertType, CallInfo, VirtualKeyEvent, WebhookEvent from litellm.proxy._types import AlertType, CallInfo, VirtualKeyEvent, WebhookEvent
from litellm.router import Router
from litellm.types.integrations.slack_alerting import * from litellm.types.integrations.slack_alerting import *
from ..email_templates.templates import * from ..email_templates.templates import *
from .batching_handler import send_to_webhook, squash_payloads from .batching_handler import send_to_webhook, squash_payloads
from .utils import _add_langfuse_trace_id_to_alert, process_slack_alerting_variables from .utils import _add_langfuse_trace_id_to_alert, process_slack_alerting_variables
if TYPE_CHECKING:
from litellm.router import Router as _Router
Router = _Router
else:
Router = Any
class SlackAlerting(CustomBatchLogger): class SlackAlerting(CustomBatchLogger):
""" """
@ -465,18 +471,10 @@ class SlackAlerting(CustomBatchLogger):
self.alerting_threshold self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests ) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
alerting_metadata: dict = {} alerting_metadata: dict = {}
if ( if await self._request_is_completed(request_data=request_data) is True:
request_data is not None return
and request_data.get("litellm_status", "") != "success"
and request_data.get("litellm_status", "") != "fail" if request_data is not None:
):
## CHECK IF CACHE IS UPDATED
litellm_call_id = request_data.get("litellm_call_id", "")
status: Optional[str] = await self.internal_usage_cache.async_get_cache(
key="request_status:{}".format(litellm_call_id), local_only=True
)
if status is not None and (status == "success" or status == "fail"):
return
if request_data.get("deployment", None) is not None and isinstance( if request_data.get("deployment", None) is not None and isinstance(
request_data["deployment"], dict request_data["deployment"], dict
): ):
@ -1753,3 +1751,23 @@ Model Info:
) )
return return
async def _request_is_completed(self, request_data: Optional[dict]) -> bool:
    """
    Returns True if the request is completed - either as a success or failure

    Looks up the internal usage cache for a terminal "request_status" entry
    keyed by the request's litellm_call_id.

    Args:
        request_data: the proxy request payload; may be None.

    Returns:
        True when the cache holds a terminal status ("success"/"fail") for
        this call id; False otherwise.
    """
    if request_data is None:
        return False
    # NOTE(review): when request_data["litellm_status"] is *already*
    # "success"/"fail", the cache check is skipped and this returns False —
    # i.e. only the cached status marks the request as completed. Confirm
    # this is intended.
    if (
        request_data.get("litellm_status", "") != "success"
        and request_data.get("litellm_status", "") != "fail"
    ):
        ## CHECK IF CACHE IS UPDATED
        litellm_call_id = request_data.get("litellm_call_id", "")
        status: Optional[str] = await self.internal_usage_cache.async_get_cache(
            key="request_status:{}".format(litellm_call_id), local_only=True
        )
        if status is not None and (status == "success" or status == "fail"):
            return True
    return False

View file

@ -3,12 +3,18 @@ Utils used for slack alerting
""" """
import asyncio import asyncio
from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm.litellm_core_utils.litellm_logging import Logging
from litellm.proxy._types import AlertType from litellm.proxy._types import AlertType
from litellm.secret_managers.main import get_secret from litellm.secret_managers.main import get_secret
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _Logging
Logging = _Logging
else:
Logging = Any
def process_slack_alerting_variables( def process_slack_alerting_variables(
alert_to_webhook_url: Optional[Dict[AlertType, Union[List[str], str]]] alert_to_webhook_url: Optional[Dict[AlertType, Union[List[str], str]]]

View file

@ -0,0 +1,303 @@
"""
PagerDuty Alerting Integration
Handles two types of alerts:
- High LLM API Failure Rate. Configure X fails in Y seconds to trigger an alert.
- High Number of Hanging LLM Requests. Configure X hangs in Y seconds to trigger an alert.
"""
import asyncio
import os
from datetime import datetime, timedelta, timezone
from typing import List, Literal, Optional, Union
from litellm._logging import verbose_logger
from litellm.caching import DualCache
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.integrations.pagerduty import (
AlertingConfig,
PagerDutyInternalEvent,
PagerDutyPayload,
PagerDutyRequestBody,
)
from litellm.types.utils import (
StandardLoggingPayload,
StandardLoggingPayloadErrorInformation,
)
# Default alerting thresholds; each can be overridden via the
# `alerting_args` passed to PagerDutyAlerting.
PAGERDUTY_DEFAULT_FAILURE_THRESHOLD = 60  # failures per window before alerting
PAGERDUTY_DEFAULT_FAILURE_THRESHOLD_WINDOW_SECONDS = 60  # failure window size (s)
PAGERDUTY_DEFAULT_HANGING_THRESHOLD_SECONDS = 60  # wait before a request counts as hanging
PAGERDUTY_DEFAULT_HANGING_THRESHOLD_WINDOW_SECONDS = 600  # hanging window size (s)
class PagerDutyAlerting(SlackAlerting):
    """
    Tracks failed requests and hanging requests separately.
    If threshold is crossed for either type, triggers a PagerDuty alert.

    Enterprise-only: __init__ raises for non-premium users.
    """

    def __init__(
        self, alerting_args: Optional[Union[AlertingConfig, dict]] = None, **kwargs
    ):
        """
        Args:
            alerting_args: optional threshold overrides; any missing key falls
                back to the PAGERDUTY_DEFAULT_* module constants.

        Raises:
            ValueError: if PAGERDUTY_API_KEY is unset, or the user is not a
                LiteLLM Enterprise (premium) user.
        """
        # local import avoids a circular import with the proxy server
        from litellm.proxy.proxy_server import CommonProxyErrors, premium_user

        super().__init__()
        _api_key = os.getenv("PAGERDUTY_API_KEY")
        if not _api_key:
            raise ValueError("PAGERDUTY_API_KEY is not set")

        self.api_key: str = _api_key
        alerting_args = alerting_args or {}
        self.alerting_args: AlertingConfig = AlertingConfig(
            failure_threshold=alerting_args.get(
                "failure_threshold", PAGERDUTY_DEFAULT_FAILURE_THRESHOLD
            ),
            failure_threshold_window_seconds=alerting_args.get(
                "failure_threshold_window_seconds",
                PAGERDUTY_DEFAULT_FAILURE_THRESHOLD_WINDOW_SECONDS,
            ),
            hanging_threshold_seconds=alerting_args.get(
                "hanging_threshold_seconds", PAGERDUTY_DEFAULT_HANGING_THRESHOLD_SECONDS
            ),
            hanging_threshold_window_seconds=alerting_args.get(
                "hanging_threshold_window_seconds",
                PAGERDUTY_DEFAULT_HANGING_THRESHOLD_WINDOW_SECONDS,
            ),
        )

        # Separate storage for failures vs. hangs
        self._failure_events: List[PagerDutyInternalEvent] = []
        self._hanging_events: List[PagerDutyInternalEvent] = []

        # premium user check
        if premium_user is not True:
            raise ValueError(
                f"PagerDutyAlerting is only available for LiteLLM Enterprise users. {CommonProxyErrors.not_premium_user.value}"
            )

    # ------------------ MAIN LOGIC ------------------ #

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """
        Record a failure event. Only send an alert to PagerDuty if the
        configured *failure* threshold is exceeded in the specified window.

        Raises:
            ValueError: if kwargs carries no "standard_logging_object".
        """
        now = datetime.now(timezone.utc)
        standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
            "standard_logging_object"
        )
        if not standard_logging_payload:
            raise ValueError(
                "standard_logging_object is required for PagerDutyAlerting"
            )

        # Extract error details
        error_info: Optional[StandardLoggingPayloadErrorInformation] = (
            standard_logging_payload.get("error_information") or {}
        )
        _meta = standard_logging_payload.get("metadata") or {}

        self._failure_events.append(
            PagerDutyInternalEvent(
                failure_event_type="failed_response",
                timestamp=now,
                error_class=error_info.get("error_class"),
                error_code=error_info.get("error_code"),
                error_llm_provider=error_info.get("llm_provider"),
                user_api_key_hash=_meta.get("user_api_key_hash"),
                user_api_key_alias=_meta.get("user_api_key_alias"),
                user_api_key_org_id=_meta.get("user_api_key_org_id"),
                user_api_key_team_id=_meta.get("user_api_key_team_id"),
                user_api_key_user_id=_meta.get("user_api_key_user_id"),
                user_api_key_team_alias=_meta.get("user_api_key_team_alias"),
                user_api_key_end_user_id=_meta.get("user_api_key_end_user_id"),
            )
        )

        # Prune + Possibly alert
        window_seconds = self.alerting_args.get("failure_threshold_window_seconds", 60)
        threshold = self.alerting_args.get("failure_threshold", 1)

        # If threshold is crossed, send PD alert for failures
        await self._send_alert_if_thresholds_crossed(
            events=self._failure_events,
            window_seconds=window_seconds,
            threshold=threshold,
            alert_prefix="High LLM API Failure Rate",
        )

    async def async_pre_call_hook(
        self,
        user_api_key_dict: UserAPIKeyAuth,
        cache: DualCache,
        data: dict,
        call_type: Literal[
            "completion",
            "text_completion",
            "embeddings",
            "image_generation",
            "moderation",
            "audio_transcription",
            "pass_through_endpoint",
            "rerank",
        ],
    ) -> Optional[Union[Exception, str, dict]]:
        """
        Example of detecting hanging requests by waiting a given threshold.
        If the request didn't finish by then, we treat it as 'hanging'.

        Schedules hanging_response_handler as a background task and returns
        immediately without modifying the request.
        """
        verbose_logger.info("Inside Proxy Logging Pre-call hook!")
        asyncio.create_task(
            self.hanging_response_handler(
                request_data=data, user_api_key_dict=user_api_key_dict
            )
        )
        return None

    async def hanging_response_handler(
        self, request_data: Optional[dict], user_api_key_dict: UserAPIKeyAuth
    ):
        """
        Checks if request completed by the time 'hanging_threshold_seconds' elapses.
        If not, we classify it as a hanging request.
        """
        verbose_logger.debug(
            f"Inside Hanging Response Handler!..sleeping for {self.alerting_args.get('hanging_threshold_seconds', PAGERDUTY_DEFAULT_HANGING_THRESHOLD_SECONDS)} seconds"
        )
        await asyncio.sleep(
            self.alerting_args.get(
                "hanging_threshold_seconds", PAGERDUTY_DEFAULT_HANGING_THRESHOLD_SECONDS
            )
        )

        if await self._request_is_completed(request_data=request_data):
            return  # It's not hanging if completed

        # Otherwise, record it as hanging
        self._hanging_events.append(
            PagerDutyInternalEvent(
                failure_event_type="hanging_response",
                timestamp=datetime.now(timezone.utc),
                error_class="HangingRequest",
                error_code="HangingRequest",
                error_llm_provider="HangingRequest",
                user_api_key_hash=user_api_key_dict.api_key,
                user_api_key_alias=user_api_key_dict.key_alias,
                user_api_key_org_id=user_api_key_dict.org_id,
                user_api_key_team_id=user_api_key_dict.team_id,
                user_api_key_user_id=user_api_key_dict.user_id,
                user_api_key_team_alias=user_api_key_dict.team_alias,
                user_api_key_end_user_id=user_api_key_dict.end_user_id,
            )
        )

        # Prune + Possibly alert
        window_seconds = self.alerting_args.get(
            "hanging_threshold_window_seconds",
            PAGERDUTY_DEFAULT_HANGING_THRESHOLD_WINDOW_SECONDS,
        )
        # NOTE(review): "hanging_threshold_fails" is never populated in
        # __init__, so this always falls back to the default below — which
        # reuses the *seconds* constant (60) as a request count. Confirm the
        # intended default hang-count.
        threshold: int = self.alerting_args.get(
            "hanging_threshold_fails", PAGERDUTY_DEFAULT_HANGING_THRESHOLD_SECONDS
        )

        # If threshold is crossed, send PD alert for hangs
        await self._send_alert_if_thresholds_crossed(
            events=self._hanging_events,
            window_seconds=window_seconds,
            threshold=threshold,
            alert_prefix="High Number of Hanging LLM Requests",
        )

    # ------------------ HELPERS ------------------ #

    async def _send_alert_if_thresholds_crossed(
        self,
        events: List[PagerDutyInternalEvent],
        window_seconds: int,
        threshold: int,
        alert_prefix: str,
    ):
        """
        1. Prune old events
        2. If threshold is reached, build alert, send to PagerDuty
        3. Clear those events
        """
        cutoff = datetime.now(timezone.utc) - timedelta(seconds=window_seconds)
        # Use an aware minimum as the fallback: a naive datetime.min compared
        # against the aware `cutoff` would raise TypeError.
        _aware_min = datetime.min.replace(tzinfo=timezone.utc)
        pruned = [e for e in events if e.get("timestamp", _aware_min) > cutoff]

        # Update the reference list in place so callers' state stays current
        events.clear()
        events.extend(pruned)

        # Check threshold
        verbose_logger.debug(
            f"Have {len(events)} events in the last {window_seconds} seconds. Threshold is {threshold}"
        )
        if len(events) >= threshold:
            # Build short summary of last N events
            error_summaries = self._build_error_summaries(events, max_errors=5)
            alert_message = (
                f"{alert_prefix}: {len(events)} in the last {window_seconds} seconds."
            )
            custom_details = {"recent_errors": error_summaries}

            await self.send_alert_to_pagerduty(
                alert_message=alert_message,
                custom_details=custom_details,
            )

            # Clear them after sending an alert, so we don't spam
            events.clear()

    def _build_error_summaries(
        self, events: List[PagerDutyInternalEvent], max_errors: int = 5
    ) -> List[PagerDutyInternalEvent]:
        """
        Build short summaries for the last `max_errors` events with the
        `timestamp` field stripped (not useful in the PagerDuty
        custom_details payload).

        Returns copies — the stored event dicts are left untouched. The
        previous implementation used `fe.pop("timestamp")`, which mutated
        the shared event dicts and raised KeyError when the key was absent.
        """
        summaries: List[PagerDutyInternalEvent] = []
        for fe in events[-max_errors:]:
            summaries.append(
                {k: v for k, v in fe.items() if k != "timestamp"}  # type: ignore[misc]
            )
        return summaries

    async def send_alert_to_pagerduty(self, alert_message: str, custom_details: dict):
        """
        Send [critical] Alert to PagerDuty

        https://developer.pagerduty.com/api-reference/YXBpOjI3NDgyNjU-pager-duty-v2-events-api

        Best-effort: any exception is logged, never raised to the caller.
        """
        try:
            verbose_logger.debug(f"Sending alert to PagerDuty: {alert_message}")
            async_client: AsyncHTTPHandler = get_async_httpx_client(
                llm_provider=httpxSpecialProvider.LoggingCallback
            )
            payload: PagerDutyRequestBody = PagerDutyRequestBody(
                payload=PagerDutyPayload(
                    summary=alert_message,
                    severity="critical",
                    source="LiteLLM Alert",
                    component="LiteLLM",
                    custom_details=custom_details,
                ),
                routing_key=self.api_key,
                event_action="trigger",
            )

            return await async_client.post(
                url="https://events.pagerduty.com/v2/enqueue",
                json=dict(payload),
                headers={"Content-Type": "application/json"},
            )
        except Exception as e:
            verbose_logger.exception(f"Error sending alert to PagerDuty: {e}")

View file

@ -30,6 +30,7 @@ from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.redact_messages import ( from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger, redact_message_input_output_from_custom_logger,
redact_message_input_output_from_logging, redact_message_input_output_from_logging,
@ -1992,7 +1993,7 @@ class Logging(LiteLLMLoggingBaseClass):
) )
except Exception as e: except Exception as e:
verbose_logger.exception( verbose_logger.exception(
"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success \ "LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure \
logging {}\nCallback={}".format( logging {}\nCallback={}".format(
str(e), callback str(e), callback
) )
@ -2163,7 +2164,12 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
llm_router: Optional[ llm_router: Optional[
Any Any
], # expect litellm.Router, but typing errors due to circular import ], # expect litellm.Router, but typing errors due to circular import
custom_logger_init_args: Optional[dict] = {},
) -> Optional[CustomLogger]: ) -> Optional[CustomLogger]:
"""
Initialize a custom logger compatible class
"""
custom_logger_init_args = custom_logger_init_args or {}
if logging_integration == "lago": if logging_integration == "lago":
for callback in _in_memory_loggers: for callback in _in_memory_loggers:
if isinstance(callback, LagoLogger): if isinstance(callback, LagoLogger):
@ -2386,6 +2392,13 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
langfuse_logger = LangfusePromptManagement() langfuse_logger = LangfusePromptManagement()
_in_memory_loggers.append(langfuse_logger) _in_memory_loggers.append(langfuse_logger)
return langfuse_logger # type: ignore return langfuse_logger # type: ignore
elif logging_integration == "pagerduty":
for callback in _in_memory_loggers:
if isinstance(callback, PagerDutyAlerting):
return callback
pagerduty_logger = PagerDutyAlerting(**custom_logger_init_args)
_in_memory_loggers.append(pagerduty_logger)
return pagerduty_logger # type: ignore
elif logging_integration == "humanloop": elif logging_integration == "humanloop":
for callback in _in_memory_loggers: for callback in _in_memory_loggers:
if isinstance(callback, HumanloopLogger): if isinstance(callback, HumanloopLogger):
@ -2509,6 +2522,10 @@ def get_custom_logger_compatible_class( # noqa: PLR0915
for callback in _in_memory_loggers: for callback in _in_memory_loggers:
if isinstance(callback, MlflowLogger): if isinstance(callback, MlflowLogger):
return callback return callback
elif logging_integration == "pagerduty":
for callback in _in_memory_loggers:
if isinstance(callback, PagerDutyAlerting):
return callback
return None return None

View file

@ -11,9 +11,21 @@ litellm_settings:
callbacks: ["datadog"] callbacks: ["datadog"]
general_settings:
alerting: ["pagerduty"]
alerting_args:
failure_threshold: 4 # Number of requests failing in a window
failure_threshold_window_seconds: 10 # Window in seconds
# Requests hanging threshold
hanging_threshold_seconds: 0.0000001 # Number of seconds of waiting for a response before a request is considered hanging
hanging_threshold_window_seconds: 10 # Window in seconds
# For /fine_tuning/jobs endpoints # For /fine_tuning/jobs endpoints
finetune_settings: finetune_settings:
- custom_llm_provider: "vertex_ai" - custom_llm_provider: "vertex_ai"
vertex_project: "adroit-crow-413218" vertex_project: "adroit-crow-413218"
vertex_location: "us-central1" vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"

View file

@ -1939,15 +1939,7 @@ class ProxyConfig:
use_azure_key_vault = general_settings.get("use_azure_key_vault", False) use_azure_key_vault = general_settings.get("use_azure_key_vault", False)
load_from_azure_key_vault(use_azure_key_vault=use_azure_key_vault) load_from_azure_key_vault(use_azure_key_vault=use_azure_key_vault)
### ALERTING ### ### ALERTING ###
self._load_alerting_settings(general_settings=general_settings)
proxy_logging_obj.update_values(
alerting=general_settings.get("alerting", None),
alerting_threshold=general_settings.get("alerting_threshold", 600),
alert_types=general_settings.get("alert_types", None),
alert_to_webhook_url=general_settings.get("alert_to_webhook_url", None),
alerting_args=general_settings.get("alerting_args", None),
redis_cache=redis_usage_cache,
)
### CONNECT TO DATABASE ### ### CONNECT TO DATABASE ###
database_url = general_settings.get("database_url", None) database_url = general_settings.get("database_url", None)
if database_url and database_url.startswith("os.environ/"): if database_url and database_url.startswith("os.environ/"):
@ -2135,6 +2127,46 @@ class ProxyConfig:
) )
return router, router.get_model_list(), general_settings return router, router.get_model_list(), general_settings
def _load_alerting_settings(self, general_settings: dict):
    """
    Initialize alerting callbacks declared in general_settings["alerting"].

    "slack" keeps the legacy (v0) path via proxy_logging_obj.update_values;
    any other known custom-logger-compatible callback (e.g. "pagerduty") is
    initialized as a custom logger and appended to litellm.callbacks.
    """
    from litellm.litellm_core_utils.litellm_logging import (
        _init_custom_logger_compatible_class,
    )

    _alerting_callbacks = general_settings.get("alerting", None)
    # log the resolved callback list, not the entire general_settings dict
    verbose_proxy_logger.debug(f"_alerting_callbacks: {_alerting_callbacks}")
    if _alerting_callbacks is None:
        return
    for _alert in _alerting_callbacks:
        if _alert == "slack":
            # [OLD] v0 implementation
            proxy_logging_obj.update_values(
                alerting=general_settings.get("alerting", None),
                alerting_threshold=general_settings.get("alerting_threshold", 600),
                alert_types=general_settings.get("alert_types", None),
                alert_to_webhook_url=general_settings.get(
                    "alert_to_webhook_url", None
                ),
                alerting_args=general_settings.get("alerting_args", None),
                redis_cache=redis_usage_cache,
            )
        elif _alert in litellm._known_custom_logger_compatible_callbacks:
            # [NEW] v1 implementation - init as a custom logger
            _logger = _init_custom_logger_compatible_class(
                logging_integration=_alert,
                internal_usage_cache=None,
                llm_router=None,
                custom_logger_init_args={
                    "alerting_args": general_settings.get("alerting_args", None)
                },
            )
            if _logger is not None:
                litellm.callbacks.append(_logger)
def get_model_info_with_id(self, model, db_model=False) -> RouterModelInfo: def get_model_info_with_id(self, model, db_model=False) -> RouterModelInfo:
""" """
Common logic across add + delete router models Common logic across add + delete router models

View file

@ -0,0 +1,62 @@
from datetime import datetime
from typing import List, Literal, Optional, TypedDict, Union
from litellm.types.utils import StandardLoggingUserAPIKeyMetadata
class LinkDict(TypedDict, total=False):
    """A hyperlink attached to a PagerDuty event (`links` entry)."""

    href: str
    text: Optional[str]  # display text for the link
class ImageDict(TypedDict, total=False):
    """An image attached to a PagerDuty event (`images` entry)."""

    src: str  # image URL
    href: Optional[str]  # optional link target when the image is clicked
    alt: Optional[str]  # alternate text
class PagerDutyPayload(TypedDict, total=False):
    """The `payload` object of a PagerDuty Events API v2 request."""

    summary: str  # short human-readable description of the alert
    timestamp: Optional[str]  # ISO 8601 date-time format
    severity: Literal["critical", "warning", "error", "info"]
    source: str  # unique location of the affected system
    component: Optional[str]  # component of the source responsible
    group: Optional[str]  # logical grouping of components
    class_: Optional[str]  # Using class_ since 'class' is a reserved keyword
    custom_details: Optional[dict]  # free-form additional details
class PagerDutyRequestBody(TypedDict, total=False):
    """Top-level body for a PagerDuty Events API v2 enqueue request."""

    payload: PagerDutyPayload
    routing_key: str  # PagerDuty integration/routing key
    event_action: Literal["trigger", "acknowledge", "resolve"]
    dedup_key: Optional[str]  # correlates trigger/acknowledge/resolve events
    client: Optional[str]  # name of the submitting client
    client_url: Optional[str]
    links: Optional[List[LinkDict]]
    images: Optional[List[ImageDict]]
class AlertingConfig(TypedDict, total=False):
    """
    Config for alerting thresholds

    All keys are optional; consumers (e.g. PagerDutyAlerting) supply
    defaults for any missing key.
    """

    # Requests failing threshold
    failure_threshold: int  # Number of requests failing in a window
    failure_threshold_window_seconds: int  # Window in seconds

    # Requests hanging threshold
    hanging_threshold_seconds: float  # Number of seconds of waiting for a response before a request is considered hanging
    hanging_threshold_fails: int  # Number of requests hanging in a window
    hanging_threshold_window_seconds: int  # Window in seconds
class PagerDutyInternalEvent(StandardLoggingUserAPIKeyMetadata, total=False):
    """Simple structure to hold timestamp and error info.

    Inherits the user-API-key metadata fields; used as the in-memory
    record for both failure and hanging events.
    """

    failure_event_type: Literal["failed_response", "hanging_response"]
    timestamp: datetime  # when the event was recorded (UTC)
    error_class: Optional[str]
    error_code: Optional[str]
    error_llm_provider: Optional[str]

View file

@ -0,0 +1,96 @@
import asyncio
import os
import random
import sys
from datetime import datetime, timedelta
from typing import Optional
sys.path.insert(0, os.path.abspath("../.."))
import pytest
import litellm
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting, AlertingConfig
from litellm.proxy._types import UserAPIKeyAuth
@pytest.mark.asyncio
async def test_pagerduty_alerting():
    """Smoke test: one mocked failure should trip a threshold of 1 in 10s."""
    pd_alerter = PagerDutyAlerting(
        alerting_args=AlertingConfig(
            failure_threshold=1,
            failure_threshold_window_seconds=10,
        )
    )
    litellm.callbacks = [pd_alerter]

    try:
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            mock_response="litellm.RateLimitError",
        )
    except litellm.RateLimitError:
        # expected: the mocked failure only exists to feed the logger
        pass

    # give the async failure logger time to run
    await asyncio.sleep(2)
@pytest.mark.asyncio
async def test_pagerduty_alerting_high_failure_rate():
    """Trip a 3-failure/600s threshold: one mocked failure, then three more."""
    pd_alerter = PagerDutyAlerting(
        alerting_args=AlertingConfig(
            failure_threshold=3,
            failure_threshold_window_seconds=600,
        )
    )
    litellm.callbacks = [pd_alerter]

    async def _fire_mock_failure():
        # acompletion raises the mocked RateLimitError; swallow it so only
        # the failure-logging path is exercised.
        try:
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "hi"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

    await _fire_mock_failure()
    await asyncio.sleep(2)

    # make 3 more fails
    for _ in range(3):
        await _fire_mock_failure()
    await asyncio.sleep(2)
@pytest.mark.asyncio
async def test_pagerduty_hanging_request_alerting():
    """A near-zero hanging threshold should classify any request as hanging."""
    pd_alerter = PagerDutyAlerting(
        alerting_args=AlertingConfig(hanging_threshold_seconds=0.0000001)
    )
    litellm.callbacks = [pd_alerter]

    mock_user = UserAPIKeyAuth(
        api_key="test",
        key_alias="test-pagerduty",
        team_alias="test-team",
        org_id="test-org",
        user_id="test-user",
        end_user_id="test-end-user",
    )
    request_payload = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": "hi"}],
    }

    # schedules hanging_response_handler as a background task
    await pd_alerter.async_pre_call_hook(
        cache=None,
        user_api_key_dict=mock_user,
        data=request_payload,
        call_type="completion",
    )

    await litellm.acompletion(
        model="gpt-4o",
        messages=[{"role": "user", "content": "hi"}],
    )

    # let the hanging handler fire before the event loop closes
    await asyncio.sleep(1)

View file

@ -20,6 +20,7 @@ from prometheus_client import REGISTRY, CollectorRegistry
from litellm.integrations.lago import LagoLogger from litellm.integrations.lago import LagoLogger
from litellm.integrations.openmeter import OpenMeterLogger from litellm.integrations.openmeter import OpenMeterLogger
from litellm.integrations.braintrust_logging import BraintrustLogger from litellm.integrations.braintrust_logging import BraintrustLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.integrations.galileo import GalileoObserve from litellm.integrations.galileo import GalileoObserve
from litellm.integrations.langsmith import LangsmithLogger from litellm.integrations.langsmith import LangsmithLogger
from litellm.integrations.literal_ai import LiteralAILogger from litellm.integrations.literal_ai import LiteralAILogger
@ -68,6 +69,7 @@ callback_class_str_to_classType = {
"mlflow": MlflowLogger, "mlflow": MlflowLogger,
"langfuse": LangfusePromptManagement, "langfuse": LangfusePromptManagement,
"otel": OpenTelemetry, "otel": OpenTelemetry,
"pagerduty": PagerDutyAlerting,
} }
expected_env_vars = { expected_env_vars = {
@ -87,6 +89,7 @@ expected_env_vars = {
"ARIZE_SPACE_KEY": "arize_space_key", "ARIZE_SPACE_KEY": "arize_space_key",
"ARIZE_API_KEY": "arize_api_key", "ARIZE_API_KEY": "arize_api_key",
"ARGILLA_API_KEY": "argilla_api_key", "ARGILLA_API_KEY": "argilla_api_key",
"PAGERDUTY_API_KEY": "pagerduty_api_key",
} }