Merge pull request #9760 from BerriAI/litellm_prometheus_error_monitoring
[Reliability] Prometheus emit llm provider on failure metric - make it easy to differentiate litellm error vs llm api error
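This commit routes every `exception_class` label through a new helper, `_get_exception_class_name`, which prefixes the exception class with the title-cased `llm_provider` attached to the exception, so upstream LLM API errors (e.g. `Openai.RateLimitError`) can be told apart from litellm-side errors in Prometheus. A minimal standalone sketch of that formatting logic, mirroring the helper added in the diff below (the `FakeRateLimitError` class here is purely illustrative, not a litellm type):

    def format_exception_class(exception: Exception) -> str:
        # Prefix with the provider attached to the exception, if any
        provider = getattr(exception, "llm_provider", "") or ""
        if provider:
            # pretty-print the provider, e.g. `openai` -> `Openai.`
            provider = provider[0].upper() + provider[1:] + "."
        return provider + exception.__class__.__name__

    # Illustrative exception type carrying a provider hint
    class FakeRateLimitError(Exception):
        def __init__(self, llm_provider=None):
            self.llm_provider = llm_provider

    print(format_exception_class(FakeRateLimitError(llm_provider="openai")))  # Openai.FakeRateLimitError
    print(format_exception_class(ValueError("bad input")))  # ValueError (no provider -> litellm-side error)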
Commit 036fb01ea7
3 changed files with 57 additions and 11 deletions
@@ -818,7 +818,7 @@ class PrometheusLogger(CustomLogger):
             requested_model=request_data.get("model", ""),
             status_code=str(getattr(original_exception, "status_code", None)),
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -917,7 +917,7 @@ class PrometheusLogger(CustomLogger):
             api_base=api_base,
             api_provider=llm_provider,
             exception_status=str(getattr(exception, "status_code", None)),
-            exception_class=exception.__class__.__name__,
+            exception_class=self._get_exception_class_name(exception),
             requested_model=model_group,
             hashed_api_key=standard_logging_payload["metadata"][
                 "user_api_key_hash"
@@ -1146,6 +1146,22 @@ class PrometheusLogger(CustomLogger):
             )
             return

+    @staticmethod
+    def _get_exception_class_name(exception: Exception) -> str:
+        exception_class_name = ""
+        if hasattr(exception, "llm_provider"):
+            exception_class_name = getattr(exception, "llm_provider") or ""
+
+        # pretty print the provider name on prometheus
+        # eg. `openai` -> `Openai.`
+        if len(exception_class_name) >= 1:
+            exception_class_name = (
+                exception_class_name[0].upper() + exception_class_name[1:] + "."
+            )
+
+        exception_class_name += exception.__class__.__name__
+        return exception_class_name
+
     async def log_success_fallback_event(
         self, original_model_group: str, kwargs: dict, original_exception: Exception
     ):
@@ -1181,7 +1197,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -1225,7 +1241,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )

@@ -713,7 +713,7 @@ async def test_async_post_call_failure_hook(prometheus_logger):
         team_alias="test_team_alias",
         user="test_user",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_proxy_failed_requests_metric.labels().inc.assert_called_once()

@@ -948,7 +948,7 @@ async def test_log_success_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_successful_fallbacks.labels().inc.assert_called_once()

@@ -985,7 +985,7 @@ async def test_log_failure_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_failed_fallbacks.labels().inc.assert_called_once()

@@ -1500,3 +1500,33 @@ def test_set_team_budget_metrics_with_custom_labels(prometheus_logger, monkeypatch):
         "metadata_organization": None,
         "metadata_environment": None,
     }
+
+
+def test_get_exception_class_name(prometheus_logger):
+    """
+    Test that _get_exception_class_name correctly formats the exception class name
+    """
+    # Test case 1: Exception with llm_provider
+    rate_limit_error = litellm.RateLimitError(
+        message="Rate limit exceeded",
+        llm_provider="openai",
+        model="gpt-3.5-turbo"
+    )
+    assert prometheus_logger._get_exception_class_name(rate_limit_error) == "Openai.RateLimitError"
+
+    # Test case 2: Exception with empty llm_provider
+    auth_error = litellm.AuthenticationError(
+        message="Invalid API key",
+        llm_provider="",
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(auth_error) == "AuthenticationError"
+
+    # Test case 3: Exception with None llm_provider
+    context_window_error = litellm.ContextWindowExceededError(
+        message="Context length exceeded",
+        llm_provider=None,
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(context_window_error) == "ContextWindowExceededError"

@@ -107,7 +107,7 @@ async def test_proxy_failure_metrics():
     print("/metrics", metrics)

     # Check if the failure metric is present and correct
-    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'
+    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'

     assert (
         expected_metric in metrics
@@ -121,7 +121,7 @@ async def test_proxy_failure_metrics():
     )

     assert (
-        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
+        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
         in metrics
     )

@@ -229,13 +229,13 @@ async def test_proxy_fallback_metrics():

     # Check if successful fallback metric is incremented
     assert (
-        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )

     # Check if failed fallback metric is incremented
     assert (
-        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )