diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 7c23ef86f4..6fba69d005 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -818,7 +818,7 @@ class PrometheusLogger(CustomLogger):
             requested_model=request_data.get("model", ""),
             status_code=str(getattr(original_exception, "status_code", None)),
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -917,7 +917,7 @@ class PrometheusLogger(CustomLogger):
             api_base=api_base,
             api_provider=llm_provider,
             exception_status=str(getattr(exception, "status_code", None)),
-            exception_class=exception.__class__.__name__,
+            exception_class=self._get_exception_class_name(exception),
             requested_model=model_group,
             hashed_api_key=standard_logging_payload["metadata"][
                 "user_api_key_hash"
@@ -1146,6 +1146,22 @@ class PrometheusLogger(CustomLogger):
         )
         return
 
+    @staticmethod
+    def _get_exception_class_name(exception: Exception) -> str:
+        exception_class_name = ""
+        if hasattr(exception, "llm_provider"):
+            exception_class_name = getattr(exception, "llm_provider") or ""
+
+        # pretty print the provider name on prometheus
+        # eg. `openai` -> `Openai.`
+        if len(exception_class_name) >= 1:
+            exception_class_name = (
+                exception_class_name[0].upper() + exception_class_name[1:] + "."
+            )
+
+        exception_class_name += exception.__class__.__name__
+        return exception_class_name
+
     async def log_success_fallback_event(
         self, original_model_group: str, kwargs: dict, original_exception: Exception
     ):
@@ -1181,7 +1197,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -1225,7 +1241,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
 
diff --git a/tests/logging_callback_tests/test_prometheus_unit_tests.py b/tests/logging_callback_tests/test_prometheus_unit_tests.py
index 6bc5b42c45..ddfce710d7 100644
--- a/tests/logging_callback_tests/test_prometheus_unit_tests.py
+++ b/tests/logging_callback_tests/test_prometheus_unit_tests.py
@@ -713,7 +713,7 @@ async def test_async_post_call_failure_hook(prometheus_logger):
         team_alias="test_team_alias",
         user="test_user",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_proxy_failed_requests_metric.labels().inc.assert_called_once()
 
@@ -948,7 +948,7 @@ async def test_log_success_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_successful_fallbacks.labels().inc.assert_called_once()
 
@@ -985,7 +985,7 @@ async def test_log_failure_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_failed_fallbacks.labels().inc.assert_called_once()
 
@@ -1500,3 +1500,33 @@ def test_set_team_budget_metrics_with_custom_labels(prometheus_logger, monkeypat
         "metadata_organization": None,
         "metadata_environment": None,
     }
+
+
+def test_get_exception_class_name(prometheus_logger):
+    """
+    Test that _get_exception_class_name correctly formats the exception class name
+    """
+    # Test case 1: Exception with llm_provider
+    rate_limit_error = litellm.RateLimitError(
+        message="Rate limit exceeded",
+        llm_provider="openai",
+        model="gpt-3.5-turbo"
+    )
+    assert prometheus_logger._get_exception_class_name(rate_limit_error) == "Openai.RateLimitError"
+
+    # Test case 2: Exception with empty llm_provider
+    auth_error = litellm.AuthenticationError(
+        message="Invalid API key",
+        llm_provider="",
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(auth_error) == "AuthenticationError"
+
+    # Test case 3: Exception with None llm_provider
+    context_window_error = litellm.ContextWindowExceededError(
+        message="Context length exceeded",
+        llm_provider=None,
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(context_window_error) == "ContextWindowExceededError"
+
diff --git a/tests/otel_tests/test_prometheus.py b/tests/otel_tests/test_prometheus.py
index 932ae0bbe7..9cae5c565f 100644
--- a/tests/otel_tests/test_prometheus.py
+++ b/tests/otel_tests/test_prometheus.py
@@ -107,7 +107,7 @@ async def test_proxy_failure_metrics():
     print("/metrics", metrics)
 
     # Check if the failure metric is present and correct
-    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'
+    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'
 
     assert (
         expected_metric in metrics
@@ -121,7 +121,7 @@ async def test_proxy_failure_metrics():
     )
 
     assert (
-        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
+        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
         in metrics
     )
 
@@ -229,13 +229,13 @@ async def test_proxy_fallback_metrics():
 
     # Check if successful fallback metric is incremented
     assert (
-        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )
 
     # Check if failed fallback metric is incremented
     assert (
-        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )
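
Note for reviewers: below is a minimal standalone sketch of the label formatting this patch introduces, runnable outside the proxy. `format_exception_label` is a hypothetical mirror of the new `PrometheusLogger._get_exception_class_name` staticmethod, and `FakeRateLimitError` is a stand-in for litellm's exception types (which carry an `llm_provider` attribute); neither name exists in the codebase.

# sketch.py -- illustrates the new `exception_class` label format,
# e.g. provider `openai` + RateLimitError -> "Openai.RateLimitError"


def format_exception_label(exception: Exception) -> str:
    # litellm exceptions expose `llm_provider`; plain exceptions fall through to ""
    provider = getattr(exception, "llm_provider", "") or ""
    # capitalize the first letter of the provider and join with a dot
    prefix = provider[0].upper() + provider[1:] + "." if provider else ""
    return prefix + exception.__class__.__name__


class FakeRateLimitError(Exception):
    # stand-in for litellm.RateLimitError, which sets `llm_provider` on itself
    def __init__(self, llm_provider=None):
        super().__init__("rate limited")
        self.llm_provider = llm_provider


assert format_exception_label(FakeRateLimitError("openai")) == "Openai.RateLimitError"
assert format_exception_label(FakeRateLimitError(None)) == "FakeRateLimitError"
assert format_exception_label(ValueError("boom")) == "ValueError"

The provider is folded into the existing `exception_class` label value rather than added as a new label, so the failure and fallback metrics keep the same label names and existing dashboards only see a prefixed value.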