Merge pull request #9760 from BerriAI/litellm_prometheus_error_monitoring
[Reliability] Prometheus emit llm provider on failure metric - make it easy to differentiate litellm error vs llm api error
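This commit routes every `exception_class` label through a new helper, `_get_exception_class_name`, which prefixes the exception class with the title-cased `llm_provider` attached to the exception, so upstream LLM API errors (e.g. `Openai.RateLimitError`) can be told apart from litellm-side errors in Prometheus. A minimal standalone sketch of that formatting logic, mirroring the helper added in the diff below (the `FakeRateLimitError` class here is purely illustrative, not a litellm type):

    def format_exception_class(exception: Exception) -> str:
        # Prefix with the provider attached to the exception, if any
        provider = getattr(exception, "llm_provider", "") or ""
        if provider:
            # pretty-print the provider, e.g. `openai` -> `Openai.`
            provider = provider[0].upper() + provider[1:] + "."
        return provider + exception.__class__.__name__

    # Illustrative exception type carrying a provider hint
    class FakeRateLimitError(Exception):
        def __init__(self, llm_provider=None):
            self.llm_provider = llm_provider

    print(format_exception_class(FakeRateLimitError(llm_provider="openai")))  # Openai.FakeRateLimitError
    print(format_exception_class(ValueError("bad input")))  # ValueError (no provider -> litellm-side error)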
Commit 036fb01ea7
3 changed files with 57 additions and 11 deletions
@@ -818,7 +818,7 @@ class PrometheusLogger(CustomLogger):
             requested_model=request_data.get("model", ""),
             status_code=str(getattr(original_exception, "status_code", None)),
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -917,7 +917,7 @@ class PrometheusLogger(CustomLogger):
             api_base=api_base,
             api_provider=llm_provider,
             exception_status=str(getattr(exception, "status_code", None)),
-            exception_class=exception.__class__.__name__,
+            exception_class=self._get_exception_class_name(exception),
             requested_model=model_group,
             hashed_api_key=standard_logging_payload["metadata"][
                 "user_api_key_hash"
@@ -1146,6 +1146,22 @@ class PrometheusLogger(CustomLogger):
             )
             return

+    @staticmethod
+    def _get_exception_class_name(exception: Exception) -> str:
+        exception_class_name = ""
+        if hasattr(exception, "llm_provider"):
+            exception_class_name = getattr(exception, "llm_provider") or ""
+
+        # pretty print the provider name on prometheus
+        # eg. `openai` -> `Openai.`
+        if len(exception_class_name) >= 1:
+            exception_class_name = (
+                exception_class_name[0].upper() + exception_class_name[1:] + "."
+            )
+
+        exception_class_name += exception.__class__.__name__
+        return exception_class_name
+
     async def log_success_fallback_event(
         self, original_model_group: str, kwargs: dict, original_exception: Exception
     ):
@@ -1181,7 +1197,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )
         _labels = prometheus_label_factory(
@@ -1225,7 +1241,7 @@ class PrometheusLogger(CustomLogger):
             team=standard_metadata["user_api_key_team_id"],
             team_alias=standard_metadata["user_api_key_team_alias"],
             exception_status=str(getattr(original_exception, "status_code", None)),
-            exception_class=str(original_exception.__class__.__name__),
+            exception_class=self._get_exception_class_name(original_exception),
             tags=_tags,
         )

@@ -713,7 +713,7 @@ async def test_async_post_call_failure_hook(prometheus_logger):
         team_alias="test_team_alias",
         user="test_user",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_proxy_failed_requests_metric.labels().inc.assert_called_once()

@@ -948,7 +948,7 @@ async def test_log_success_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_successful_fallbacks.labels().inc.assert_called_once()

@@ -985,7 +985,7 @@ async def test_log_failure_fallback_event(prometheus_logger):
         team="test_team",
         team_alias="test_team_alias",
         exception_status="429",
-        exception_class="RateLimitError",
+        exception_class="Openai.RateLimitError",
     )
     prometheus_logger.litellm_deployment_failed_fallbacks.labels().inc.assert_called_once()

@@ -1500,3 +1500,33 @@ def test_set_team_budget_metrics_with_custom_labels(prometheus_logger, monkeypatch):
         "metadata_organization": None,
         "metadata_environment": None,
     }
+
+
+def test_get_exception_class_name(prometheus_logger):
+    """
+    Test that _get_exception_class_name correctly formats the exception class name
+    """
+    # Test case 1: Exception with llm_provider
+    rate_limit_error = litellm.RateLimitError(
+        message="Rate limit exceeded",
+        llm_provider="openai",
+        model="gpt-3.5-turbo"
+    )
+    assert prometheus_logger._get_exception_class_name(rate_limit_error) == "Openai.RateLimitError"
+
+    # Test case 2: Exception with empty llm_provider
+    auth_error = litellm.AuthenticationError(
+        message="Invalid API key",
+        llm_provider="",
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(auth_error) == "AuthenticationError"
+
+    # Test case 3: Exception with None llm_provider
+    context_window_error = litellm.ContextWindowExceededError(
+        message="Context length exceeded",
+        llm_provider=None,
+        model="gpt-4"
+    )
+    assert prometheus_logger._get_exception_class_name(context_window_error) == "ContextWindowExceededError"

@@ -107,7 +107,7 @@ async def test_proxy_failure_metrics():
     print("/metrics", metrics)

     # Check if the failure metric is present and correct
-    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'
+    expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'

     assert (
         expected_metric in metrics
@@ -121,7 +121,7 @@ async def test_proxy_failure_metrics():
     )

     assert (
-        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
+        'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
         in metrics
     )

@@ -229,13 +229,13 @@ async def test_proxy_fallback_metrics():

     # Check if successful fallback metric is incremented
     assert (
-        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )

     # Check if failed fallback metric is incremented
     assert (
-        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
+        'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
         in metrics
     )