Add 'end_user', 'user' and 'requested_model' on more prometheus metrics (#7399)

* fix(prometheus.py): support streaming end user litellm_proxy_total_requests_metric tracking

* fix(prometheus.py): add 'requested_model' and 'end_user_id' to 'litellm_request_total_latency_metric_bucket'

enables latency tracking by end user + requested model

* fix(prometheus.py): add end user, user and requested model metrics to 'litellm_llm_api_latency_metric'

* test: update prometheus unit tests

* test(test_prometheus.py): update tests

* test(test_prometheus.py): fix test

* test: reorder test
Krish Dholakia 2024-12-24 14:08:30 -08:00 committed by GitHub
parent bd4ab1449e
commit 78fe124c14
8 changed files with 114 additions and 31 deletions
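
For context on what the new labels enable: both latency histograms now carry end_user, user and requested_model alongside the existing key/team labels, so latency can be sliced per customer and per requested model group at query time. A minimal standalone sketch of the idea with prometheus_client follows (label names are taken from the diff below; the helper function and sample values are illustrative, not LiteLLM's actual wiring):

    from prometheus_client import Histogram

    # Same label set this PR puts on litellm_request_total_latency_metric.
    request_latency = Histogram(
        "litellm_request_total_latency_metric",
        "Total latency (seconds) for a request to LiteLLM",
        labelnames=[
            "end_user",
            "hashed_api_key",
            "api_key_alias",
            "requested_model",
            "team",
            "team_alias",
            "user",
            "model",
        ],
    )


    def record_request_latency(seconds: float, **labels: str) -> None:
        # Illustrative helper (not in LiteLLM): observe one request's latency
        # using keyword labels, mirroring the .labels(**{...}) style in the diff.
        request_latency.labels(**labels).observe(seconds)


    # Example observation - all values are made up.
    record_request_latency(
        0.42,
        end_user="end-user-123",
        hashed_api_key="abc123",
        api_key_alias="prod-key",
        requested_model="gpt-4o",  # model group the caller asked for
        team="eng",
        team_alias="engineering",
        user="internal-user-1",
        model="gpt-4o-2024-08-06",  # underlying model that served the request
    )

With end_user and requested_model on the histogram buckets, a Prometheus query can break p95 latency down per customer or per model group without any extra instrumentation.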


@@ -69,11 +69,14 @@ class PrometheusLogger(CustomLogger):
             "litellm_request_total_latency_metric",
             "Total latency (seconds) for a request to LiteLLM",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                REQUESTED_MODEL,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.USER.value,
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -82,11 +85,14 @@ class PrometheusLogger(CustomLogger):
             "litellm_llm_api_latency_metric",
             "Total latency (seconds) for a models LLM API call",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.REQUESTED_MODEL.value,
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.USER.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -447,7 +453,20 @@ class PrometheusLogger(CustomLogger):
             self.set_llm_deployment_success_metrics(
                 kwargs, start_time, end_time, output_tokens
             )
-            pass
+
+            if (
+                standard_logging_payload["stream"] is True
+            ):  # log successful streaming requests from logging event hook.
+                self.litellm_proxy_total_requests_metric.labels(
+                    end_user=end_user_id,
+                    hashed_api_key=user_api_key,
+                    api_key_alias=user_api_key_alias,
+                    requested_model=model,
+                    team=user_api_team,
+                    team_alias=user_api_team_alias,
+                    user=user_id,
+                    status_code="200",
+                ).inc()

     def _increment_token_metrics(
         self,
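
A rough sketch of the control flow this hunk introduces, together with the get_standard_logging_object_payload change further down: a streaming request only has a complete response by the time the logging callback fires, so the standard logging payload now carries a stream flag and the success hook counts those requests in litellm_proxy_total_requests_metric there. Counter name, label names and the stream-detection condition are taken from the diff; the helper names and the counter description are simplified stand-ins:

    from typing import Any, Dict, Optional

    from prometheus_client import Counter

    proxy_total_requests = Counter(
        "litellm_proxy_total_requests_metric",
        "Total requests handled by the proxy (description is illustrative)",
        labelnames=[
            "end_user",
            "hashed_api_key",
            "api_key_alias",
            "requested_model",
            "team",
            "team_alias",
            "user",
            "status_code",
        ],
    )


    def detect_stream(kwargs: Dict[str, Any]) -> Optional[bool]:
        # Mirrors the payload-building change: a completed streaming response
        # (sync or async) marks the request as streaming.
        if (
            kwargs.get("complete_streaming_response") is not None
            or kwargs.get("async_complete_streaming_response") is not None
        ):
            return True
        return None


    def on_success(kwargs: Dict[str, Any], labels: Dict[str, str]) -> None:
        # Non-streaming requests are counted elsewhere; streaming ones are
        # counted here, once the full response has been assembled.
        if detect_stream(kwargs) is True:
            proxy_total_requests.labels(status_code="200", **labels).inc()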
@@ -631,23 +650,44 @@ class PrometheusLogger(CustomLogger):
             api_call_total_time: timedelta = end_time - api_call_start_time
             api_call_total_time_seconds = api_call_total_time.total_seconds()
             self.litellm_llm_api_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.REQUESTED_MODEL.value: standard_logging_payload[
+                        "model_group"
+                    ],
+                }
             ).observe(api_call_total_time_seconds)

         # total request latency
         if start_time is not None and isinstance(start_time, datetime):
             total_time: timedelta = end_time - start_time
             total_time_seconds = total_time.total_seconds()
             self.litellm_request_total_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    REQUESTED_MODEL: standard_logging_payload["model_group"],
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                }
             ).observe(total_time_seconds)

     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
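
The switch from positional arguments to .labels(**{...}) with named keys is worth noting: prometheus_client accepts either form, but positional values silently depend on the order of labelnames, which becomes fragile as labels are added and reordered in this PR. A tiny illustration with a made-up metric name:

    from prometheus_client import Histogram

    demo_latency = Histogram(
        "demo_latency_seconds",  # hypothetical metric, not one of LiteLLM's
        "Demo latency",
        labelnames=["end_user", "model"],
    )

    # Positional labels must match labelnames order exactly...
    demo_latency.labels("alice", "gpt-4o").observe(0.5)

    # ...while keyword labels are order-independent and self-documenting,
    # which is the style the diff above adopts.
    demo_latency.labels(model="gpt-4o", end_user="alice").observe(0.5)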


@@ -2961,11 +2961,19 @@ def get_standard_logging_object_payload(
            kwargs=kwargs,
        )

+    stream: Optional[bool] = None
+    if (
+        kwargs.get("complete_streaming_response") is not None
+        or kwargs.get("async_complete_streaming_response") is not None
+    ):
+        stream = True
+
    payload: StandardLoggingPayload = StandardLoggingPayload(
        id=str(id),
        trace_id=kwargs.get("litellm_trace_id"),  # type: ignore
        call_type=call_type or "",
        cache_hit=cache_hit,
+        stream=stream,
        status=status,
        saved_cache_cost=saved_cache_cost,
        startTime=start_time_float,


@@ -1,8 +1,12 @@
 model_list:
-  - model_name: whisper
+  - model_name: openai/*
     litellm_params:
-      model: whisper-1
+      model: openai/*
+      api_key: os.environ/OPENAI_API_KEY
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/gpt-3.5-turbo
       api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      mode: audio_transcription

+litellm_settings:
+  callbacks: ["prometheus"]


@@ -1,3 +1,5 @@
+from enum import Enum
+
 REQUESTED_MODEL = "requested_model"
 EXCEPTION_STATUS = "exception_status"
 EXCEPTION_CLASS = "exception_class"
@@ -41,3 +43,14 @@ LATENCY_BUCKETS = (
     300.0,
     float("inf"),
 )
+
+
+class UserAPIKeyLabelNames(Enum):
+    END_USER = "end_user"
+    USER = "user"
+    API_KEY_HASH = "hashed_api_key"
+    API_KEY_ALIAS = "api_key_alias"
+    TEAM = "team"
+    TEAM_ALIAS = "team_alias"
+    REQUESTED_MODEL = REQUESTED_MODEL
+    LITELLM_MODEL = "model"


@@ -1506,6 +1506,7 @@ class StandardLoggingPayload(TypedDict):
     id: str
     trace_id: str  # Trace multiple LLM calls belonging to same overall request (e.g. fallbacks/retries)
     call_type: str
+    stream: Optional[bool]
     response_cost: float
     response_cost_failure_debug_info: Optional[
         StandardLoggingModelCostFailureDebugInformation


@@ -274,7 +274,7 @@ def test_vertex_ai_anthropic_streaming():
 # )
 @pytest.mark.asyncio
 @pytest.mark.flaky(retries=3, delay=1)
-async def test_vertex_ai_anthropic_async():
+async def test_aavertex_ai_anthropic_async():
     # load_vertex_ai_credentials()
     try:


@@ -46,6 +46,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
     return StandardLoggingPayload(
         id="test_id",
         call_type="completion",
+        stream=False,
         response_cost=0.1,
         response_cost_failure_debug_info=None,
         status="success",
@@ -72,6 +73,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
             spend_logs_metadata=None,
             requester_ip_address="127.0.0.1",
             requester_metadata=None,
+            user_api_key_end_user_id="test_end_user",
         ),
         cache_hit=False,
         cache_key=None,
@@ -110,6 +112,7 @@ async def test_async_log_success_event(prometheus_logger):
                 "user_api_key": "test_key",
                 "user_api_key_user_id": "test_user",
                 "user_api_key_team_id": "test_team",
+                "user_api_key_end_user_id": "test_end_user",
             }
         },
         "start_time": datetime.now(),
@@ -299,7 +302,14 @@ def test_set_latency_metrics(prometheus_logger):
     # end_time - api_call_start_time
     prometheus_logger.litellm_llm_api_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        model="gpt-3.5-turbo",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        end_user="test_end_user",
+        requested_model="openai-gpt",
     )
     prometheus_logger.litellm_llm_api_latency_metric.labels().observe.assert_called_once_with(
         1.5
@@ -307,7 +317,14 @@ def test_set_latency_metrics(prometheus_logger):
     # total latency for the request
     prometheus_logger.litellm_request_total_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        end_user="test_end_user",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        requested_model="openai-gpt",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        model="gpt-3.5-turbo",
     )
     prometheus_logger.litellm_request_total_latency_metric.labels().observe.assert_called_once_with(
         2.0


@@ -145,12 +145,12 @@ async def test_proxy_success_metrics():
     # Check if the success metric is present and correct
     assert (
-        'litellm_request_total_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_request_total_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )
     assert (
-        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )