(prometheus - minor bug fix) - litellm_llm_api_time_to_first_token_metric not populating for bedrock models (#7740)

* fix prometheus ttft

* fix test_set_latency_metrics

* fix _set_latency_metrics

* fix _set_latency_metrics

* fix test_set_latency_metrics

* test_async_log_success_event

* huggingface/mistralai/Mistral-7B-Instruct-v0.3
This commit is contained in:
Ishaan Jaff 2025-01-13 20:16:34 -08:00 committed by GitHub
parent d88f01d518
commit 9daa6fb0b4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 3 additions and 9 deletions

View file

@@ -449,7 +449,6 @@ class PrometheusLogger(CustomLogger):
     # why type ignore below?
     # 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains.
     # 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal
-    standard_logging_payload=standard_logging_payload,  # type: ignore
     enum_values=enum_values,
 )
@@ -626,22 +625,17 @@ class PrometheusLogger(CustomLogger):
     user_api_key_alias: Optional[str],
     user_api_team: Optional[str],
     user_api_team_alias: Optional[str],
-    standard_logging_payload: StandardLoggingPayload,
     enum_values: UserAPIKeyLabelValues,
 ):
     # latency metrics
-    model_parameters: dict = standard_logging_payload["model_parameters"]
     end_time: datetime = kwargs.get("end_time") or datetime.now()
     start_time: Optional[datetime] = kwargs.get("start_time")
     api_call_start_time = kwargs.get("api_call_start_time", None)
     completion_start_time = kwargs.get("completion_start_time", None)
     if (
         completion_start_time is not None
         and isinstance(completion_start_time, datetime)
-        and model_parameters.get("stream")
-        is True  # only emit for streaming requests
+        and kwargs.get("stream", False) is True  # only emit for streaming requests
     ):
         time_to_first_token_seconds = (
             completion_start_time - api_call_start_time

View file

@@ -112,6 +112,7 @@ async def test_async_log_success_event(prometheus_logger):
     standard_logging_object = create_standard_logging_payload()
     kwargs = {
         "model": "gpt-3.5-turbo",
+        "stream": True,
         "litellm_params": {
             "metadata": {
                 "user_api_key": "test_key",
@@ -298,7 +299,6 @@ def test_set_latency_metrics(prometheus_logger):
     time to first token, llm api latency, and request total latency metrics are set to the values in the standard logging payload
     """
     standard_logging_payload = create_standard_logging_payload()
-    standard_logging_payload["model_parameters"] = {"stream": True}
     prometheus_logger.litellm_llm_api_time_to_first_token_metric = MagicMock()
     prometheus_logger.litellm_llm_api_latency_metric = MagicMock()
     prometheus_logger.litellm_request_total_latency_metric = MagicMock()
@@ -322,6 +322,7 @@ def test_set_latency_metrics(prometheus_logger):
     "api_call_start_time": now - timedelta(seconds=1.5),  # when the api call starts
     "completion_start_time": now
     - timedelta(seconds=1),  # when the completion starts
+    "stream": True,
 }
 prometheus_logger._set_latency_metrics(
prometheus_logger._set_latency_metrics( prometheus_logger._set_latency_metrics(
@@ -331,7 +332,6 @@ def test_set_latency_metrics(prometheus_logger):
     user_api_key_alias="alias1",
     user_api_team="team1",
     user_api_team_alias="team_alias1",
-    standard_logging_payload=standard_logging_payload,
     enum_values=enum_values,
 )