diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md
index 4b913d2e8..10e6456c2 100644
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
 
+### Request Latency Metrics
+
+| Metric Name          | Description                          |
+|----------------------|--------------------------------------|
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
+
+
 ### LLM API / Provider Metrics
 
 | Metric Name          | Description                          |
diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 1471f59b7..659e5b193 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
             ],
         )
 
+        # request latency metrics
+        self.litellm_request_total_latency_metric = Histogram(
+            "litellm_request_total_latency_metric",
+            "Total latency (seconds) for a request to LiteLLM",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
+        self.litellm_llm_api_latency_metric = Histogram(
+            "litellm_llm_api_latency_metric",
+            "Total latency (seconds) for a model's LLM API call",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
         # Counter for spend
         self.litellm_spend_metric = Counter(
             "litellm_spend_metric",
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
             user_api_key, user_api_key_alias, model_group
         ).set(remaining_tokens)
 
+        # latency metrics
+        total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+        total_time_seconds = total_time.total_seconds()
+        api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+            "api_call_start_time"
+        )
+
+        api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+        litellm_call_id = kwargs.get("litellm_call_id")
+
+        self.litellm_request_total_latency_metric.labels(
+            model, litellm_call_id
+        ).observe(total_time_seconds)
+
+        self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+            api_call_total_time_seconds
+        )
+
         # set x-ratelimit headers
         if premium_user is True:
             self.set_llm_deployment_success_metrics(
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index d59f98558..dbf2a7d3e 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -354,6 +354,8 @@ class Logging:
                    str(e)
                )
            )
+
+        self.model_call_details["api_call_start_time"] = datetime.datetime.now()
         # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
         callbacks = litellm.input_callback + self.dynamic_input_callbacks
         for callback in callbacks:
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 65c7f7052..7c524eb18 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -4,7 +4,9 @@ model_list:
       model: openai/fake
       api_key: fake-key
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
-
+litellm_settings:
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
 guardrails:
   - guardrail_name: "lakera-pre-guard"
     litellm_params: