mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-25 18:54:30 +00:00
feat - track latency per llm deployment
This commit is contained in:
parent 83446926af
commit 01f5456e3d
1 changed file with 52 additions and 6 deletions
@@ -1,13 +1,12 @@
# used for /metrics endpoint on LiteLLM Proxy
#### What this does ####
# On success, log events to Prometheus

import datetime
import os
import subprocess
import sys
import traceback
import uuid
from datetime import datetime, timedelta
from typing import Optional, TypedDict, Union

import dotenv
@@ -25,7 +24,7 @@ class PrometheusLogger(CustomLogger):
        **kwargs,
    ):
        try:
            from prometheus_client import Counter, Gauge
            from prometheus_client import Counter, Gauge, Histogram

            from litellm.proxy.proxy_server import premium_user

@@ -164,6 +163,13 @@ class PrometheusLogger(CustomLogger):
                labelnames=_logged_llm_labels,
            )

            # Deployment Latency tracking
            self.llm_deployment_latency_per_output_token = Histogram(
                name="llm_deployment_latency_per_output_token",
                documentation="LLM Deployment Analytics - Latency per output token",
                labelnames=_logged_llm_labels,
            )

        except Exception as e:
            print_verbose(f"Got exception on init prometheus client {str(e)}")
            raise e
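For context on the metric registered above: prometheus_client's Histogram records each observation into cumulative buckets plus a running sum and count, which is what lets latency per output token be aggregated per deployment later. Below is a minimal, standalone sketch of that API, not LiteLLM's actual wiring; the metric name and label names come from this diff (the labels match the .labels() calls in the last hunk), while the registry, label values, and observed value are invented for illustration.

from prometheus_client import CollectorRegistry, Histogram, generate_latest

# Separate registry so this sketch does not touch the global default REGISTRY.
registry = CollectorRegistry()

# Same metric name and label names as the diff; default buckets are used here.
latency_per_output_token = Histogram(
    name="llm_deployment_latency_per_output_token",
    documentation="LLM Deployment Analytics - Latency per output token",
    labelnames=["litellm_model_name", "model_id", "api_base", "api_provider"],
    registry=registry,
)

# Record one hypothetical request: 0.02 seconds of latency per generated token.
latency_per_output_token.labels(
    litellm_model_name="gpt-3.5-turbo",
    model_id="deployment-1",
    api_base="https://api.openai.com",
    api_provider="openai",
).observe(0.02)

# Print the exposition-format text that a /metrics endpoint would serve.
print(generate_latest(registry).decode())

The diff relies on the default bucket boundaries; a deployment-specific latency SLO could pass an explicit buckets=... list instead.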
@@ -213,9 +219,10 @@ class PrometheusLogger(CustomLogger):
        _remaining_api_key_budget = safe_get_remaining_budget(
            max_budget=_api_key_max_budget, spend=_api_key_spend
        )

        output_tokens = 1.0
        if response_obj is not None:
            tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)
            output_tokens = response_obj.get("usage", {}).get("completion_tokens", 0)
        else:
            tokens_used = 0

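The output_tokens default of 1.0 introduced above keeps the later division well defined when no response is available; when a usage block exists but reports zero completion tokens, the per-token histogram is simply skipped by the output_tokens > 0 guard in the last hunk. A tiny standalone illustration of that fallback (the response dicts are made up):

def extract_output_tokens(response_obj):
    # Mirrors the diff's logic: start from 1.0, then prefer the
    # completion_tokens count from the usage block when a response exists.
    output_tokens = 1.0
    if response_obj is not None:
        output_tokens = response_obj.get("usage", {}).get("completion_tokens", 0)
    return output_tokens

print(extract_output_tokens({"usage": {"completion_tokens": 128}}))  # 128
print(extract_output_tokens({"usage": {}}))  # 0 -> per-token metric skipped later
print(extract_output_tokens(None))  # 1.0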
@@ -270,7 +277,9 @@ class PrometheusLogger(CustomLogger):

        # set x-ratelimit headers
        if premium_user is True:
            self.set_llm_deployment_success_metrics(kwargs)
            self.set_llm_deployment_success_metrics(
                kwargs, start_time, end_time, output_tokens
            )
        pass

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
@@ -356,7 +365,13 @@ class PrometheusLogger(CustomLogger):
        except:
            pass

    def set_llm_deployment_success_metrics(self, request_kwargs: dict):
    def set_llm_deployment_success_metrics(
        self,
        request_kwargs: dict,
        start_time,
        end_time,
        output_tokens: float = 1.0,
    ):
        try:
            verbose_logger.debug("setting remaining tokens requests metric")
            _response_headers = request_kwargs.get("response_headers")
@@ -425,6 +440,37 @@ class PrometheusLogger(CustomLogger):
                api_base=api_base,
                api_provider=llm_provider,
            ).inc()

            # Track deployment Latency
            response_ms: timedelta = end_time - start_time
            time_to_first_token_response_time: Optional[timedelta] = None

            if (
                request_kwargs.get("stream", None) is not None
                and request_kwargs["stream"] == True
            ):
                # only log ttft for streaming request
                time_to_first_token_response_time = (
                    request_kwargs.get("completion_start_time", end_time) - start_time
                )

            # use the metric that is not None
            # if streaming - use time_to_first_token_response
            # if not streaming - use response_ms
            _latency: timedelta = time_to_first_token_response_time or response_ms
            _latency_seconds = _latency.total_seconds()

            # latency per output token
            latency_per_token = None
            if output_tokens is not None and output_tokens > 0:
                latency_per_token = _latency_seconds / output_tokens
                self.llm_deployment_latency_per_output_token.labels(
                    litellm_model_name=litellm_model_name,
                    model_id=model_id,
                    api_base=api_base,
                    api_provider=llm_provider,
                ).observe(latency_per_token)

        except Exception as e:
            verbose_logger.error(
                "Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format(
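To make the new latency arithmetic concrete, here is a standalone walk-through of what the hunk above records for one streaming request: time-to-first-token replaces total duration, and the result is divided by the number of completion tokens before being observed. All timestamps and counts below are invented; only the selection rule and the division mirror the diff.

from datetime import datetime, timedelta
from typing import Optional

# Hypothetical timings for a single streaming request.
start_time = datetime(2024, 6, 1, 12, 0, 0)
completion_start_time = start_time + timedelta(milliseconds=350)  # first token arrives
end_time = start_time + timedelta(seconds=4)  # stream finishes
output_tokens = 200.0
is_streaming = True

response_ms: timedelta = end_time - start_time
time_to_first_token: Optional[timedelta] = None
if is_streaming:
    # Only streaming requests get a meaningful time-to-first-token.
    time_to_first_token = completion_start_time - start_time

# Same selection rule as the diff: prefer TTFT when it was recorded.
_latency = time_to_first_token or response_ms
_latency_seconds = _latency.total_seconds()

if output_tokens > 0:
    latency_per_token = _latency_seconds / output_tokens
    # The real code calls .observe(latency_per_token) on the histogram here.
    print(f"latency per output token: {latency_per_token:.6f} s")  # 0.001750 s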