feat - track latency per llm deployment

Ishaan Jaff 2024-08-10 12:53:56 -07:00
parent 83446926af
commit 01f5456e3d


@@ -1,13 +1,12 @@
# used for /metrics endpoint on LiteLLM Proxy
#### What this does ####
# On success, log events to Prometheus
import datetime
import os
import subprocess
import sys
import traceback
import uuid
from datetime import datetime, timedelta
from typing import Optional, TypedDict, Union
import dotenv
@@ -25,7 +24,7 @@ class PrometheusLogger(CustomLogger):
**kwargs,
):
try:
from prometheus_client import Counter, Gauge
from prometheus_client import Counter, Gauge, Histogram
from litellm.proxy.proxy_server import premium_user
@@ -164,6 +163,13 @@ class PrometheusLogger(CustomLogger):
labelnames=_logged_llm_labels,
)
# Deployment Latency tracking
self.llm_deployment_latency_per_output_token = Histogram(
name="llm_deployment_latency_per_output_token",
documentation="LLM Deployment Analytics - Latency per output token",
labelnames=_logged_llm_labels,
)
except Exception as e:
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
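
For readers unfamiliar with prometheus_client, here is a minimal standalone sketch (not part of the diff) of how a Histogram with these deployment labels is registered and fed. The four label names mirror the .labels() call later in this diff; the label values and the observed number are made up for illustration:

# minimal sketch, assuming prometheus_client is installed; values are illustrative
from prometheus_client import Histogram, generate_latest

llm_deployment_latency_per_output_token = Histogram(
    name="llm_deployment_latency_per_output_token",
    documentation="LLM Deployment Analytics - Latency per output token",
    labelnames=["litellm_model_name", "model_id", "api_base", "api_provider"],
)

# record one observation (seconds per output token) for a hypothetical deployment
llm_deployment_latency_per_output_token.labels(
    litellm_model_name="gpt-3.5-turbo",
    model_id="model-123",
    api_base="https://api.openai.com",
    api_provider="openai",
).observe(0.012)

print(generate_latest().decode())  # the exposition text a /metrics endpoint serves
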
@@ -213,9 +219,10 @@ class PrometheusLogger(CustomLogger):
_remaining_api_key_budget = safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
output_tokens = 1.0
if response_obj is not None:
tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)
output_tokens = response_obj.get("usage", {}).get("completion_tokens", 0)
else:
tokens_used = 0
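
When a response object is present, both token counts come off its usage block; a hedged illustration of that lookup (the dict shape here is an assumption, real responses come from the LiteLLM completion call):

# illustrative response shape, not taken from the commit
response_obj = {"usage": {"prompt_tokens": 12, "completion_tokens": 48, "total_tokens": 60}}
tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)          # 60
output_tokens = response_obj.get("usage", {}).get("completion_tokens", 0)   # 48
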
@@ -270,7 +277,9 @@ class PrometheusLogger(CustomLogger):
# set x-ratelimit headers
if premium_user is True:
self.set_llm_deployment_success_metrics(kwargs)
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
@@ -356,7 +365,13 @@ class PrometheusLogger(CustomLogger):
except:
pass
def set_llm_deployment_success_metrics(self, request_kwargs: dict):
def set_llm_deployment_success_metrics(
self,
request_kwargs: dict,
start_time,
end_time,
output_tokens: float = 1.0,
):
try:
verbose_logger.debug("setting remaining tokens requests metric")
_response_headers = request_kwargs.get("response_headers")
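
The hunk below does the actual latency math on datetime objects; a small standalone sketch of that arithmetic (timestamps and token count are made up):

from datetime import datetime, timedelta

start_time = datetime(2024, 8, 10, 12, 0, 0)
completion_start_time = datetime(2024, 8, 10, 12, 0, 2)   # first streamed chunk arrives
end_time = datetime(2024, 8, 10, 12, 0, 6)
output_tokens = 200.0

response_ms: timedelta = end_time - start_time                        # 6s total
time_to_first_token: timedelta = completion_start_time - start_time   # 2s

# streaming requests use time-to-first-token; non-streaming fall back to total latency
_latency = time_to_first_token or response_ms
latency_per_token = _latency.total_seconds() / output_tokens          # 0.01 s per output token
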
@@ -425,6 +440,37 @@ class PrometheusLogger(CustomLogger):
api_base=api_base,
api_provider=llm_provider,
).inc()
# Track deployment Latency
response_ms: timedelta = end_time - start_time
time_to_first_token_response_time: Optional[timedelta] = None
if (
request_kwargs.get("stream", None) is not None
and request_kwargs["stream"] == True
):
# only log ttft for streaming request
time_to_first_token_response_time = (
request_kwargs.get("completion_start_time", end_time) - start_time
)
# use the metric that is not None
# if streaming - use time_to_first_token_response
# if not streaming - use response_ms
_latency: timedelta = time_to_first_token_response_time or response_ms
_latency_seconds = _latency.total_seconds()
# latency per output token
latency_per_token = None
if output_tokens is not None and output_tokens > 0:
latency_per_token = _latency_seconds / output_tokens
self.llm_deployment_latency_per_output_token.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
api_provider=llm_provider,
).observe(latency_per_token)
except Exception as e:
verbose_logger.error(
"Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format(