Merge pull request #5335 from BerriAI/litellm_add_metrics_latency

[Feat-Proxy] Prometheus Metrics to Track request latency, track llm api latency

Commit 8dbcdafe4b · 4 changed files with 52 additions and 1 deletion
```diff
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
 
+### Request Latency Metrics
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to the LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
 ### LLM API / Provider Metrics
 
 | Metric Name | Description |
```
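To sanity-check these new documentation entries against a live deployment, a quick scrape of the `/metrics` endpoint (the URL shown in the hunk context above) should surface both series. A minimal sketch, assuming a proxy running locally on port 4000 and the `requests` package:

```python
import requests

# Scrape the proxy's Prometheus endpoint (assumes http://localhost:4000/metrics,
# per the docs context above).
resp = requests.get("http://localhost:4000/metrics")
resp.raise_for_status()

# Each Histogram is exposed as _bucket, _sum, and _count series
# per (model, litellm_call_id) label pair.
for line in resp.text.splitlines():
    if line.startswith(
        ("litellm_request_total_latency_metric", "litellm_llm_api_latency_metric")
    ):
        print(line)
```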
```diff
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
             ],
         )
 
+        # request latency metrics
+        self.litellm_request_total_latency_metric = Histogram(
+            "litellm_request_total_latency_metric",
+            "Total latency (seconds) for a request to LiteLLM",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
+        self.litellm_llm_api_latency_metric = Histogram(
+            "litellm_llm_api_latency_metric",
+            "Total latency (seconds) for a model's LLM API call",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
         # Counter for spend
         self.litellm_spend_metric = Counter(
             "litellm_spend_metric",
```
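Since no `buckets=` argument is passed, both histograms use `prometheus_client`'s default latency buckets. A standalone sketch of the same registration, showing what one observation produces in the exposition format (the metric name mirrors the diff; the label values and observed latency are made up):

```python
from prometheus_client import Histogram, generate_latest

# Same registration shape as the diff; without buckets= the default
# latency buckets (0.005s ... 10s, plus +Inf) apply.
request_latency = Histogram(
    "litellm_request_total_latency_metric",
    "Total latency (seconds) for a request to LiteLLM",
    labelnames=["model", "litellm_call_id"],
)

# One observation fills the matching buckets and bumps _sum/_count
# for this (model, litellm_call_id) label set.
request_latency.labels("gpt-3.5-turbo", "chatcmpl-123").observe(0.42)

# Dump the default registry in Prometheus text exposition format.
print(generate_latest().decode())
```

One design note worth flagging: `litellm_call_id` is unique per request, so every call creates a fresh time series. That is handy for tracing individual requests, but it can grow label cardinality quickly on a busy proxy.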
```diff
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
                 user_api_key, user_api_key_alias, model_group
             ).set(remaining_tokens)
 
+            # latency metrics
+            total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+            total_time_seconds = total_time.total_seconds()
+            api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+                "api_call_start_time"
+            )
+
+            api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+            litellm_call_id = kwargs.get("litellm_call_id")
+
+            self.litellm_request_total_latency_metric.labels(
+                model, litellm_call_id
+            ).observe(total_time_seconds)
+
+            self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+                api_call_total_time_seconds
+            )
+
             # set x-ratelimit headers
             if premium_user is True:
                 self.set_llm_deployment_success_metrics(
```
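The handler derives both durations by subtracting `datetime` objects pulled from `kwargs`, which yields `timedelta`s. A small worked sketch of that arithmetic with hypothetical timestamps:

```python
from datetime import datetime, timedelta

# Hypothetical kwargs, mirroring what the success handler receives.
kwargs = {
    "start_time": datetime(2024, 8, 22, 12, 0, 0, 0),
    "api_call_start_time": datetime(2024, 8, 22, 12, 0, 0, 250_000),
    "end_time": datetime(2024, 8, 22, 12, 0, 1, 750_000),
    "litellm_call_id": "chatcmpl-123",
}

# Total time covers the whole trip through the proxy
# (routing, auth, callbacks, plus the provider call).
total_time: timedelta = kwargs["end_time"] - kwargs["start_time"]
# API time covers only the window after the provider call started.
api_call_total_time: timedelta = kwargs["end_time"] - kwargs["api_call_start_time"]

print(total_time.total_seconds())           # 1.75
print(api_call_total_time.total_seconds())  # 1.5
```

Note that this path assumes `api_call_start_time` is present in `kwargs`; it is populated by the logging change in the next hunk.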
```diff
@@ -354,6 +354,8 @@ class Logging:
                         str(e)
                     )
                 )
 
+        self.model_call_details["api_call_start_time"] = datetime.datetime.now()
+
         # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
         callbacks = litellm.input_callback + self.dynamic_input_callbacks
         for callback in callbacks:
```
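This single added line stamps the moment after LiteLLM's pre-call work but before the provider request goes out, so `end_time - api_call_start_time` isolates provider latency from proxy overhead. A toy illustration of the ordering (a hypothetical stand-in, not litellm's actual `Logging` class):

```python
import datetime
import time

class FakeLogging:
    """Toy stand-in showing where the timestamp lands, not litellm's real class."""

    def __init__(self):
        # Stamped when the request first enters the proxy.
        self.model_call_details = {"start_time": datetime.datetime.now()}

    def pre_call(self):
        # ... pre-call setup (prompt formatting, auth, callbacks) runs here ...
        # Stamp just before the provider request, so the two timestamps
        # bracket proxy-side overhead separately from provider latency.
        self.model_call_details["api_call_start_time"] = datetime.datetime.now()

logging_obj = FakeLogging()
time.sleep(0.05)       # simulated proxy-side overhead
logging_obj.pre_call() # api_call_start_time lands ~50ms after start_time
```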
```diff
@@ -4,7 +4,9 @@ model_list:
     model: openai/fake
     api_key: fake-key
     api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
+litellm_settings:
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
+
 guardrails:
   - guardrail_name: "lakera-pre-guard"
     litellm_params:
```
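With `success_callback` and `failure_callback` both wired to `"prometheus"`, an end-to-end check is to send one request through the proxy and confirm both histograms report a sample. A hedged sketch, assuming a local proxy started with this config, a master key of `sk-1234`, and a model name of `fake-openai-endpoint` for the `openai/fake` entry (all three are assumptions about the deployment, not values shown in the diff):

```python
import requests

PROXY = "http://localhost:4000"                # assumed local proxy address
HEADERS = {"Authorization": "Bearer sk-1234"}  # hypothetical master key

# One request through the proxy makes the success callback fire.
requests.post(
    f"{PROXY}/chat/completions",
    headers=HEADERS,
    json={
        "model": "fake-openai-endpoint",  # hypothetical model_name for openai/fake
        "messages": [{"role": "user", "content": "hi"}],
    },
)

# Both latency histograms should now report a sample; mean latency
# per series is simply _sum / _count.
metrics = requests.get(f"{PROXY}/metrics").text
for line in metrics.splitlines():
    if "latency_metric_sum" in line or "latency_metric_count" in line:
        print(line)
```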