mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-26 19:24:27 +00:00
Add 'end_user', 'user' and 'requested_model' on more prometheus metrics (#7399)
All checks were successful
Read Version from pyproject.toml / read-version (push) Successful in 11s
* fix(prometheus.py): support streaming end user litellm_proxy_total_requests_metric tracking
* fix(prometheus.py): add 'requested_model' and 'end_user_id' to 'litellm_request_total_latency_metric_bucket'; enables latency tracking by end user + requested model
* fix(prometheus.py): add end user, user and requested model metrics to 'litellm_llm_api_latency_metric'
* test: update prometheus unit tests
* test(test_prometheus.py): update tests
* test(test_prometheus.py): fix test
* test: reorder test
This commit is contained in:
parent bd4ab1449e
commit 78fe124c14
8 changed files with 114 additions and 31 deletions
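Before the hunks, a minimal, hypothetical sketch of the pattern this commit extends, using prometheus_client's standard Histogram API. A histogram is declared with the enriched label set (end_user, user, requested_model, plus the existing key/team labels), and each observation supplies those labels. The metric name and label values below are illustrative, not LiteLLM's internals.

# Illustrative standalone sketch of the labeling pattern used in the diff below.
from prometheus_client import Histogram

REQUEST_LATENCY = Histogram(
    "example_request_total_latency_seconds",
    "Total latency (seconds) for a request",
    labelnames=[
        "end_user",
        "hashed_api_key",
        "api_key_alias",
        "requested_model",
        "team",
        "team_alias",
        "user",
        "model",
    ],
)

# Labels can be passed as keyword arguments (or an unpacked dict); the
# observation is then recorded on the returned child metric.
REQUEST_LATENCY.labels(
    end_user="end-user-1",
    hashed_api_key="abc123",
    api_key_alias="my-key",
    requested_model="gpt-3.5-turbo",
    team="team-a",
    team_alias="team-a-alias",
    user="user-1",
    model="gpt-3.5-turbo",
).observe(1.42)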
@@ -69,11 +69,14 @@ class PrometheusLogger(CustomLogger):
             "litellm_request_total_latency_metric",
             "Total latency (seconds) for a request to LiteLLM",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                REQUESTED_MODEL,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.USER.value,
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -82,11 +85,14 @@ class PrometheusLogger(CustomLogger):
             "litellm_llm_api_latency_metric",
             "Total latency (seconds) for a models LLM API call",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.REQUESTED_MODEL.value,
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.USER.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -447,7 +453,20 @@ class PrometheusLogger(CustomLogger):
             self.set_llm_deployment_success_metrics(
                 kwargs, start_time, end_time, output_tokens
             )
             pass
+        if (
+            standard_logging_payload["stream"] is True
+        ):  # log successful streaming requests from logging event hook.
+            self.litellm_proxy_total_requests_metric.labels(
+                end_user=end_user_id,
+                hashed_api_key=user_api_key,
+                api_key_alias=user_api_key_alias,
+                requested_model=model,
+                team=user_api_team,
+                team_alias=user_api_team_alias,
+                user=user_id,
+                status_code="200",
+            ).inc()

     def _increment_token_metrics(
         self,
@@ -631,23 +650,44 @@ class PrometheusLogger(CustomLogger):
             api_call_total_time: timedelta = end_time - api_call_start_time
             api_call_total_time_seconds = api_call_total_time.total_seconds()
             self.litellm_llm_api_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.REQUESTED_MODEL.value: standard_logging_payload[
+                        "model_group"
+                    ],
+                }
             ).observe(api_call_total_time_seconds)

         # total request latency
         if start_time is not None and isinstance(start_time, datetime):
             total_time: timedelta = end_time - start_time
             total_time_seconds = total_time.total_seconds()

             self.litellm_request_total_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    REQUESTED_MODEL: standard_logging_payload["model_group"],
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                }
             ).observe(total_time_seconds)

     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
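The two hunks above switch from positional labels to an unpacked dict keyed by the enum values, with user and end_user read out of the payload metadata. A minimal, hypothetical illustration of that pattern follows; the payload shape and helper name are assumptions for the example, not LiteLLM code.

# Illustrative only: build a label dict from a payload-like mapping and
# unpack it into .labels(); keys must exactly match the metric's labelnames.
from typing import Any, Dict

def build_latency_labels(
    payload: Dict[str, Any], model: str, api_key_hash: str
) -> Dict[str, Any]:
    metadata = payload.get("metadata", {})
    return {
        "model": model,
        "hashed_api_key": api_key_hash,
        "user": metadata.get("user_api_key_user_id"),
        "end_user": metadata.get("user_api_key_end_user_id"),
        "requested_model": payload.get("model_group"),
    }

# usage sketch, with a histogram declared on exactly these five labelnames:
# latency_metric.labels(**build_latency_labels(payload, "gpt-3.5-turbo", "abc123")).observe(0.5)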
@@ -2961,11 +2961,19 @@ def get_standard_logging_object_payload(
         kwargs=kwargs,
     )

+    stream: Optional[bool] = None
+    if (
+        kwargs.get("complete_streaming_response") is not None
+        or kwargs.get("async_complete_streaming_response") is not None
+    ):
+        stream = True
+
     payload: StandardLoggingPayload = StandardLoggingPayload(
         id=str(id),
         trace_id=kwargs.get("litellm_trace_id"),  # type: ignore
         call_type=call_type or "",
         cache_hit=cache_hit,
+        stream=stream,
         status=status,
         saved_cache_cost=saved_cache_cost,
         startTime=start_time_float,
@@ -1,8 +1,12 @@
 model_list:
-  - model_name: whisper
+  - model_name: openai/*
     litellm_params:
-      model: whisper-1
+      model: openai/*
+      api_key: os.environ/OPENAI_API_KEY
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/gpt-3.5-turbo
       api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      mode: audio_transcription

+litellm_settings:
+  callbacks: ["prometheus"]
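With a config like the new one above, the proxy's metrics endpoint can be scraped to confirm the enriched labels show up after a request. The port and /metrics path below are assumptions based on common LiteLLM proxy defaults, not something stated in this commit.

# Hypothetical smoke check: fetch the proxy's Prometheus endpoint and look
# for one of the newly added labels (assumes the proxy runs on localhost:4000).
import requests

resp = requests.get("http://localhost:4000/metrics", timeout=10)
resp.raise_for_status()
found = "requested_model=" in resp.text
print(f"requested_model label present: {found}")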
@@ -1,3 +1,5 @@
+from enum import Enum
+
 REQUESTED_MODEL = "requested_model"
 EXCEPTION_STATUS = "exception_status"
 EXCEPTION_CLASS = "exception_class"
@@ -41,3 +43,14 @@ LATENCY_BUCKETS = (
     300.0,
     float("inf"),
 )
+
+
+class UserAPIKeyLabelNames(Enum):
+    END_USER = "end_user"
+    USER = "user"
+    API_KEY_HASH = "hashed_api_key"
+    API_KEY_ALIAS = "api_key_alias"
+    TEAM = "team"
+    TEAM_ALIAS = "team_alias"
+    REQUESTED_MODEL = REQUESTED_MODEL
+    LITELLM_MODEL = "model"
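A short, hypothetical sketch of why an enum of label names is convenient: the same members can drive both the metric declaration and the .labels() call, so the two cannot drift apart. The metric name and values below are illustrative, not from this repository.

# Illustrative only: one enum feeds both labelnames= and the labels() call.
from enum import Enum
from prometheus_client import Counter

class LabelNames(Enum):
    END_USER = "end_user"
    REQUESTED_MODEL = "requested_model"

EXAMPLE_REQUESTS = Counter(
    "example_requests_total",
    "Requests by end user and requested model",
    labelnames=[LabelNames.END_USER.value, LabelNames.REQUESTED_MODEL.value],
)

EXAMPLE_REQUESTS.labels(
    **{
        LabelNames.END_USER.value: "end-user-1",
        LabelNames.REQUESTED_MODEL.value: "gpt-3.5-turbo",
    }
).inc()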
@@ -1506,6 +1506,7 @@ class StandardLoggingPayload(TypedDict):
     id: str
     trace_id: str  # Trace multiple LLM calls belonging to same overall request (e.g. fallbacks/retries)
     call_type: str
+    stream: Optional[bool]
     response_cost: float
     response_cost_failure_debug_info: Optional[
         StandardLoggingModelCostFailureDebugInformation
@@ -274,7 +274,7 @@ def test_vertex_ai_anthropic_streaming():
 # )
 @pytest.mark.asyncio
 @pytest.mark.flaky(retries=3, delay=1)
-async def test_vertex_ai_anthropic_async():
+async def test_aavertex_ai_anthropic_async():
     # load_vertex_ai_credentials()
     try:
@@ -46,6 +46,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
     return StandardLoggingPayload(
         id="test_id",
         call_type="completion",
+        stream=False,
         response_cost=0.1,
         response_cost_failure_debug_info=None,
         status="success",
@@ -72,6 +73,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
             spend_logs_metadata=None,
             requester_ip_address="127.0.0.1",
             requester_metadata=None,
+            user_api_key_end_user_id="test_end_user",
         ),
         cache_hit=False,
         cache_key=None,
@@ -110,6 +112,7 @@ async def test_async_log_success_event(prometheus_logger):
                 "user_api_key": "test_key",
                 "user_api_key_user_id": "test_user",
                 "user_api_key_team_id": "test_team",
+                "user_api_key_end_user_id": "test_end_user",
             }
         },
         "start_time": datetime.now(),
@@ -299,7 +302,14 @@ def test_set_latency_metrics(prometheus_logger):

     # end_time - api_call_start_time
     prometheus_logger.litellm_llm_api_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        model="gpt-3.5-turbo",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        end_user="test_end_user",
+        requested_model="openai-gpt",
     )
     prometheus_logger.litellm_llm_api_latency_metric.labels().observe.assert_called_once_with(
         1.5
@@ -307,7 +317,14 @@ def test_set_latency_metrics(prometheus_logger):

     # total latency for the request
     prometheus_logger.litellm_request_total_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        end_user="test_end_user",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        requested_model="openai-gpt",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        model="gpt-3.5-turbo",
     )
     prometheus_logger.litellm_request_total_latency_metric.labels().observe.assert_called_once_with(
         2.0
@@ -145,12 +145,12 @@ async def test_proxy_success_metrics():

     # Check if the success metric is present and correct
     assert (
-        'litellm_request_total_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_request_total_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )

     assert (
-        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )