"""
|
|
Unit tests for prometheus metrics
|
|
"""
|
|
|
|
import pytest
|
|
import aiohttp
|
|
import asyncio
|
|
import uuid
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(
|
|
0, os.path.abspath("../..")
|
|
) # Adds the parent directory to the system path
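
# Assumption (inferred from the requests below, not stated elsewhere in this file):
# these tests expect a LiteLLM proxy to already be running locally at
# http://0.0.0.0:4000, accepting the API key "sk-1234", with the
# "fake-azure-endpoint" and "fake-openai-endpoint" models configured.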


async def make_bad_chat_completion_request(session, key):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "fake-azure-endpoint",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()
        return status, response_text


async def make_good_chat_completion_request(session, key):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }

    data = {
        "model": "fake-openai-endpoint",
        "messages": [{"role": "user", "content": f"Hello {uuid.uuid4()}"}],
        "tags": ["teamB"],
    }
    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()
        return status, response_text


async def make_chat_completion_request_with_fallback(session, key):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }

    # make a request with a successful client-side fallback
    data = {
        "model": "fake-azure-endpoint",
        "messages": [{"role": "user", "content": "Hello"}],
        "fallbacks": ["fake-openai-endpoint"],
    }
    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

    # make a request with a failed fallback
    data = {
        "model": "fake-azure-endpoint",
        "messages": [{"role": "user", "content": "Hello"}],
        "fallbacks": ["unknown-model"],
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

    return


@pytest.mark.asyncio
async def test_proxy_failure_metrics():
    """
    - Make 1 bad chat completion call to "fake-azure-endpoint"
    - GET /metrics
    - Assert the failure metric for the requested model is incremented by 1
    - Assert the exception class and status code are correct
    """
    async with aiohttp.ClientSession() as session:
        # Make a bad chat completion call
        status, response_text = await make_bad_chat_completion_request(
            session, "sk-1234"
        )

        # Check if the request failed as expected
        assert status == 429, f"Expected status 429, but got {status}"

        # Get metrics
        async with session.get("http://0.0.0.0:4000/metrics") as response:
            metrics = await response.text()

        print("/metrics", metrics)

        # Check if the failure metric is present and correct
        expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'

        assert (
            expected_metric in metrics
        ), "Expected failure metric not found in /metrics"

        # NOTE: this only sanity-checks that the expected string is non-empty;
        # the deployment failure metric itself is asserted against /metrics below.
        expected_llm_deployment_failure = 'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_provider="openai",exception_class="RateLimitError",exception_status="429",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint"} 1.0'
        assert expected_llm_deployment_failure

        assert (
            'litellm_proxy_total_requests_metric_total{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0'
            in metrics
        )

        assert (
            'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}'
            in metrics
        )


@pytest.mark.asyncio
async def test_proxy_success_metrics():
    """
    Make 1 good /chat/completions call to "fake-openai-endpoint"
    GET /metrics
    Assert the success metric is incremented by 1
    """

    async with aiohttp.ClientSession() as session:
        # Make a good chat completion call
        status, response_text = await make_good_chat_completion_request(
            session, "sk-1234"
        )

        # Check if the request succeeded as expected
        assert status == 200, f"Expected status 200, but got {status}"

        # Get metrics
        async with session.get("http://0.0.0.0:4000/metrics") as response:
            metrics = await response.text()

        print("/metrics", metrics)

        # Check if the success metric is present and correct
        assert (
            'litellm_request_total_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
            in metrics
        )

        assert (
            'litellm_llm_api_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
            in metrics
        )

        # assert (
        #     'litellm_deployment_latency_per_output_token_count{api_base="https://exampleopenaiendpoint-production.up.railway.app/",api_key_alias="None",api_provider="openai",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="fake",model_id="team-b-model",team="None",team_alias="None"}'
        #     in metrics
        # )

        verify_latency_metrics(metrics)


def verify_latency_metrics(metrics: str):
    """
    Assert that the LATENCY_BUCKETS distribution is used for
    - litellm_request_total_latency_metric_bucket
    - litellm_llm_api_latency_metric_bucket
    """
    from litellm.types.integrations.prometheus import LATENCY_BUCKETS
    import re

    metric_names = [
        "litellm_request_total_latency_metric_bucket",
        "litellm_llm_api_latency_metric_bucket",
    ]

    for metric_name in metric_names:
        # Extract all 'le' values for the current metric
        pattern = rf'{metric_name}{{.*?le="(.*?)".*?}}'
        le_values = re.findall(pattern, metrics)

        # Convert to set for easier comparison
        actual_buckets = set(le_values)

        print("actual_buckets", actual_buckets)
        expected_buckets = []
        for bucket in LATENCY_BUCKETS:
            expected_buckets.append(str(bucket))

        # replace inf with +Inf to match the Prometheus exposition format
        expected_buckets = [
            bucket.replace("inf", "+Inf") for bucket in expected_buckets
        ]

        print("expected_buckets", expected_buckets)
        expected_buckets = set(expected_buckets)
        # Verify all expected buckets are present
        assert (
            actual_buckets == expected_buckets
        ), f"Mismatch in {metric_name} buckets. Expected: {expected_buckets}, Got: {actual_buckets}"


@pytest.mark.asyncio
async def test_proxy_fallback_metrics():
    """
    Make requests with client-side fallbacks (one successful, one failed) - check metrics
    """

    async with aiohttp.ClientSession() as session:
        # Make chat completion calls with client-side fallbacks
        await make_chat_completion_request_with_fallback(session, "sk-1234")

        # Get metrics
        async with session.get("http://0.0.0.0:4000/metrics") as response:
            metrics = await response.text()

        print("/metrics", metrics)

        # Check if successful fallback metric is incremented
        assert (
            'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
            in metrics
        )

        # Check if failed fallback metric is incremented
        assert (
            'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0'
            in metrics
        )