(fix) PrometheusServicesLogger _get_metric should return metric in Registry (#6486)

* fix logging DB fails on prometheus

* unit testing log to otel wrapper

* unit testing for service logger + prometheus

* use LATENCY buckets for service logging

* fix service logging

* fix _get_metric in prom services logger

* add clear doc string

* unit testing for prom service logger
This commit is contained in:
Ishaan Jaff 2024-10-29 21:29:19 +05:30 committed by GitHub
parent 8e19a31d36
commit f05bdd4074
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 100 additions and 11 deletions

View file

@ -81,18 +81,17 @@ class PrometheusServicesLogger:
return True return True
return False return False
def get_metric(self, metric_name): def _get_metric(self, metric_name):
for metric in self.REGISTRY.collect(): """
for sample in metric.samples: Helper function to get a metric from the registry by name.
if metric_name == sample.name: """
return metric return self.REGISTRY._names_to_collectors.get(metric_name)
return None
def create_histogram(self, service: str, type_of_request: str): def create_histogram(self, service: str, type_of_request: str):
metric_name = "litellm_{}_{}".format(service, type_of_request) metric_name = "litellm_{}_{}".format(service, type_of_request)
is_registered = self.is_metric_registered(metric_name) is_registered = self.is_metric_registered(metric_name)
if is_registered: if is_registered:
return self.get_metric(metric_name) return self._get_metric(metric_name)
return self.Histogram( return self.Histogram(
metric_name, metric_name,
"Latency for {} service".format(service), "Latency for {} service".format(service),
@ -104,7 +103,7 @@ class PrometheusServicesLogger:
metric_name = "litellm_{}_{}".format(service, type_of_request) metric_name = "litellm_{}_{}".format(service, type_of_request)
is_registered = self.is_metric_registered(metric_name) is_registered = self.is_metric_registered(metric_name)
if is_registered: if is_registered:
return self.get_metric(metric_name) return self._get_metric(metric_name)
return self.Counter( return self.Counter(
metric_name, metric_name,
"Total {} for {} service".format(type_of_request, service), "Total {} for {} service".format(type_of_request, service),

View file

@ -8,3 +8,4 @@ model_list:
litellm_settings: litellm_settings:
callbacks: ["prometheus"] callbacks: ["prometheus"]
service_callback: ["prometheus_system"] service_callback: ["prometheus_system"]
cache: true

View file

@ -138,6 +138,12 @@ def safe_deep_copy(data):
def log_to_opentelemetry(func): def log_to_opentelemetry(func):
"""
Decorator to log the duration of a DB related function to ServiceLogger()
Handles logging DB success/failure to ServiceLogger(), which logs to Prometheus, OTEL, Datadog
"""
@wraps(func) @wraps(func)
async def wrapper(*args, **kwargs): async def wrapper(*args, **kwargs):
start_time: datetime = datetime.now() start_time: datetime = datetime.now()
@ -145,10 +151,8 @@ def log_to_opentelemetry(func):
try: try:
result = await func(*args, **kwargs) result = await func(*args, **kwargs)
end_time: datetime = datetime.now() end_time: datetime = datetime.now()
from litellm.proxy.proxy_server import proxy_logging_obj from litellm.proxy.proxy_server import proxy_logging_obj
# Log to OTEL only if "parent_otel_span" is in kwargs and is not None
if "PROXY" not in func.__name__: if "PROXY" not in func.__name__:
await proxy_logging_obj.service_logging_obj.async_service_success_hook( await proxy_logging_obj.service_logging_obj.async_service_success_hook(
service=ServiceTypes.DB, service=ServiceTypes.DB,

View file

@ -210,3 +210,88 @@ async def test_service_logger_db_monitoring_failure():
assert actual_payload.call_type == "query" assert actual_payload.call_type == "query"
assert actual_payload.is_error is True assert actual_payload.is_error is True
assert actual_payload.error == "Database connection failed" assert actual_payload.error == "Database connection failed"
def test_get_metric_existing():
"""Test _get_metric when metric exists. _get_metric should return the metric object"""
pl = PrometheusServicesLogger()
# Create a metric first
hist = pl.create_histogram(
service="test_service", type_of_request="test_type_of_request"
)
# Test retrieving existing metric
retrieved_metric = pl._get_metric("litellm_test_service_test_type_of_request")
assert retrieved_metric is hist
assert retrieved_metric is not None
def test_get_metric_non_existing():
"""Test _get_metric when metric doesn't exist, returns None"""
pl = PrometheusServicesLogger()
# Test retrieving non-existent metric
non_existent = pl._get_metric("non_existent_metric")
assert non_existent is None
def test_create_histogram_new():
"""Test creating a new histogram"""
pl = PrometheusServicesLogger()
# Create new histogram
hist = pl.create_histogram(
service="test_service", type_of_request="test_type_of_request"
)
assert hist is not None
assert pl._get_metric("litellm_test_service_test_type_of_request") is hist
def test_create_histogram_existing():
"""Test creating a histogram that already exists"""
pl = PrometheusServicesLogger()
# Create initial histogram
hist1 = pl.create_histogram(
service="test_service", type_of_request="test_type_of_request"
)
# Create same histogram again
hist2 = pl.create_histogram(
service="test_service", type_of_request="test_type_of_request"
)
assert hist2 is hist1 # same object
assert pl._get_metric("litellm_test_service_test_type_of_request") is hist1
def test_create_counter_new():
"""Test creating a new counter"""
pl = PrometheusServicesLogger()
# Create new counter
counter = pl.create_counter(
service="test_service", type_of_request="test_type_of_request"
)
assert counter is not None
assert pl._get_metric("litellm_test_service_test_type_of_request") is counter
def test_create_counter_existing():
"""Test creating a counter that already exists"""
pl = PrometheusServicesLogger()
# Create initial counter
counter1 = pl.create_counter(
service="test_service", type_of_request="test_type_of_request"
)
# Create same counter again
counter2 = pl.create_counter(
service="test_service", type_of_request="test_type_of_request"
)
assert counter2 is counter1
assert pl._get_metric("litellm_test_service_test_type_of_request") is counter1