From f05bdd40741e7e9040f416ed1e9309c1b963a0e1 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 29 Oct 2024 21:29:19 +0530 Subject: [PATCH] (fix) `PrometheusServicesLogger` `_get_metric` should return metric in Registry (#6486) * fix logging DB fails on prometheus * unit testing log to otel wrapper * unit testing for service logger + prometheus * use LATENCY buckets for service logging * fix service logging * fix _get_metric in prom services logger * add clear doc string * unit testing for prom service logger --- litellm/integrations/prometheus_services.py | 15 ++-- litellm/proxy/proxy_config.yaml | 3 +- litellm/proxy/utils.py | 8 +- .../local_testing/test_prometheus_service.py | 85 +++++++++++++++++++ 4 files changed, 100 insertions(+), 11 deletions(-) diff --git a/litellm/integrations/prometheus_services.py b/litellm/integrations/prometheus_services.py index e657732db..a36ac9b9c 100644 --- a/litellm/integrations/prometheus_services.py +++ b/litellm/integrations/prometheus_services.py @@ -81,18 +81,17 @@ class PrometheusServicesLogger: return True return False - def get_metric(self, metric_name): - for metric in self.REGISTRY.collect(): - for sample in metric.samples: - if metric_name == sample.name: - return metric - return None + def _get_metric(self, metric_name): + """ + Helper function to get a metric from the registry by name. + """ + return self.REGISTRY._names_to_collectors.get(metric_name) def create_histogram(self, service: str, type_of_request: str): metric_name = "litellm_{}_{}".format(service, type_of_request) is_registered = self.is_metric_registered(metric_name) if is_registered: - return self.get_metric(metric_name) + return self._get_metric(metric_name) return self.Histogram( metric_name, "Latency for {} service".format(service), @@ -104,7 +103,7 @@ class PrometheusServicesLogger: metric_name = "litellm_{}_{}".format(service, type_of_request) is_registered = self.is_metric_registered(metric_name) if is_registered: - return self.get_metric(metric_name) + return self._get_metric(metric_name) return self.Counter( metric_name, "Total {} for {} service".format(type_of_request, service), diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 5bc044526..95eff095c 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -7,4 +7,5 @@ model_list: litellm_settings: callbacks: ["prometheus"] - service_callback: ["prometheus_system"] \ No newline at end of file + service_callback: ["prometheus_system"] + cache: true diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9eab792ad..656e97a6c 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -138,6 +138,12 @@ def safe_deep_copy(data): def log_to_opentelemetry(func): + """ + Decorator to log the duration of a DB related function to ServiceLogger() + + Handles logging DB success/failure to ServiceLogger(), which logs to Prometheus, OTEL, Datadog + """ + @wraps(func) async def wrapper(*args, **kwargs): start_time: datetime = datetime.now() @@ -145,10 +151,8 @@ def log_to_opentelemetry(func): try: result = await func(*args, **kwargs) end_time: datetime = datetime.now() - from litellm.proxy.proxy_server import proxy_logging_obj - # Log to OTEL only if "parent_otel_span" is in kwargs and is not None if "PROXY" not in func.__name__: await proxy_logging_obj.service_logging_obj.async_service_success_hook( service=ServiceTypes.DB, diff --git a/tests/local_testing/test_prometheus_service.py b/tests/local_testing/test_prometheus_service.py index 49dd74839..c640532a0 100644 --- a/tests/local_testing/test_prometheus_service.py +++ b/tests/local_testing/test_prometheus_service.py @@ -210,3 +210,88 @@ async def test_service_logger_db_monitoring_failure(): assert actual_payload.call_type == "query" assert actual_payload.is_error is True assert actual_payload.error == "Database connection failed" + + +def test_get_metric_existing(): + """Test _get_metric when metric exists. _get_metric should return the metric object""" + pl = PrometheusServicesLogger() + # Create a metric first + hist = pl.create_histogram( + service="test_service", type_of_request="test_type_of_request" + ) + + # Test retrieving existing metric + retrieved_metric = pl._get_metric("litellm_test_service_test_type_of_request") + assert retrieved_metric is hist + assert retrieved_metric is not None + + +def test_get_metric_non_existing(): + """Test _get_metric when metric doesn't exist, returns None""" + pl = PrometheusServicesLogger() + + # Test retrieving non-existent metric + non_existent = pl._get_metric("non_existent_metric") + assert non_existent is None + + +def test_create_histogram_new(): + """Test creating a new histogram""" + pl = PrometheusServicesLogger() + + # Create new histogram + hist = pl.create_histogram( + service="test_service", type_of_request="test_type_of_request" + ) + + assert hist is not None + assert pl._get_metric("litellm_test_service_test_type_of_request") is hist + + +def test_create_histogram_existing(): + """Test creating a histogram that already exists""" + pl = PrometheusServicesLogger() + + # Create initial histogram + hist1 = pl.create_histogram( + service="test_service", type_of_request="test_type_of_request" + ) + + # Create same histogram again + hist2 = pl.create_histogram( + service="test_service", type_of_request="test_type_of_request" + ) + + assert hist2 is hist1 # same object + assert pl._get_metric("litellm_test_service_test_type_of_request") is hist1 + + +def test_create_counter_new(): + """Test creating a new counter""" + pl = PrometheusServicesLogger() + + # Create new counter + counter = pl.create_counter( + service="test_service", type_of_request="test_type_of_request" + ) + + assert counter is not None + assert pl._get_metric("litellm_test_service_test_type_of_request") is counter + + +def test_create_counter_existing(): + """Test creating a counter that already exists""" + pl = PrometheusServicesLogger() + + # Create initial counter + counter1 = pl.create_counter( + service="test_service", type_of_request="test_type_of_request" + ) + + # Create same counter again + counter2 = pl.create_counter( + service="test_service", type_of_request="test_type_of_request" + ) + + assert counter2 is counter1 + assert pl._get_metric("litellm_test_service_test_type_of_request") is counter1