Merge pull request #9719 from BerriAI/litellm_metrics_pod_lock_manager

[Reliability] Emit operational metrics for new DB Transaction architecture
2025-04-25 02:34:29 +00:00 · 2025-04-04 21:12:06 -07:00 · 2025-04-04 21:12:06 -07:00 · 8c3670e192
commit 8c3670e192
parent 7cd7bdbd0f df51d8bcfa
11 changed files with 351 additions and 40 deletions
--- a/docs/my-website/docs/proxy/admin_ui_sso.md
+++ b/docs/my-website/docs/proxy/admin_ui_sso.md
@ -156,7 +156,7 @@ PROXY_LOGOUT_URL="https://www.google.com"
 Set this in your .env (so the proxy can set the correct redirect url)
 ```shell
-PROXY_BASE_URL=https://litellm-api.up.railway.app/
+PROXY_BASE_URL=https://litellm-api.up.railway.app
 ```
 #### Step 4. Test flow
--- a/litellm/_service_logger.py
+++ b/litellm/_service_logger.py
@ -124,6 +124,7 @@ class ServiceLogging(CustomLogger):
            service=service,
            duration=duration,
            call_type=call_type,
            event_metadata=event_metadata,
        )
        for callback in litellm.service_callback:
@ -229,6 +230,7 @@ class ServiceLogging(CustomLogger):
            service=service,
            duration=duration,
            call_type=call_type,
            event_metadata=event_metadata,
        )
        for callback in litellm.service_callback:
--- a/litellm/integrations/prometheus_services.py
+++ b/litellm/integrations/prometheus_services.py
@ -3,11 +3,16 @@
 #    On success + failure, log events to Prometheus for litellm / adjacent services (litellm, redis, postgres, llm api providers)
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 from litellm._logging import print_verbose, verbose_logger
 from litellm.types.integrations.prometheus import LATENCY_BUCKETS
-from litellm.types.services import ServiceLoggerPayload, ServiceTypes
+from litellm.types.services import (
    DEFAULT_SERVICE_CONFIGS,
    ServiceLoggerPayload,
    ServiceMetrics,
    ServiceTypes,
 )
 FAILED_REQUESTS_LABELS = ["error_class", "function_name"]
@ -23,7 +28,8 @@ class PrometheusServicesLogger:
    ):
        try:
            try:
-                from prometheus_client import REGISTRY, Counter, Histogram
+                from prometheus_client import REGISTRY, Counter, Gauge, Histogram
                from prometheus_client.gc_collector import Collector
            except ImportError:
                raise Exception(
                    "Missing prometheus_client. Run `pip install prometheus-client`"
@ -31,36 +37,51 @@ class PrometheusServicesLogger:
            self.Histogram = Histogram
            self.Counter = Counter
            self.Gauge = Gauge
            self.REGISTRY = REGISTRY
            verbose_logger.debug("in init prometheus services metrics")
-            self.services = [item.value for item in ServiceTypes]
+            self.payload_to_prometheus_map: Dict[
                str, List[Union[Histogram, Counter, Gauge, Collector]]
            ] = {}
-            self.payload_to_prometheus_map = (
+            for service in ServiceTypes:
-                {}
+                service_metrics: List[Union[Histogram, Counter, Gauge, Collector]] = []
            )  # store the prometheus histogram/counter we need to call for each field in payload
-            for service in self.services:
+                metrics_to_initialize = self._get_service_metrics_initialize(service)
                histogram = self.create_histogram(service, type_of_request="latency")
                counter_failed_request = self.create_counter(
                    service,
                    type_of_request="failed_requests",
                    additional_labels=FAILED_REQUESTS_LABELS,
                )
                counter_total_requests = self.create_counter(
                    service, type_of_request="total_requests"
                )
                self.payload_to_prometheus_map[service] = [
                    histogram,
                    counter_failed_request,
                    counter_total_requests,
                ]
-            self.prometheus_to_amount_map: dict = (
+                # Initialize only the configured metrics for each service
-                {}
+                if ServiceMetrics.HISTOGRAM in metrics_to_initialize:
-            )  # the field / value in ServiceLoggerPayload the object needs to be incremented by
+                    histogram = self.create_histogram(
                        service.value, type_of_request="latency"
                    )
                    if histogram:
                        service_metrics.append(histogram)
                if ServiceMetrics.COUNTER in metrics_to_initialize:
                    counter_failed_request = self.create_counter(
                        service.value,
                        type_of_request="failed_requests",
                        additional_labels=FAILED_REQUESTS_LABELS,
                    )
                    if counter_failed_request:
                        service_metrics.append(counter_failed_request)
                    counter_total_requests = self.create_counter(
                        service.value, type_of_request="total_requests"
                    )
                    if counter_total_requests:
                        service_metrics.append(counter_total_requests)
                if ServiceMetrics.GAUGE in metrics_to_initialize:
                    gauge = self.create_gauge(service.value, type_of_request="size")
                    if gauge:
                        service_metrics.append(gauge)
                if service_metrics:
                    self.payload_to_prometheus_map[service.value] = service_metrics
            self.prometheus_to_amount_map: dict = {}
            ### MOCK TESTING ###
            self.mock_testing = mock_testing
            self.mock_testing_success_calls = 0
@ -70,6 +91,19 @@ class PrometheusServicesLogger:
            print_verbose(f"Got exception on init prometheus client {str(e)}")
            raise e
    def _get_service_metrics_initialize(
        self, service: ServiceTypes
    ) -> List[ServiceMetrics]:
        DEFAULT_METRICS = [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
        if service not in DEFAULT_SERVICE_CONFIGS:
            return DEFAULT_METRICS
        metrics = DEFAULT_SERVICE_CONFIGS.get(service, {}).get("metrics", [])
        if not metrics:
            verbose_logger.debug(f"No metrics found for service {service}")
            return DEFAULT_METRICS
        return metrics
    def is_metric_registered(self, metric_name) -> bool:
        for metric in self.REGISTRY.collect():
            if metric_name == metric.name:
@ -94,6 +128,15 @@ class PrometheusServicesLogger:
            buckets=LATENCY_BUCKETS,
        )
    def create_gauge(self, service: str, type_of_request: str):
        metric_name = "litellm_{}_{}".format(service, type_of_request)
        is_registered = self.is_metric_registered(metric_name)
        if is_registered:
            return self._get_metric(metric_name)
        return self.Gauge(
            metric_name, "Gauge for {} service".format(service), labelnames=[service]
        )
    def create_counter(
        self,
        service: str,
@ -120,6 +163,15 @@ class PrometheusServicesLogger:
        histogram.labels(labels).observe(amount)
    def update_gauge(
        self,
        gauge,
        labels: str,
        amount: float,
    ):
        assert isinstance(gauge, self.Gauge)
        gauge.labels(labels).set(amount)
    def increment_counter(
        self,
        counter,
@ -190,6 +242,13 @@ class PrometheusServicesLogger:
                        labels=payload.service.value,
                        amount=1,  # LOG TOTAL REQUESTS TO PROMETHEUS
                    )
                elif isinstance(obj, self.Gauge):
                    if payload.event_metadata:
                        self.update_gauge(
                            gauge=obj,
                            labels=payload.event_metadata.get("gauge_labels") or "",
                            amount=payload.event_metadata.get("gauge_value") or 0,
                        )
    async def async_service_failure_hook(
        self,
--- a/litellm/proxy/db/db_transaction_queue/base_update_queue.py
+++ b/litellm/proxy/db/db_transaction_queue/base_update_queue.py
@ -2,8 +2,14 @@
 Base class for in memory buffer for database transactions
 """
 import asyncio
 from typing import Optional
 from litellm._logging import verbose_proxy_logger
 from litellm._service_logger import ServiceLogging
 service_logger_obj = (
    ServiceLogging()
 )  # used for tracking metrics for In memory buffer, redis buffer, pod lock manager
 from litellm.constants import MAX_IN_MEMORY_QUEUE_FLUSH_COUNT, MAX_SIZE_IN_MEMORY_QUEUE
@ -18,6 +24,9 @@ class BaseUpdateQueue:
        """Enqueue an update."""
        verbose_proxy_logger.debug("Adding update to queue: %s", update)
        await self.update_queue.put(update)
        await self._emit_new_item_added_to_queue_event(
            queue_size=self.update_queue.qsize()
        )
    async def flush_all_updates_from_in_memory_queue(self):
        """Get all updates from the queue."""
@ -31,3 +40,10 @@ class BaseUpdateQueue:
                break
            updates.append(await self.update_queue.get())
        return updates
    async def _emit_new_item_added_to_queue_event(
        self,
        queue_size: Optional[int] = None,
    ):
        """placeholder, emit event when a new item is added to the queue"""
        pass
--- a/litellm/proxy/db/db_transaction_queue/daily_spend_update_queue.py
+++ b/litellm/proxy/db/db_transaction_queue/daily_spend_update_queue.py
@ -1,10 +1,14 @@
 import asyncio
 from copy import deepcopy
-from typing import Dict, List
+from typing import Dict, List, Optional
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import DailyUserSpendTransaction
-from litellm.proxy.db.db_transaction_queue.base_update_queue import BaseUpdateQueue
+from litellm.proxy.db.db_transaction_queue.base_update_queue import (
    BaseUpdateQueue,
    service_logger_obj,
 )
 from litellm.types.services import ServiceTypes
 class DailySpendUpdateQueue(BaseUpdateQueue):
@ -117,3 +121,19 @@ class DailySpendUpdateQueue(BaseUpdateQueue):
                else:
                    aggregated_daily_spend_update_transactions[_key] = deepcopy(payload)
        return aggregated_daily_spend_update_transactions
    async def _emit_new_item_added_to_queue_event(
        self,
        queue_size: Optional[int] = None,
    ):
        asyncio.create_task(
            service_logger_obj.async_service_success_hook(
                service=ServiceTypes.IN_MEMORY_DAILY_SPEND_UPDATE_QUEUE,
                duration=0,
                call_type="_emit_new_item_added_to_queue_event",
                event_metadata={
                    "gauge_labels": ServiceTypes.IN_MEMORY_DAILY_SPEND_UPDATE_QUEUE,
                    "gauge_value": queue_size,
                },
            )
        )
--- a/litellm/proxy/db/db_transaction_queue/pod_lock_manager.py
+++ b/litellm/proxy/db/db_transaction_queue/pod_lock_manager.py
@ -1,9 +1,12 @@
 import asyncio
 import uuid
 from typing import TYPE_CHECKING, Any, Optional
 from litellm._logging import verbose_proxy_logger
 from litellm.caching.redis_cache import RedisCache
 from litellm.constants import DEFAULT_CRON_JOB_LOCK_TTL_SECONDS
 from litellm.proxy.db.db_transaction_queue.base_update_queue import service_logger_obj
 from litellm.types.services import ServiceTypes
 if TYPE_CHECKING:
    ProxyLogging = Any
@ -57,6 +60,7 @@ class PodLockManager:
                    self.pod_id,
                    self.cronjob_id,
                )
                return True
            else:
                # Check if the current pod already holds the lock
@ -70,6 +74,7 @@ class PodLockManager:
                            self.pod_id,
                            self.cronjob_id,
                        )
                        self._emit_acquired_lock_event(self.cronjob_id, self.pod_id)
                        return True
            return False
        except Exception as e:
@ -104,6 +109,7 @@ class PodLockManager:
                            self.pod_id,
                            self.cronjob_id,
                        )
                        self._emit_released_lock_event(self.cronjob_id, self.pod_id)
                    else:
                        verbose_proxy_logger.debug(
                            "Pod %s failed to release Redis lock for cronjob_id=%s",
@ -127,3 +133,31 @@ class PodLockManager:
            verbose_proxy_logger.error(
                f"Error releasing Redis lock for {self.cronjob_id}: {e}"
            )
    @staticmethod
    def _emit_acquired_lock_event(cronjob_id: str, pod_id: str):
        asyncio.create_task(
            service_logger_obj.async_service_success_hook(
                service=ServiceTypes.POD_LOCK_MANAGER,
                duration=DEFAULT_CRON_JOB_LOCK_TTL_SECONDS,
                call_type="_emit_acquired_lock_event",
                event_metadata={
                    "gauge_labels": f"{cronjob_id}:{pod_id}",
                    "gauge_value": 1,
                },
            )
        )
    @staticmethod
    def _emit_released_lock_event(cronjob_id: str, pod_id: str):
        asyncio.create_task(
            service_logger_obj.async_service_success_hook(
                service=ServiceTypes.POD_LOCK_MANAGER,
                duration=DEFAULT_CRON_JOB_LOCK_TTL_SECONDS,
                call_type="_emit_released_lock_event",
                event_metadata={
                    "gauge_labels": f"{cronjob_id}:{pod_id}",
                    "gauge_value": 0,
                },
            )
        )
--- a/litellm/proxy/db/db_transaction_queue/redis_update_buffer.py
+++ b/litellm/proxy/db/db_transaction_queue/redis_update_buffer.py
@ -4,6 +4,7 @@ Handles buffering database `UPDATE` transactions in Redis before committing them
 This is to prevent deadlocks and improve reliability
 """
 import asyncio
 import json
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
@ -16,11 +17,13 @@ from litellm.constants import (
 )
 from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
 from litellm.proxy._types import DailyUserSpendTransaction, DBSpendUpdateTransactions
 from litellm.proxy.db.db_transaction_queue.base_update_queue import service_logger_obj
 from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
    DailySpendUpdateQueue,
 )
 from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue
 from litellm.secret_managers.main import str_to_bool
 from litellm.types.services import ServiceTypes
 if TYPE_CHECKING:
    from litellm.proxy.utils import PrismaClient
@ -136,18 +139,27 @@ class RedisUpdateBuffer:
            return
        list_of_transactions = [safe_dumps(db_spend_update_transactions)]
-        await self.redis_cache.async_rpush(
+        current_redis_buffer_size = await self.redis_cache.async_rpush(
            key=REDIS_UPDATE_BUFFER_KEY,
            values=list_of_transactions,
        )
        await self._emit_new_item_added_to_redis_buffer_event(
            queue_size=current_redis_buffer_size,
            service=ServiceTypes.REDIS_SPEND_UPDATE_QUEUE,
        )
        list_of_daily_spend_update_transactions = [
            safe_dumps(daily_spend_update_transactions)
        ]
-        await self.redis_cache.async_rpush(
+
        current_redis_buffer_size = await self.redis_cache.async_rpush(
            key=REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY,
            values=list_of_daily_spend_update_transactions,
        )
        await self._emit_new_item_added_to_redis_buffer_event(
            queue_size=current_redis_buffer_size,
            service=ServiceTypes.REDIS_DAILY_SPEND_UPDATE_QUEUE,
        )
    @staticmethod
    def _number_of_transactions_to_store_in_redis(
@ -300,3 +312,20 @@ class RedisUpdateBuffer:
                        )
        return combined_transaction
    async def _emit_new_item_added_to_redis_buffer_event(
        self,
        service: ServiceTypes,
        queue_size: int,
    ):
        asyncio.create_task(
            service_logger_obj.async_service_success_hook(
                service=service,
                duration=0,
                call_type="_emit_new_item_added_to_queue_event",
                event_metadata={
                    "gauge_labels": service,
                    "gauge_value": queue_size,
                },
            )
        )
--- a/litellm/proxy/db/db_transaction_queue/spend_update_queue.py
+++ b/litellm/proxy/db/db_transaction_queue/spend_update_queue.py
@ -1,5 +1,5 @@
 import asyncio
-from typing import Dict, List
+from typing import Dict, List, Optional
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import (
@ -7,7 +7,11 @@ from litellm.proxy._types import (
    Litellm_EntityType,
    SpendUpdateQueueItem,
 )
-from litellm.proxy.db.db_transaction_queue.base_update_queue import BaseUpdateQueue
+from litellm.proxy.db.db_transaction_queue.base_update_queue import (
    BaseUpdateQueue,
    service_logger_obj,
 )
 from litellm.types.services import ServiceTypes
 class SpendUpdateQueue(BaseUpdateQueue):
@ -203,3 +207,19 @@ class SpendUpdateQueue(BaseUpdateQueue):
            transactions_dict[entity_id] += response_cost or 0
        return db_spend_update_transactions
    async def _emit_new_item_added_to_queue_event(
        self,
        queue_size: Optional[int] = None,
    ):
        asyncio.create_task(
            service_logger_obj.async_service_success_hook(
                service=ServiceTypes.IN_MEMORY_SPEND_UPDATE_QUEUE,
                duration=0,
                call_type="_emit_new_item_added_to_queue_event",
                event_metadata={
                    "gauge_labels": ServiceTypes.IN_MEMORY_SPEND_UPDATE_QUEUE,
                    "gauge_value": queue_size,
                },
            )
        )
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -5,11 +5,6 @@ model_list:
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 general_settings:
  use_redis_transaction_buffer: true
 litellm_settings:
-  cache: True
+  callbacks: ["prometheus"]
-  cache_params:
+  service_callback: ["prometheus_system"]
    type: redis
    supported_call_types: []
--- a/litellm/types/services.py
+++ b/litellm/types/services.py
@ -1,8 +1,15 @@
 import enum
 import uuid
-from typing import Optional
+from typing import List, Optional
 from pydantic import BaseModel, Field
 from typing_extensions import TypedDict
 class ServiceMetrics(enum.Enum):
    COUNTER = "counter"
    HISTOGRAM = "histogram"
    GAUGE = "gauge"
 class ServiceTypes(str, enum.Enum):
@ -18,6 +25,84 @@ class ServiceTypes(str, enum.Enum):
    ROUTER = "router"
    AUTH = "auth"
    PROXY_PRE_CALL = "proxy_pre_call"
    POD_LOCK_MANAGER = "pod_lock_manager"
    """
    Operational metrics for DB Transaction Queues
    """
    # daily spend update queue - actual transaction events
    IN_MEMORY_DAILY_SPEND_UPDATE_QUEUE = "in_memory_daily_spend_update_queue"
    REDIS_DAILY_SPEND_UPDATE_QUEUE = "redis_daily_spend_update_queue"
    # spend update queue - current spend of key, user, team
    IN_MEMORY_SPEND_UPDATE_QUEUE = "in_memory_spend_update_queue"
    REDIS_SPEND_UPDATE_QUEUE = "redis_spend_update_queue"
 class ServiceConfig(TypedDict):
    """
    Configuration for services and their metrics
    """
    metrics: List[ServiceMetrics]  # What metrics this service should support
 """
 Metric types to use for each service
 - REDIS only needs Counter, Histogram
 - Pod Lock Manager only needs a gauge metric
 """
 DEFAULT_SERVICE_CONFIGS = {
    ServiceTypes.REDIS.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.DB.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.BATCH_WRITE_TO_DB.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.RESET_BUDGET_JOB.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.LITELLM.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.ROUTER.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.AUTH.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    ServiceTypes.PROXY_PRE_CALL.value: {
        "metrics": [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
    },
    # Operational metrics for DB Transaction Queues
    ServiceTypes.POD_LOCK_MANAGER.value: {"metrics": [ServiceMetrics.GAUGE]},
    ServiceTypes.IN_MEMORY_DAILY_SPEND_UPDATE_QUEUE.value: {
        "metrics": [ServiceMetrics.GAUGE]
    },
    ServiceTypes.REDIS_DAILY_SPEND_UPDATE_QUEUE.value: {
        "metrics": [ServiceMetrics.GAUGE]
    },
    ServiceTypes.IN_MEMORY_SPEND_UPDATE_QUEUE.value: {
        "metrics": [ServiceMetrics.GAUGE]
    },
    ServiceTypes.REDIS_SPEND_UPDATE_QUEUE.value: {"metrics": [ServiceMetrics.GAUGE]},
 }
 class ServiceEventMetadata(TypedDict, total=False):
    """
    The metadata logged during service success/failure
    Add any extra fields you expect to access in the service_success_hook/service_failure_hook
    """
    # Dynamically control gauge labels and values
    gauge_labels: Optional[str]
    gauge_value: Optional[float]
 class ServiceLoggerPayload(BaseModel):
@ -30,6 +115,9 @@ class ServiceLoggerPayload(BaseModel):
    service: ServiceTypes = Field(description="who is this for? - postgres/redis")
    duration: float = Field(description="How long did the request take?")
    call_type: str = Field(description="The call of the service, being made")
    event_metadata: Optional[dict] = Field(
        description="The metadata logged during service success/failure"
    )
    def to_json(self, **kwargs):
        try:
--- a/tests/litellm/integrations/test_prometheus_services.py
+++ b/tests/litellm/integrations/test_prometheus_services.py
@ -0,0 +1,48 @@
 import json
 import os
 import sys
 from unittest.mock import AsyncMock, patch
 import pytest
 from fastapi.testclient import TestClient
 from litellm.integrations.prometheus_services import (
    PrometheusServicesLogger,
    ServiceMetrics,
    ServiceTypes,
 )
 sys.path.insert(
    0, os.path.abspath("../../..")
 )  # Adds the parent directory to the system path
 def test_create_gauge_new():
    """Test creating a new gauge"""
    pl = PrometheusServicesLogger()
    # Create new gauge
    gauge = pl.create_gauge(service="test_service", type_of_request="size")
    assert gauge is not None
    assert pl._get_metric("litellm_test_service_size") is gauge
 def test_update_gauge():
    """Test updating a gauge's value"""
    pl = PrometheusServicesLogger()
    # Create a gauge to test with
    gauge = pl.create_gauge(service="test_service", type_of_request="size")
    # Mock the labels method to verify it's called correctly
    with patch.object(gauge, "labels") as mock_labels:
        mock_gauge = AsyncMock()
        mock_labels.return_value = mock_gauge
        # Call update_gauge
        pl.update_gauge(gauge=gauge, labels="test_label", amount=42.5)
        # Verify correct methods were called
        mock_labels.assert_called_once_with("test_label")
        mock_gauge.set.assert_called_once_with(42.5)