From 408d17dfee440be8dc9a6ec321e0cfcbb8b0992a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 9 Aug 2024 09:02:23 -0700 Subject: [PATCH] refactor prom metrics --- litellm/integrations/prometheus.py | 94 +++++++++------------- litellm/proxy/proxy_config.yaml | 3 +- litellm/router_utils/cooldown_callbacks.py | 17 +++- 3 files changed, 52 insertions(+), 62 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 61f4ff02a6..6160e4d33e 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -136,19 +136,10 @@ class PrometheusLogger: "api_provider", ] - self.deployment_complete_outage = Gauge( - "deployment_complete_outage", - 'Value is "1" when deployment is in cooldown and has had a complete outage', - labelnames=_logged_llm_labels, - ) - self.deployment_partial_outage = Gauge( - "deployment_partial_outage", - 'Value is "1" when deployment is experiencing a partial outage', - labelnames=_logged_llm_labels, - ) - self.deployment_healthy = Gauge( - "deployment_healthy", - 'Value is "1" when deployment is in an healthy state', + # Metric for deployment state + self.deployment_state = Gauge( + "deployment_state", + "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage", labelnames=_logged_llm_labels, ) @@ -311,7 +302,7 @@ class PrometheusLogger: litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, - llm_provider=llm_provider, + api_provider=llm_provider, ) pass @@ -371,7 +362,7 @@ class PrometheusLogger: litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, - llm_provider=llm_provider, + api_provider=llm_provider, ) except Exception as e: verbose_logger.error( @@ -381,63 +372,50 @@ class PrometheusLogger: ) return + def set_deployment_state( + self, + state: int, + litellm_model_name: str, + model_id: str, + api_base: str, + api_provider: str, + ): + self.deployment_state.labels( + litellm_model_name, model_id, api_base, api_provider + ).set(state) + def set_deployment_healthy( self, litellm_model_name: str, model_id: str, api_base: str, - llm_provider: str, + api_provider: str, ): - self.deployment_complete_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) - - self.deployment_partial_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) - - self.deployment_healthy.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(1) - - def set_deployment_complete_outage( - self, - litellm_model_name: str, - model_id: str, - api_base: str, - llm_provider: str, - ): - verbose_logger.debug("setting llm outage metric") - self.deployment_complete_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(1) - - self.deployment_partial_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) - - self.deployment_healthy.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) + self.set_deployment_state( + 0, litellm_model_name, model_id, api_base, api_provider + ) def set_deployment_partial_outage( self, litellm_model_name: str, model_id: str, api_base: str, - llm_provider: str, + api_provider: str, ): - self.deployment_complete_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) + self.set_deployment_state( + 1, litellm_model_name, model_id, api_base, api_provider + ) - self.deployment_partial_outage.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(1) - - self.deployment_healthy.labels( - litellm_model_name, model_id, api_base, llm_provider - ).set(0) + def set_deployment_complete_outage( + self, + litellm_model_name: str, + model_id: str, + api_base: str, + api_provider: str, + ): + self.set_deployment_state( + 2, litellm_model_name, model_id, api_base, api_provider + ) def safe_get_remaining_budget( diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index b0a5a073e6..5af3b9360b 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -37,4 +37,5 @@ general_settings: master_key: sk-1234 litellm_settings: - callbacks: ["otel"] # 👈 KEY CHANGE + success_callback: ["prometheus"] + failure_callback: ["prometheus"] diff --git a/litellm/router_utils/cooldown_callbacks.py b/litellm/router_utils/cooldown_callbacks.py index 3a5213ec03..6610750478 100644 --- a/litellm/router_utils/cooldown_callbacks.py +++ b/litellm/router_utils/cooldown_callbacks.py @@ -38,6 +38,17 @@ async def router_cooldown_handler( model_info = _deployment["model_info"] model_id = model_info.id + litellm_model_name = temp_litellm_params.get("model") + llm_provider = "" + try: + + _, llm_provider, _, _ = litellm.get_llm_provider( + model=litellm_model_name, + custom_llm_provider=temp_litellm_params.get("custom_llm_provider"), + ) + except: + pass + # Trigger cooldown on Prometheus from litellm.litellm_core_utils.litellm_logging import prometheusLogger @@ -45,7 +56,7 @@ async def router_cooldown_handler( prometheusLogger.set_deployment_complete_outage( litellm_model_name=_model_name, model_id=model_id, - api_base="", - llm_provider="", + api_base=_api_base, + api_provider=llm_provider, ) - pass + return