From 27e8a890776e3a60ed5f1aa806449b410ae7e23f Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 7 Aug 2024 11:27:05 -0700
Subject: [PATCH] fix logging cool down deployment

---
 litellm/integrations/prometheus.py         | 102 +++++++++++++++------
 litellm/router_utils/cooldown_callbacks.py |   4 +-
 2 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 0c8df96bfb..06ec711862 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -132,9 +132,9 @@ class PrometheusLogger:
             "api_provider",
         ]
 
-        self.deployment_unhealthy = Gauge(
-            "deployment_unhealthy",
-            'Value is "1" when deployment is in an unhealthy state',
+        self.deployment_complete_outage = Gauge(
+            "deployment_complete_outage",
+            'Value is "1" when deployment is in cooldown and has had a complete outage',
             labelnames=_logged_llm_labels,
         )
         self.deployment_partial_outage = Gauge(
@@ -303,34 +303,17 @@ class PrometheusLogger:
            log these labels
            ["litellm_model_name", "model_id", "api_base", "api_provider"]
            """
-            self.deployment_partial_outage.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(1)
-
-            self.deployment_healthy.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(0)
+            self.set_deployment_partial_outage(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
             pass
         except:
             pass
 
-    def set_llm_outage_metric(
-        self,
-        litellm_model_name: str,
-        model_id: str,
-        api_base: str,
-        llm_provider: str,
-    ):
-        """
-        log these labels
-        ["litellm_model_name", "model_id", "api_base", "api_provider"]
-        """
-        self.deployment_unhealthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        pass
-
     def set_llm_deployment_success_metrics(self, request_kwargs: dict):
         try:
             verbose_logger.debug("setting remaining tokens requests metric")
@@ -380,9 +363,12 @@ class PrometheusLogger:
            log these labels
            ["litellm_model_name", "model_id", "api_base", "api_provider"]
            """
-            self.deployment_healthy.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(1)
+            self.set_deployment_healthy(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
         except Exception as e:
             verbose_logger.error(
                 "Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format(
@@ -391,6 +377,64 @@ class PrometheusLogger:
             )
             return
 
+    def set_deployment_healthy(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        verbose_logger.debug("setting llm outage metric")
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+    def set_deployment_partial_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
 
 def safe_get_remaining_budget(
     max_budget: Optional[float], spend: Optional[float]
diff --git a/litellm/router_utils/cooldown_callbacks.py b/litellm/router_utils/cooldown_callbacks.py
index 00e89274bc..3a5213ec03 100644
--- a/litellm/router_utils/cooldown_callbacks.py
+++ b/litellm/router_utils/cooldown_callbacks.py
@@ -42,10 +42,10 @@ async def router_cooldown_handler(
     from litellm.litellm_core_utils.litellm_logging import prometheusLogger
 
     if prometheusLogger is not None:
-        prometheusLogger.set_llm_outage_metric(
+        prometheusLogger.set_deployment_complete_outage(
             litellm_model_name=_model_name,
             model_id=model_id,
             api_base="",
-            api_provider="",
+            llm_provider="",
         )
     pass
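
Review note: after this patch the three deployment gauges always move together, so a given label set is in exactly one of the healthy / partial_outage / complete_outage states at any time. Below is a minimal standalone sketch of that pattern, using prometheus_client directly rather than litellm's PrometheusLogger. The metric names and labelnames mirror the patch; the descriptions on the healthy/partial gauges, the set_state() helper, and the sample label values are assumptions for illustration, not part of this change.

# three_state_gauges.py - standalone sketch, assumes `pip install prometheus_client`
from prometheus_client import Gauge, generate_latest

# Label set matches _logged_llm_labels in litellm/integrations/prometheus.py.
_logged_llm_labels = ["litellm_model_name", "model_id", "api_base", "api_provider"]

deployment_healthy = Gauge(
    "deployment_healthy",
    'Value is "1" when deployment is healthy',  # description assumed, not in the diff
    labelnames=_logged_llm_labels,
)
deployment_partial_outage = Gauge(
    "deployment_partial_outage",
    'Value is "1" when deployment has a partial outage',  # description assumed
    labelnames=_logged_llm_labels,
)
deployment_complete_outage = Gauge(
    "deployment_complete_outage",
    'Value is "1" when deployment is in cooldown and has had a complete outage',
    labelnames=_logged_llm_labels,
)


def set_state(state: str, *label_values: str) -> None:
    # Illustrative helper: sets exactly one gauge to 1 for the given label set
    # and resets the other two to 0, mirroring set_deployment_healthy /
    # set_deployment_partial_outage / set_deployment_complete_outage above.
    for gauge, name in (
        (deployment_healthy, "healthy"),
        (deployment_partial_outage, "partial"),
        (deployment_complete_outage, "complete"),
    ):
        gauge.labels(*label_values).set(1 if state == name else 0)


if __name__ == "__main__":
    # Hypothetical label values; a router cooldown maps to the "complete" state.
    set_state("complete", "gpt-3.5-turbo", "model-1", "https://api.openai.com", "openai")
    print(generate_latest().decode())

One observable consequence of the cooldown_callbacks.py hunk: router_cooldown_handler passes api_base="" and llm_provider="", so deployment_complete_outage series emitted from cooldowns will carry empty strings for those two labels.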