refactor prom metrics

Ishaan Jaff 2024-08-09 09:02:23 -07:00
parent d382de7b74
commit 408d17dfee
3 changed files with 52 additions and 62 deletions

View file

@@ -136,19 +136,10 @@ class PrometheusLogger:
             "api_provider",
         ]
-        self.deployment_complete_outage = Gauge(
-            "deployment_complete_outage",
-            'Value is "1" when deployment is in cooldown and has had a complete outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_partial_outage = Gauge(
-            "deployment_partial_outage",
-            'Value is "1" when deployment is experiencing a partial outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_healthy = Gauge(
-            "deployment_healthy",
-            'Value is "1" when deployment is in an healthy state',
+        # Metric for deployment state
+        self.deployment_state = Gauge(
+            "deployment_state",
+            "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
             labelnames=_logged_llm_labels,
         )
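
The refactor collapses the three boolean-style gauges into one labelled gauge whose value encodes the deployment's state. A minimal, self-contained sketch of the same pattern using prometheus_client (the label names mirror the ones above; the label values are made up for illustration):

from prometheus_client import CollectorRegistry, Gauge

registry = CollectorRegistry()
deployment_state = Gauge(
    "deployment_state",
    "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
    labelnames=["litellm_model_name", "model_id", "api_base", "api_provider"],
    registry=registry,
)

# One time series per deployment; the value moves between 0, 1 and 2
# instead of flipping three separate 0/1 gauges.
deployment_state.labels(
    "gpt-3.5-turbo", "model-123", "https://api.openai.com", "openai"
).set(1)  # partial outage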
@@ -311,7 +302,7 @@ class PrometheusLogger:
                 litellm_model_name=litellm_model_name,
                 model_id=model_id,
                 api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
             )
             pass
@@ -371,7 +362,7 @@ class PrometheusLogger:
                 litellm_model_name=litellm_model_name,
                 model_id=model_id,
                 api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
             )
         except Exception as e:
             verbose_logger.error(
@@ -381,63 +372,50 @@ class PrometheusLogger:
             )
             return
+    def set_deployment_state(
+        self,
+        state: int,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.deployment_state.labels(
+            litellm_model_name, model_id, api_base, api_provider
+        ).set(state)
     def set_deployment_healthy(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
     ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-    def set_deployment_complete_outage(
-        self,
-        litellm_model_name: str,
-        model_id: str,
-        api_base: str,
-        llm_provider: str,
-    ):
-        verbose_logger.debug("setting llm outage metric")
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            0, litellm_model_name, model_id, api_base, api_provider
+        )
     def set_deployment_partial_outage(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
    ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            1, litellm_model_name, model_id, api_base, api_provider
+        )
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.set_deployment_state(
+            2, litellm_model_name, model_id, api_base, api_provider
+        )
     def safe_get_remaining_budget(
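
With the state consolidated, the healthy/partial/complete helpers become thin wrappers around set_deployment_state, and the current value can be read back in a test. A hedged sketch, assuming a PrometheusLogger has been instantiated in-process and registered the gauge on prometheus_client's default registry (the label values are placeholders):

from prometheus_client import REGISTRY

# get_sample_value returns the current value of the matching time series,
# or None if that label combination has never been set.
state = REGISTRY.get_sample_value(
    "deployment_state",
    {
        "litellm_model_name": "gpt-3.5-turbo",
        "model_id": "model-123",
        "api_base": "https://api.openai.com",
        "api_provider": "openai",
    },
)
print(state)  # None, 0.0, 1.0 or 2.0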

View file

@@ -37,4 +37,5 @@ general_settings:
   master_key: sk-1234
 litellm_settings:
   callbacks: ["otel"] # 👈 KEY CHANGE
   success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
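
The YAML change enables the Prometheus callback for both successful and failed calls on the proxy. When using the litellm SDK directly, the equivalent (a hedged sketch using litellm's standard callback lists) is:

import litellm

# Send success and failure events to the built-in "prometheus" logger,
# matching the proxy config change above.
litellm.success_callback = ["prometheus"]
litellm.failure_callback = ["prometheus"]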

View file

@@ -38,6 +38,17 @@ async def router_cooldown_handler(
     model_info = _deployment["model_info"]
     model_id = model_info.id
     litellm_model_name = temp_litellm_params.get("model")
+    llm_provider = ""
+    try:
+        _, llm_provider, _, _ = litellm.get_llm_provider(
+            model=litellm_model_name,
+            custom_llm_provider=temp_litellm_params.get("custom_llm_provider"),
+        )
+    except:
+        pass
     # Trigger cooldown on Prometheus
     from litellm.litellm_core_utils.litellm_logging import prometheusLogger
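
The handler now resolves the provider from the model name, so the outage metric is labelled with a real provider instead of an empty string. litellm.get_llm_provider returns a 4-tuple of (model, custom_llm_provider, dynamic_api_key, api_base); an illustrative sketch (the deployment name is made up):

import litellm

# Only the provider is needed for the metric label; the other tuple
# members are ignored here.
model, llm_provider, _api_key, _api_base = litellm.get_llm_provider(
    model="azure/my-gpt-4o-deployment"
)
print(llm_provider)  # -> "azure"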
@@ -45,7 +56,7 @@ async def router_cooldown_handler(
         prometheusLogger.set_deployment_complete_outage(
             litellm_model_name=_model_name,
             model_id=model_id,
-            api_base="",
-            llm_provider="",
+            api_base=_api_base,
+            api_provider=llm_provider,
         )
     pass
     return
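
Once a cooldown fires, the new state is visible on the scrape endpoint as a single deployment_state sample for that deployment. A hedged sketch for dumping the exposition text with prometheus_client (the sample line in the comment is illustrative):

from prometheus_client import REGISTRY, generate_latest

# After a complete-outage cooldown, a line similar to this should appear:
#   deployment_state{litellm_model_name="...",model_id="...",api_base="...",api_provider="azure"} 2.0
print(generate_latest(REGISTRY).decode("utf-8"))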