refactor prom metrics

Ishaan Jaff 2024-08-09 09:02:23 -07:00
parent d382de7b74
commit 408d17dfee
3 changed files with 52 additions and 62 deletions

View file

@@ -136,19 +136,10 @@ class PrometheusLogger:
             "api_provider",
         ]
-        self.deployment_complete_outage = Gauge(
-            "deployment_complete_outage",
-            'Value is "1" when deployment is in cooldown and has had a complete outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_partial_outage = Gauge(
-            "deployment_partial_outage",
-            'Value is "1" when deployment is experiencing a partial outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_healthy = Gauge(
-            "deployment_healthy",
-            'Value is "1" when deployment is in an healthy state',
-            labelnames=_logged_llm_labels,
+        # Metric for deployment state
+        self.deployment_state = Gauge(
+            "deployment_state",
+            "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
+            labelnames=_logged_llm_labels,
         )
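Collapsing three boolean gauges into one enum-style gauge means each deployment's status is a single sample per label set, rather than three gauges that must be kept mutually consistent. A minimal standalone sketch of the same pattern with prometheus_client (metric and label values here are illustrative, not taken from the commit):

    from prometheus_client import Gauge, generate_latest

    # One gauge; the state is encoded in the sample value.
    deployment_state = Gauge(
        "deployment_state",
        "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
        labelnames=["litellm_model_name", "model_id", "api_base", "api_provider"],
    )

    # Mark one (hypothetical) deployment as a complete outage.
    deployment_state.labels(
        "azure/gpt-4", "deployment-1", "https://example.azure.com", "azure"
    ).set(2)

    # The scrape exposition now contains a single deployment_state{...} 2.0 sample.
    print(generate_latest().decode())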
@@ -311,7 +302,7 @@ class PrometheusLogger:
                 litellm_model_name=litellm_model_name,
                 model_id=model_id,
                 api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
             )
             pass
@@ -371,7 +362,7 @@ class PrometheusLogger:
                 litellm_model_name=litellm_model_name,
                 model_id=model_id,
                 api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
             )
         except Exception as e:
             verbose_logger.error(
@@ -381,63 +372,50 @@ class PrometheusLogger:
                 )
             return
 
+    def set_deployment_state(
+        self,
+        state: int,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.deployment_state.labels(
+            litellm_model_name, model_id, api_base, api_provider
+        ).set(state)
+
     def set_deployment_healthy(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
     ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-
-    def set_deployment_complete_outage(
-        self,
-        litellm_model_name: str,
-        model_id: str,
-        api_base: str,
-        llm_provider: str,
-    ):
-        verbose_logger.debug("setting llm outage metric")
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            0, litellm_model_name, model_id, api_base, api_provider
+        )
 
     def set_deployment_partial_outage(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
     ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            1, litellm_model_name, model_id, api_base, api_provider
+        )
+
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.set_deployment_state(
+            2, litellm_model_name, model_id, api_base, api_provider
+        )
 
     def safe_get_remaining_budget(
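After the refactor the three public setters are thin wrappers over one state transition, so adding a new state only requires touching set_deployment_state. A hedged usage sketch; the PrometheusLogger constructor arguments and all label values below are assumed for illustration:

    logger = PrometheusLogger()  # constructor args omitted; assumed default-constructible

    logger.set_deployment_partial_outage(
        litellm_model_name="azure/gpt-4",        # illustrative values
        model_id="deployment-1",
        api_base="https://example.azure.com",
        api_provider="azure",
    )
    # The scrape output now carries one sample for this deployment:
    #   deployment_state{litellm_model_name="azure/gpt-4",...,api_provider="azure"} 1.0
    # so an alert can match on value, e.g. PromQL: deployment_state == 2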

View file

@@ -37,4 +37,5 @@ general_settings:
   master_key: sk-1234
 
 litellm_settings:
-  callbacks: ["otel"] # 👈 KEY CHANGE
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
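When using the litellm SDK directly rather than the proxy's YAML config, the same callbacks can be registered via litellm's module-level callback lists; a sketch, assuming the documented callback API:

    import litellm

    # Mirrors the litellm_settings block above.
    litellm.success_callback = ["prometheus"]
    litellm.failure_callback = ["prometheus"]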

View file

@@ -38,6 +38,17 @@ async def router_cooldown_handler(
     model_info = _deployment["model_info"]
     model_id = model_info.id
 
+    litellm_model_name = temp_litellm_params.get("model")
+    llm_provider = ""
+    try:
+        _, llm_provider, _, _ = litellm.get_llm_provider(
+            model=litellm_model_name,
+            custom_llm_provider=temp_litellm_params.get("custom_llm_provider"),
+        )
+    except:
+        pass
+
     # Trigger cooldown on Prometheus
     from litellm.litellm_core_utils.litellm_logging import prometheusLogger
@@ -45,7 +56,7 @@ async def router_cooldown_handler(
         prometheusLogger.set_deployment_complete_outage(
             litellm_model_name=_model_name,
             model_id=model_id,
-            api_base="",
-            llm_provider="",
+            api_base=_api_base,
+            api_provider=llm_provider,
         )
-    pass
+    return
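The try/except around the lookup is defensive: get_llm_provider raises when it cannot map a model string to a provider, and the handler falls back to the empty string rather than failing the cooldown path. As the destructuring in the hunk shows, the provider name is the second element of the returned 4-tuple. A small hedged example; the model string and printed value are assumptions:

    import litellm

    # get_llm_provider returns a 4-tuple; the provider is the second element.
    _, llm_provider, _, _ = litellm.get_llm_provider(model="azure/gpt-4")
    print(llm_provider)  # expected: "azure" for this (assumed) model string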