fix logging cool down deployment

Ishaan Jaff 2024-08-07 11:27:05 -07:00
parent 0dd8f50477
commit 27e8a89077
2 changed files with 75 additions and 31 deletions

View file

@@ -132,9 +132,9 @@ class PrometheusLogger:
             "api_provider",
         ]
-        self.deployment_unhealthy = Gauge(
-            "deployment_unhealthy",
-            'Value is "1" when deployment is in an unhealthy state',
+        self.deployment_complete_outage = Gauge(
+            "deployment_complete_outage",
+            'Value is "1" when deployment is in cooldown and has had a complete outage',
             labelnames=_logged_llm_labels,
         )
         self.deployment_partial_outage = Gauge(
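
For readers unfamiliar with prometheus_client, here is a minimal standalone sketch of what the renamed gauge looks like in isolation (label values are hypothetical). Note that renaming the metric changes the series name scrapers see, so any dashboards querying deployment_unhealthy would need updating:

    from prometheus_client import Gauge

    _logged_llm_labels = ["litellm_model_name", "model_id", "api_base", "api_provider"]

    deployment_complete_outage = Gauge(
        "deployment_complete_outage",
        'Value is "1" when deployment is in cooldown and has had a complete outage',
        labelnames=_logged_llm_labels,
    )

    # Each distinct label combination is its own child series in the scrape output:
    # deployment_complete_outage{litellm_model_name="gpt-4",model_id="m-1",...} 1.0
    deployment_complete_outage.labels("gpt-4", "m-1", "https://example.com", "openai").set(1)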
@@ -303,34 +303,17 @@ class PrometheusLogger:
         log these labels
         ["litellm_model_name", "model_id", "api_base", "api_provider"]
         """
-            self.deployment_partial_outage.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(1)
-
-            self.deployment_healthy.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(0)
+            self.set_deployment_partial_outage(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
             pass
         except:
             pass
-
-    def set_llm_outage_metric(
-        self,
-        litellm_model_name: str,
-        model_id: str,
-        api_base: str,
-        llm_provider: str,
-    ):
-        """
-        log these labels
-        ["litellm_model_name", "model_id", "api_base", "api_provider"]
-        """
-        self.deployment_unhealthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-        pass
 
     def set_llm_deployment_success_metrics(self, request_kwargs: dict):
         try:
             verbose_logger.debug("setting remaining tokens requests metric")
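
The deleted set_llm_outage_metric wrote only deployment_unhealthy, and the failure path above wrote only two of the three gauges, so the exported states could disagree. A small sketch reproducing that stale-gauge problem (gauge names from this diff, label values hypothetical):

    from prometheus_client import Gauge

    labels = ["litellm_model_name", "model_id", "api_base", "api_provider"]
    healthy = Gauge("deployment_healthy", "1 when healthy", labelnames=labels)
    partial = Gauge("deployment_partial_outage", "1 when partially down", labelnames=labels)
    outage = Gauge("deployment_complete_outage", "1 when fully down", labelnames=labels)
    vals = ("gpt-4", "m-1", "https://example.com", "openai")

    # Old behaviour: each writer touched only a subset of the gauges...
    healthy.labels(*vals).set(1)   # deployment marked healthy earlier
    outage.labels(*vals).set(1)    # old outage path never cleared the others

    # ...so a scrape now reports healthy=1 AND complete_outage=1 for the same
    # deployment. The helpers introduced below write all three gauges per transition.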
@@ -380,9 +363,12 @@ class PrometheusLogger:
         log these labels
         ["litellm_model_name", "model_id", "api_base", "api_provider"]
         """
-            self.deployment_healthy.labels(
-                litellm_model_name, model_id, api_base, llm_provider
-            ).set(1)
+            self.set_deployment_healthy(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
         except Exception as e:
             verbose_logger.error(
                 "Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format(
@@ -391,6 +377,64 @@ class PrometheusLogger:
             )
         return
 
+    def set_deployment_healthy(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        verbose_logger.debug("setting llm outage metric")
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+    def set_deployment_partial_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
 
 def safe_get_remaining_budget(
     max_budget: Optional[float], spend: Optional[float]
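
The three helpers implement a one-hot state machine over the gauges: every transition writes all three, so exactly one is 1 per label set at any time. A condensed sketch of that pattern (class name hypothetical, not litellm's API):

    from prometheus_client import Gauge

    class DeploymentStateGauges:
        """One gauge per state; each transition rewrites all of them."""

        _LABELS = ["litellm_model_name", "model_id", "api_base", "api_provider"]
        _STATES = ("deployment_healthy", "deployment_partial_outage", "deployment_complete_outage")

        def __init__(self):
            self._gauges = {
                name: Gauge(name, f'Value is "1" when state is {name}', labelnames=self._LABELS)
                for name in self._STATES
            }

        def set_state(self, state: str, *label_values: str) -> None:
            # Writing every gauge keeps the trio consistent even after repeated,
            # out-of-order transitions for the same deployment.
            for name, gauge in self._gauges.items():
                gauge.labels(*label_values).set(1 if name == state else 0)

A single gauge with a state label would be another option, but keeping three 0/1 gauges presumably leaves existing queries on deployment_healthy and deployment_partial_outage working unchanged.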

View file

@@ -42,10 +42,10 @@ async def router_cooldown_handler(
     from litellm.litellm_core_utils.litellm_logging import prometheusLogger
 
     if prometheusLogger is not None:
-        prometheusLogger.set_llm_outage_metric(
+        prometheusLogger.set_deployment_complete_outage(
             litellm_model_name=_model_name,
             model_id=model_id,
             api_base="",
-            api_provider="",
+            llm_provider="",
         )
     pass
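
One consequence of the call above: the cooldown path passes empty api_base and llm_provider, while the success path labels the same deployment with real values, so the two paths write to different child series. A small sketch of the divergence (label values hypothetical):

    from prometheus_client import Gauge

    g = Gauge(
        "deployment_complete_outage",
        'Value is "1" when deployment is in cooldown and has had a complete outage',
        labelnames=["litellm_model_name", "model_id", "api_base", "api_provider"],
    )

    g.labels("gpt-4", "m-1", "https://example.com", "openai").set(0)  # success-path labels
    g.labels("gpt-4", "m-1", "", "").set(1)                           # cooldown-path labels

    # Two distinct children now exist for the same model_id; queries that group by
    # all four labels will not line these up as one series.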