Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-27 03:34:10 +00:00)
refactor prom metrics

Commit 408d17dfee (parent d382de7b74)
3 changed files with 52 additions and 62 deletions
@@ -136,19 +136,10 @@ class PrometheusLogger:
             "api_provider",
         ]

-        self.deployment_complete_outage = Gauge(
-            "deployment_complete_outage",
-            'Value is "1" when deployment is in cooldown and has had a complete outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_partial_outage = Gauge(
-            "deployment_partial_outage",
-            'Value is "1" when deployment is experiencing a partial outage',
-            labelnames=_logged_llm_labels,
-        )
-        self.deployment_healthy = Gauge(
-            "deployment_healthy",
-            'Value is "1" when deployment is in an healthy state',
-            labelnames=_logged_llm_labels,
-        )
+        # Metric for deployment state
+        self.deployment_state = Gauge(
+            "deployment_state",
+            "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
+            labelnames=_logged_llm_labels,
+        )
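The hunk above collapses three boolean gauges into a single labeled gauge whose value encodes the deployment state. A minimal standalone sketch of that pattern with prometheus_client (label values are illustrative, not litellm's actual wiring):

from prometheus_client import Gauge

# Labels identifying a deployment, mirroring _logged_llm_labels in the diff.
LABELS = ["litellm_model_name", "model_id", "api_base", "api_provider"]

# One gauge whose value encodes the state:
# 0 = healthy, 1 = partial outage, 2 = complete outage.
deployment_state = Gauge(
    "deployment_state",
    "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
    labelnames=LABELS,
)

# Setting the state for one deployment is a single call instead of three.
deployment_state.labels(
    "gpt-3.5-turbo", "model-123", "https://api.openai.com", "openai"
).set(2)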
@@ -311,7 +302,7 @@ class PrometheusLogger:
                litellm_model_name=litellm_model_name,
                model_id=model_id,
                api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
            )

            pass
@@ -371,7 +362,7 @@ class PrometheusLogger:
                litellm_model_name=litellm_model_name,
                model_id=model_id,
                api_base=api_base,
-                llm_provider=llm_provider,
+                api_provider=llm_provider,
            )
        except Exception as e:
            verbose_logger.error(
@@ -381,63 +372,50 @@ class PrometheusLogger:
            )
            return

+    def set_deployment_state(
+        self,
+        state: int,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.deployment_state.labels(
+            litellm_model_name, model_id, api_base, api_provider
+        ).set(state)
+
     def set_deployment_healthy(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
     ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-
-    def set_deployment_complete_outage(
-        self,
-        litellm_model_name: str,
-        model_id: str,
-        api_base: str,
-        llm_provider: str,
-    ):
-        verbose_logger.debug("setting llm outage metric")
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            0, litellm_model_name, model_id, api_base, api_provider
+        )

     def set_deployment_partial_outage(
         self,
         litellm_model_name: str,
         model_id: str,
         api_base: str,
-        llm_provider: str,
+        api_provider: str,
     ):
-        self.deployment_complete_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
-
-        self.deployment_partial_outage.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(1)
-
-        self.deployment_healthy.labels(
-            litellm_model_name, model_id, api_base, llm_provider
-        ).set(0)
+        self.set_deployment_state(
+            1, litellm_model_name, model_id, api_base, api_provider
+        )
+
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+    ):
+        self.set_deployment_state(
+            2, litellm_model_name, model_id, api_base, api_provider
+        )


 def safe_get_remaining_budget(
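Continuing the standalone sketch above, one hedged way to check the consolidated metric is to read it back from the default registry; the mapping healthy = 0, partial outage = 1, complete outage = 2 follows the wrapper methods introduced in this hunk:

from prometheus_client import REGISTRY

labels = {
    "litellm_model_name": "gpt-3.5-turbo",
    "model_id": "model-123",
    "api_base": "https://api.openai.com",
    "api_provider": "openai",
}

# The wrappers in the diff map to fixed values:
# set_deployment_healthy -> 0, set_deployment_partial_outage -> 1,
# set_deployment_complete_outage -> 2.
deployment_state.labels(*labels.values()).set(1)

# Read the current value back from the default registry for this label set.
assert REGISTRY.get_sample_value("deployment_state", labels) == 1.0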
@@ -37,4 +37,5 @@ general_settings:
   master_key: sk-1234

 litellm_settings:
-  callbacks: ["otel"] # 👈 KEY CHANGE
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
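The proxy config now routes success and failure events to the Prometheus logger. A hedged sketch of the equivalent programmatic setting in the Python SDK, assuming the same "prometheus" string callback used in the config:

import litellm

# Programmatic equivalent of the proxy config change above: send success and
# failure events to the Prometheus integration.
litellm.success_callback = ["prometheus"]
litellm.failure_callback = ["prometheus"]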
@@ -38,6 +38,17 @@ async def router_cooldown_handler(
     model_info = _deployment["model_info"]
     model_id = model_info.id

+    litellm_model_name = temp_litellm_params.get("model")
+    llm_provider = ""
+    try:
+
+        _, llm_provider, _, _ = litellm.get_llm_provider(
+            model=litellm_model_name,
+            custom_llm_provider=temp_litellm_params.get("custom_llm_provider"),
+        )
+    except:
+        pass
+
     # Trigger cooldown on Prometheus
     from litellm.litellm_core_utils.litellm_logging import prometheusLogger

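The handler now resolves the provider with litellm.get_llm_provider before logging the outage. A small hedged sketch of that resolution in isolation; the model string and the expected printed value are illustrative assumptions:

import litellm

# The second element of the returned 4-tuple is the resolved provider name
# (e.g. "azure" for an "azure/..." model string).
litellm_model_name = "azure/my-deployment"
try:
    _, llm_provider, _, _ = litellm.get_llm_provider(model=litellm_model_name)
except Exception:
    llm_provider = ""

print(llm_provider)  # expected: "azure"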
@@ -45,7 +56,7 @@ async def router_cooldown_handler(
         prometheusLogger.set_deployment_complete_outage(
             litellm_model_name=_model_name,
             model_id=model_id,
-            api_base="",
-            llm_provider="",
+            api_base=_api_base,
+            api_provider=llm_provider,
         )
-    pass
+    return