forked from phoenix/litellm-mirror
use router_cooldown_handler
This commit is contained in:
parent
99ecde7744
commit
d1e519afd1
2 changed files with 60 additions and 40 deletions
|
@ -57,6 +57,7 @@ from litellm.router_utils.client_initalization_utils import (
|
||||||
set_client,
|
set_client,
|
||||||
should_initialize_sync_client,
|
should_initialize_sync_client,
|
||||||
)
|
)
|
||||||
|
from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
|
||||||
from litellm.router_utils.handle_error import send_llm_exception_alert
|
from litellm.router_utils.handle_error import send_llm_exception_alert
|
||||||
from litellm.scheduler import FlowItem, Scheduler
|
from litellm.scheduler import FlowItem, Scheduler
|
||||||
from litellm.types.llms.openai import (
|
from litellm.types.llms.openai import (
|
||||||
|
@ -3294,11 +3295,15 @@ class Router:
|
||||||
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||||
)
|
)
|
||||||
|
|
||||||
self.send_deployment_cooldown_alert(
|
# Trigger cooldown handler
|
||||||
|
asyncio.create_task(
|
||||||
|
router_cooldown_handler(
|
||||||
|
litellm_router_instance=self,
|
||||||
deployment_id=deployment,
|
deployment_id=deployment,
|
||||||
exception_status=exception_status,
|
exception_status=exception_status,
|
||||||
cooldown_time=cooldown_time,
|
cooldown_time=cooldown_time,
|
||||||
)
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self.failed_calls.set_cache(
|
self.failed_calls.set_cache(
|
||||||
key=deployment, value=updated_fails, ttl=cooldown_time
|
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||||
|
@ -4948,42 +4953,6 @@ class Router:
|
||||||
)
|
)
|
||||||
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
|
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
|
||||||
|
|
||||||
def send_deployment_cooldown_alert(
    self,
    deployment_id: str,
    exception_status: Union[str, int],
    cooldown_time: float,
):
    """
    Best-effort hook fired when a deployment is placed in cooldown.

    Resolves the deployment's metadata (model name, api base) when Slack
    alerting is configured on the proxy. The actual Slack send is disabled
    (it was commented out), so this currently performs lookups only.

    Args:
        deployment_id: id of the deployment being cooled down.
        exception_status: HTTP status (or string) of the triggering exception.
        cooldown_time: cooldown duration in seconds.

    Never raises: any failure is swallowed so alerting cannot break the
    cooldown path itself.
    """
    try:
        # imported lazily to avoid a circular import with the proxy server
        from litellm.proxy.proxy_server import proxy_logging_obj

        # only do work when slack alerting is actually configured
        if (
            proxy_logging_obj is not None
            and proxy_logging_obj.alerting is not None
            and "slack" in proxy_logging_obj.alerting
        ):
            _deployment = self.get_deployment(model_id=deployment_id)
            if _deployment is None:
                return

            _litellm_params = _deployment["litellm_params"]
            # deepcopy so get_api_base cannot mutate the live router config
            temp_litellm_params = copy.deepcopy(_litellm_params)
            temp_litellm_params = dict(temp_litellm_params)
            _model_name = _deployment.get("model_name", None)
            _api_base = litellm.get_api_base(
                model=_model_name, optional_params=temp_litellm_params
            )
            # NOTE(review): the slack send_alert call was commented out
            # upstream; _api_base/_model_name are resolved but unused until
            # alerting is re-enabled (see router_cooldown_handler).
    except Exception:
        # deliberate best-effort: alerting must never interrupt cooldown
        pass
|
|
||||||
|
|
||||||
def set_custom_routing_strategy(
|
def set_custom_routing_strategy(
|
||||||
self, CustomRoutingStrategy: CustomRoutingStrategyBase
|
self, CustomRoutingStrategy: CustomRoutingStrategyBase
|
||||||
):
|
):
|
||||||
|
|
51
litellm/router_utils/cooldown_callbacks.py
Normal file
51
litellm/router_utils/cooldown_callbacks.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
"""
|
||||||
|
Callbacks triggered on cooling down deployments
|
||||||
|
"""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
from typing import TYPE_CHECKING, Any, Union
|
||||||
|
|
||||||
|
import litellm
|
||||||
|
from litellm._logging import verbose_logger
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from litellm.router import Router as _Router
|
||||||
|
|
||||||
|
LitellmRouter = _Router
|
||||||
|
else:
|
||||||
|
LitellmRouter = Any
|
||||||
|
|
||||||
|
|
||||||
|
async def router_cooldown_handler(
    litellm_router_instance: LitellmRouter,
    deployment_id: str,
    exception_status: Union[str, int],
    cooldown_time: float,
):
    """
    Callback triggered when a router deployment is placed in cooldown.

    Looks up the deployment on the router and, if found, emits an outage
    metric to Prometheus (when a Prometheus logger is configured).

    Args:
        litellm_router_instance: the Router that cooled the deployment down.
        deployment_id: id of the deployment being cooled down.
        exception_status: HTTP status (or string) of the triggering exception.
        cooldown_time: cooldown duration in seconds.
            (exception_status / cooldown_time are currently unused here —
            presumably kept for future alerting; confirm before removing.)
    """
    _deployment = litellm_router_instance.get_deployment(model_id=deployment_id)
    if _deployment is None:
        verbose_logger.warning(
            f"in router_cooldown_handler but _deployment is None for deployment_id={deployment_id}. Doing nothing"
        )
        return

    _litellm_params = _deployment["litellm_params"]
    # deepcopy so get_api_base cannot mutate the live router config
    temp_litellm_params = copy.deepcopy(_litellm_params)
    temp_litellm_params = dict(temp_litellm_params)
    _model_name = _deployment.get("model_name", None)
    _api_base = litellm.get_api_base(
        model=_model_name, optional_params=temp_litellm_params
    )
    model_info = _deployment["model_info"]
    model_id = model_info.id

    # Trigger cooldown on Prometheus
    # imported lazily to avoid import cycles with the logging module
    from litellm.litellm_core_utils.litellm_logging import prometheusLogger

    if prometheusLogger is not None:
        prometheusLogger.set_llm_outage_metric(
            litellm_model_name=_model_name,
            model_id=model_id,
            # NOTE(review): _api_base is computed above but an empty string is
            # passed here — confirm whether _api_base should be forwarded.
            api_base="",
            api_provider="",
        )
|
Loading…
Add table
Add a link
Reference in a new issue