Merge branch 'main' into litellm_standardize_slack_exception_msg_format
commit 8413fdf4c7
4 changed files with 81 additions and 30 deletions
@@ -864,6 +864,7 @@ Model Info:
     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
         """Log deployment latency"""
+        try:
             if "daily_reports" in self.alert_types:
                 model_id = (
                     kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
@@ -875,7 +876,10 @@ Model Info:
 
             if isinstance(response_obj, litellm.ModelResponse):
                 completion_tokens = response_obj.usage.completion_tokens
-                final_value = float(response_s.total_seconds() / completion_tokens)
+                if completion_tokens is not None and completion_tokens > 0:
+                    final_value = float(
+                        response_s.total_seconds() / completion_tokens
+                    )
 
             await self.async_update_daily_reports(
                 DeploymentMetrics(
@@ -885,6 +889,12 @@ Model Info:
                     updated_at=litellm.utils.get_utc_datetime(),
                 )
             )
+        except Exception as e:
+            verbose_proxy_logger.error(
+                "[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ",
+                e,
+            )
+            pass
 
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
         """Log failure + deployment latency"""
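The guard above matters because `completion_tokens` can be `None` or `0` (for example, an empty or aborted stream), in which case the old unconditional division raised `ZeroDivisionError` or `TypeError` inside the success callback. A minimal standalone sketch of the same guard, not the litellm source and with hypothetical names:

```python
from datetime import timedelta
from typing import Optional

def seconds_per_token(
    response_s: timedelta, completion_tokens: Optional[int]
) -> Optional[float]:
    """Return seconds per output token, or None when it cannot be computed."""
    # Guard: None or 0 completion tokens means the metric is undefined,
    # so skip it instead of raising inside a logging callback.
    if completion_tokens is not None and completion_tokens > 0:
        return float(response_s.total_seconds() / completion_tokens)
    return None

print(seconds_per_token(timedelta(seconds=2), 100))  # 0.02
print(seconds_per_token(timedelta(seconds=2), 0))    # None
```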
@@ -131,7 +131,13 @@ class ProxyLogging:
                 alerting_args=alerting_args,
             )
-            if "daily_reports" in self.alert_types:
+            if (
+                self.alerting is not None
+                and "slack" in self.alerting
+                and "daily_reports" in self.alert_types
+            ):
+                # NOTE: ENSURE we only add callbacks when alerting is on
+                # We should NOT add callbacks when alerting is off
                 litellm.callbacks.append(self.slack_alerting_instance)  # type: ignore
 
         if redis_cache is not None:
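The rewritten condition is order-sensitive: `"slack" in self.alerting` would raise `TypeError` when `self.alerting` is `None`, so the `None` check has to short-circuit first. A standalone sketch of the check, with hypothetical names:

```python
from typing import List, Optional

def should_register_slack_callback(
    alerting: Optional[List[str]], alert_types: List[str]
) -> bool:
    # Order matters: `in` on None raises TypeError, so `and` must
    # short-circuit on the None check before membership tests run.
    return (
        alerting is not None
        and "slack" in alerting
        and "daily_reports" in alert_types
    )

assert should_register_slack_callback(None, ["daily_reports"]) is False
assert should_register_slack_callback(["slack"], ["daily_reports"]) is True
```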
@@ -1923,10 +1923,28 @@ class Router:
             metadata = kwargs.get("litellm_params", {}).get("metadata", None)
             _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
 
+            exception_response = getattr(exception, "response", {})
+            exception_headers = getattr(exception_response, "headers", None)
+            _time_to_cooldown = self.cooldown_time
+
+            if exception_headers is not None:
+
+                _time_to_cooldown = (
+                    litellm.utils._get_retry_after_from_exception_header(
+                        response_headers=exception_headers
+                    )
+                )
+
+                if _time_to_cooldown < 0:
+                    # if the response headers did not read it -> set to default cooldown time
+                    _time_to_cooldown = self.cooldown_time
+
             if isinstance(_model_info, dict):
                 deployment_id = _model_info.get("id", None)
                 self._set_cooldown_deployments(
-                    exception_status=exception_status, deployment=deployment_id
+                    exception_status=exception_status,
+                    deployment=deployment_id,
+                    time_to_cooldown=_time_to_cooldown,
                 )  # setting deployment_id in cooldown deployments
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
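This hunk makes the router prefer a provider-supplied `Retry-After` value over the configured default when deciding how long to cool a deployment down, falling back to `self.cooldown_time` when the header is missing or unparseable (signalled by `-1`). A minimal sketch of that fallback logic, not the litellm source and with hypothetical names:

```python
from typing import Optional
import httpx

def derive_cooldown(headers: Optional[httpx.Headers], default: float) -> float:
    """Prefer the provider's Retry-After header; fall back to the default."""
    if headers is None:
        return default
    try:
        retry_after = int(headers.get("retry-after", ""))
    except ValueError:
        retry_after = -1  # missing or unparseable header
    return retry_after if retry_after >= 0 else default

assert derive_cooldown(httpx.Headers({"retry-after": "30"}), 1.0) == 30
assert derive_cooldown(httpx.Headers({}), 1.0) == 1.0
```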
@@ -2026,7 +2044,10 @@ class Router:
         return True
 
     def _set_cooldown_deployments(
-        self, exception_status: Union[str, int], deployment: Optional[str] = None
+        self,
+        exception_status: Union[str, int],
+        deployment: Optional[str] = None,
+        time_to_cooldown: Optional[float] = None,
     ):
         """
         Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
@@ -2053,6 +2074,8 @@ class Router:
                 f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
             )
             cooldown_time = self.cooldown_time or 1
+            if time_to_cooldown is not None:
+                cooldown_time = time_to_cooldown
 
             if isinstance(exception_status, str):
                 try:
@@ -2090,7 +2113,9 @@ class Router:
                 )
 
                 self.send_deployment_cooldown_alert(
-                    deployment_id=deployment, exception_status=exception_status
+                    deployment_id=deployment,
+                    exception_status=exception_status,
+                    cooldown_time=cooldown_time,
                 )
         else:
             self.failed_calls.set_cache(
@@ -3751,7 +3776,10 @@ class Router:
         print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n")  # noqa
 
     def send_deployment_cooldown_alert(
-        self, deployment_id: str, exception_status: Union[str, int]
+        self,
+        deployment_id: str,
+        exception_status: Union[str, int],
+        cooldown_time: float,
     ):
         try:
             from litellm.proxy.proxy_server import proxy_logging_obj
@@ -3775,7 +3803,7 @@ class Router:
                 )
                 asyncio.create_task(
                     proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{self.cooldown_time}` seconds\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
+                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                         alert_type="cooldown_deployment",
                         level="Low",
                     )
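The remaining router hunks thread the derived cooldown through `_set_cooldown_deployments` and into `send_deployment_cooldown_alert`, and fix the Slack message to report the effective cooldown instead of always printing the router-wide default `self.cooldown_time`. A toy sketch of that threading, with hypothetical names:

```python
from typing import Optional

class MiniRouter:
    cooldown_time: float = 1.0  # router-wide default

    def set_cooldown(self, time_to_cooldown: Optional[float] = None) -> None:
        cooldown_time = self.cooldown_time or 1
        if time_to_cooldown is not None:
            cooldown_time = time_to_cooldown  # header-derived override wins
        self.send_alert(cooldown_time=cooldown_time)

    def send_alert(self, cooldown_time: float) -> None:
        # The alert now reports the per-deployment value, not the default.
        print(f"Cooldown Time: `{cooldown_time} seconds`")

MiniRouter().set_cooldown(time_to_cooldown=30.0)  # -> Cooldown Time: `30.0 seconds`
```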
@@ -8071,11 +8071,8 @@ def _should_retry(status_code: int):
     return False
 
 
-def _calculate_retry_after(
-    remaining_retries: int,
-    max_retries: int,
+def _get_retry_after_from_exception_header(
     response_headers: Optional[httpx.Headers] = None,
-    min_timeout: int = 0,
 ):
     """
     Reimplementation of openai's calculate retry after, since that one can't be imported.
@@ -8101,10 +8098,20 @@ def _calculate_retry_after(
                     retry_after = int(retry_date - time.time())
         else:
             retry_after = -1
+        return retry_after
 
-    except Exception:
+    except Exception as e:
         retry_after = -1
 
 
+def _calculate_retry_after(
+    remaining_retries: int,
+    max_retries: int,
+    response_headers: Optional[httpx.Headers] = None,
+    min_timeout: int = 0,
+):
+    retry_after = _get_retry_after_from_exception_header(response_headers)
+
     # If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says.
     if 0 < retry_after <= 60:
         return retry_after
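This change extracts the header-parsing half of `_calculate_retry_after` into `_get_retry_after_from_exception_header`, so the router can reuse it without the retry-count arguments. A `Retry-After` header may carry either integer seconds or an HTTP-date; a standalone sketch of both branches, not the litellm source:

```python
import email.utils
import time
from typing import Optional

def parse_retry_after(value: Optional[str]) -> int:
    """Return the wait in seconds, or -1 when the header is missing/invalid."""
    if value is None:
        return -1
    try:
        return int(value)  # e.g. "Retry-After: 120"
    except ValueError:
        pass
    # e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT"
    retry_date_tuple = email.utils.parsedate_tz(value)
    if retry_date_tuple is None:
        return -1
    retry_date = email.utils.mktime_tz(retry_date_tuple)
    return int(retry_date - time.time())

assert parse_retry_after("120") == 120
assert parse_retry_after(None) == -1
assert parse_retry_after("not-a-date") == -1
```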