Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 10:44:24 +00:00)
Merge branch 'main' into litellm_fix_httpx_transport

Commit: d38f01e956
189 changed files with 8377 additions and 1087 deletions
@@ -106,7 +106,9 @@ class Router:
 
     def __init__(
         self,
-        model_list: Optional[List[Union[DeploymentTypedDict, Dict]]] = None,
+        model_list: Optional[
+            Union[List[DeploymentTypedDict], List[Dict[str, Any]]]
+        ] = None,
         ## ASSISTANTS API ##
         assistants_config: Optional[AssistantsTypedDict] = None,
         ## CACHING ##
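Note: the widened annotation accepts either typed-dict deployments or plain dicts. A minimal sketch of the plain-dict shape, with placeholder model names and keys (not part of the commit):

    from litellm import Router

    # Plain-dict entries; a List[DeploymentTypedDict] with the same keys is
    # equally valid under the new Optional[Union[...]] annotation.
    model_list = [
        {
            "model_name": "gpt-3.5-turbo",  # routing group name (placeholder)
            "litellm_params": {
                "model": "azure/my-azure-deployment",  # placeholder deployment
                "api_key": "sk-placeholder",
            },
        },
    ]

    router = Router(model_list=model_list)
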
@@ -155,6 +157,7 @@ class Router:
         cooldown_time: Optional[
             float
         ] = None,  # (seconds) time to cooldown a deployment after failure
+        disable_cooldowns: Optional[bool] = None,
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
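Note: a hedged sketch of how the new disable_cooldowns flag sits next to the existing cooldown_time parameter; the model entry is a placeholder:

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",  # placeholder
                "litellm_params": {"model": "openai/gpt-3.5-turbo"},
            }
        ],
        cooldown_time=30,        # seconds a failing deployment stays benched
        disable_cooldowns=True,  # new flag from this diff: skip cooldowns entirely
    )
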
@@ -306,6 +309,7 @@ class Router:
 
         self.allowed_fails = allowed_fails or litellm.allowed_fails
         self.cooldown_time = cooldown_time or 60
+        self.disable_cooldowns = disable_cooldowns
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
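Note: the comment above describes per-deployment failure counting inside a one-minute window. A standalone toy version of that bookkeeping, using a plain dict and timestamps rather than litellm's InMemoryCache:

    import time

    failed_calls: dict = {}  # deployment_id -> list of failure timestamps

    def record_failure(deployment_id: str, allowed_fails: int = 3) -> bool:
        """Return True when the deployment should be cooled down."""
        now = time.time()
        window = [t for t in failed_calls.get(deployment_id, []) if now - t < 60]
        window.append(now)
        failed_calls[deployment_id] = window
        return len(window) > allowed_fails
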
@@ -2989,6 +2993,8 @@ class Router:
 
         the exception is not one that should be immediately retried (e.g. 401)
         """
+        if self.disable_cooldowns is True:
+            return
 
         if deployment is None:
             return
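Note: per the docstring, non-retryable statuses (e.g. 401) send a deployment straight to cooldown, while retryable ones only do so after too many failures in the same minute. A quick way to probe which statuses litellm treats as retryable, using the litellm._should_retry helper that appears later in this diff:

    import litellm

    for status in (401, 429, 500):
        # Prints whether the router would consider this status worth retrying.
        print(status, litellm._should_retry(status_code=status))
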
@@ -3029,24 +3035,50 @@ class Router:
                 exception_status = 500
         _should_retry = litellm._should_retry(status_code=exception_status)
 
-        if updated_fails > allowed_fails or _should_retry == False:
+        if updated_fails > allowed_fails or _should_retry is False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
-            cached_value = self.cache.get_cache(key=cooldown_key)
+            cached_value = self.cache.get_cache(
+                key=cooldown_key
+            )  # [(deployment_id, {last_error_str, last_error_status_code})]
+
+            cached_value_deployment_ids = []
+            if (
+                cached_value is not None
+                and isinstance(cached_value, list)
+                and len(cached_value) > 0
+                and isinstance(cached_value[0], tuple)
+            ):
+                cached_value_deployment_ids = [cv[0] for cv in cached_value]
 
             verbose_router_logger.debug(f"adding {deployment} to cooldown models")
             # update value
-            try:
-                if deployment in cached_value:
+            if cached_value is not None and len(cached_value_deployment_ids) > 0:
+                if deployment in cached_value_deployment_ids:
                     pass
                 else:
-                    cached_value = cached_value + [deployment]
+                    cached_value = cached_value + [
+                        (
+                            deployment,
+                            {
+                                "Exception Received": str(original_exception),
+                                "Status Code": str(exception_status),
+                            },
+                        )
+                    ]
                     # save updated value
                     self.cache.set_cache(
                         value=cached_value, key=cooldown_key, ttl=cooldown_time
                     )
-            except:
-                cached_value = [deployment]
+            else:
+                cached_value = [
+                    (
+                        deployment,
+                        {
+                            "Exception Received": str(original_exception),
+                            "Status Code": str(exception_status),
+                        },
+                    )
+                ]
                 # save updated value
                 self.cache.set_cache(
                     value=cached_value, key=cooldown_key, ttl=cooldown_time
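Note: for reference, a sketch of a per-minute cooldown entry under the new tuple format and the id extraction the router performs on it; the deployment id and error text are made up:

    # Stored under a per-minute key such as "14-05:cooldown_models".
    cached_value = [
        (
            "azure-gpt-35-deployment-1",  # placeholder deployment id
            {"Exception Received": "AuthenticationError", "Status Code": "401"},
        ),
    ]

    # Same extraction as above, before any membership check against the list.
    cached_value_deployment_ids = [cv[0] for cv in cached_value]
    print(cached_value_deployment_ids)  # ['azure-gpt-35-deployment-1']
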
@@ -3062,7 +3094,33 @@ class Router:
                 key=deployment, value=updated_fails, ttl=cooldown_time
             )
 
-    async def _async_get_cooldown_deployments(self):
+    async def _async_get_cooldown_deployments(self) -> List[str]:
         """
         Async implementation of '_get_cooldown_deployments'
         """
+        dt = get_utc_datetime()
+        current_minute = dt.strftime("%H-%M")
+        # get the current cooldown list for that minute
+        cooldown_key = f"{current_minute}:cooldown_models"
+
+        # ----------------------
+        # Return cooldown models
+        # ----------------------
+        cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
+
+        cached_value_deployment_ids = []
+        if (
+            cooldown_models is not None
+            and isinstance(cooldown_models, list)
+            and len(cooldown_models) > 0
+            and isinstance(cooldown_models[0], tuple)
+        ):
+            cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
+        verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
+        return cached_value_deployment_ids
+
+    async def _async_get_cooldown_deployments_with_debug_info(self) -> List[tuple]:
+        """
+        Async implementation of '_get_cooldown_deployments'
+        """
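Note: a hedged usage sketch of the split between the id-only getter and the new debug-info getter. Both are private Router helpers, so this is illustrative rather than a public API:

    import asyncio
    from litellm import Router

    async def dump_cooldowns(router: Router) -> None:
        ids = await router._async_get_cooldown_deployments()  # List[str] of deployment ids
        details = await router._async_get_cooldown_deployments_with_debug_info()  # List[tuple]
        print("cooled down:", ids)
        print("with debug info:", details)

    # asyncio.run(dump_cooldowns(router))  # given an already configured Router
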
@@ -3079,7 +3137,7 @@ class Router:
         verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
         return cooldown_models
 
-    def _get_cooldown_deployments(self):
+    def _get_cooldown_deployments(self) -> List[str]:
         """
         Get the list of models being cooled down for this minute
         """
@@ -3093,8 +3151,17 @@ class Router:
         # ----------------------
         cooldown_models = self.cache.get_cache(key=cooldown_key) or []
 
+        cached_value_deployment_ids = []
+        if (
+            cooldown_models is not None
+            and isinstance(cooldown_models, list)
+            and len(cooldown_models) > 0
+            and isinstance(cooldown_models[0], tuple)
+        ):
+            cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
         verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
-        return cooldown_models
+        return cached_value_deployment_ids
 
     def _get_healthy_deployments(self, model: str):
         _all_deployments: list = []
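Note: the isinstance guard means only tuple-format entries contribute ids; an entry written in the old plain-string format simply yields an empty list. A standalone sketch of that behavior (not router code):

    def extract_ids(cooldown_models: list) -> list:
        # Mirrors the guard above: only a non-empty list of tuples is unpacked.
        if (
            cooldown_models is not None
            and isinstance(cooldown_models, list)
            and len(cooldown_models) > 0
            and isinstance(cooldown_models[0], tuple)
        ):
            return [cv[0] for cv in cooldown_models]
        return []

    print(extract_ids([("deployment-1", {"Status Code": "429"})]))  # ['deployment-1']
    print(extract_ids(["deployment-1"]))  # [] - old-format entries are ignored
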
@@ -3969,16 +4036,36 @@ class Router:
 
         Augment litellm info with additional params set in `model_info`.
 
+        For azure models, ignore the `model:`. Only set max tokens, cost values if base_model is set.
+
         Returns
         - ModelInfo - If found -> typed dict with max tokens, input cost, etc.
+        Raises:
+        - ValueError -> If model is not mapped yet
         """
-        ## SET MODEL NAME
+        ## GET BASE MODEL
         base_model = deployment.get("model_info", {}).get("base_model", None)
         if base_model is None:
             base_model = deployment.get("litellm_params", {}).get("base_model", None)
-        model = base_model or deployment.get("litellm_params", {}).get("model", None)
 
         ## GET LITELLM MODEL INFO
+        model = base_model
+
+        ## GET PROVIDER
+        _model, custom_llm_provider, _, _ = litellm.get_llm_provider(
+            model=deployment.get("litellm_params", {}).get("model", ""),
+            litellm_params=LiteLLM_Params(**deployment.get("litellm_params", {})),
+        )
+
+        ## SET MODEL TO 'model=' - if base_model is None + not azure
+        if custom_llm_provider == "azure" and base_model is None:
+            verbose_router_logger.error(
+                "Could not identify azure model. Set azure 'base_model' for accurate max tokens, cost tracking, etc.- https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models"
+            )
+        elif custom_llm_provider != "azure":
+            model = _model
+
+        ## GET LITELLM MODEL INFO - raises exception, if model is not mapped
         model_info = litellm.get_model_info(model=model)
 
         ## CHECK USER SET MODEL INFO
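Note: a hedged config sketch showing where base_model lives so the lookup above can resolve an azure deployment; every value below is a placeholder:

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-4",  # placeholder group name
                "litellm_params": {
                    "model": "azure/my-gpt4-deployment",
                    "api_base": "https://example.openai.azure.com",
                    "api_key": "azure-key-placeholder",
                },
                # Without this, the azure branch above logs an error and max
                # tokens / cost values cannot be derived from the deployment name.
                "model_info": {"base_model": "azure/gpt-4-1106-preview"},
            }
        ]
    )
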
@@ -4364,7 +4451,7 @@ class Router:
         """
         Filter out model in model group, if:
 
-        - model context window < message length
+        - model context window < message length. For azure openai models, requires 'base_model' is set. - https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models
         - filter models above rpm limits
         - if region given, filter out models not in that region / unknown region
         - [TODO] function call and model doesn't support function calling
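Note: a short sketch of turning on the pre-call checks this docstring describes; enable_pre_call_checks is an existing Router flag, the model entry is a placeholder:

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "openai/gpt-3.5-turbo"},  # placeholder
            }
        ],
        enable_pre_call_checks=True,  # filter deployments by context window, rpm, region
    )
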
@@ -4381,6 +4468,11 @@ class Router:
         try:
             input_tokens = litellm.token_counter(messages=messages)
         except Exception as e:
+            verbose_router_logger.error(
+                "litellm.router.py::_pre_call_checks: failed to count tokens. Returning initial list of deployments. Got - {}".format(
+                    str(e)
+                )
+            )
             return _returned_deployments
 
         _context_window_error = False
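Note: the new log line only fires when token counting itself fails; for reference, a minimal call to the same litellm.token_counter helper with a made-up message list:

    import litellm

    messages = [{"role": "user", "content": "Hello, how are you?"}]  # placeholder
    print(litellm.token_counter(model="gpt-3.5-turbo", messages=messages))
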
@@ -4424,7 +4516,7 @@ class Router:
                     )
                     continue
             except Exception as e:
-                verbose_router_logger.debug("An error occurs - {}".format(str(e)))
+                verbose_router_logger.error("An error occurs - {}".format(str(e)))
 
             _litellm_params = deployment.get("litellm_params", {})
             model_id = deployment.get("model_info", {}).get("id", "")
@@ -4685,7 +4777,7 @@ class Router:
             if _allowed_model_region is None:
                 _allowed_model_region = "n/a"
             raise ValueError(
-                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}"
+                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}, cooldown_list={await self._async_get_cooldown_deployments_with_debug_info()}"
             )
 
         if (
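Note: roughly where the enriched message surfaces. When no healthy deployment remains, the router raises, and the cooldown list with per-deployment debug info is now embedded in the error text. Exact behavior depends on retry and fallback settings; the deployment below is a placeholder with a deliberately bad key:

    import asyncio
    from litellm import Router

    async def main() -> None:
        router = Router(
            model_list=[
                {
                    "model_name": "gpt-3.5-turbo",
                    "litellm_params": {"model": "openai/gpt-3.5-turbo", "api_key": "bad-key"},
                }
            ],
        )
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "hi"}],
            )
        except Exception as e:
            # A no-deployments-available ValueError now carries
            # cooldown_list=[(deployment_id, {...})] in its message.
            print(e)

    # asyncio.run(main())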
|
Loading…
Add table
Add a link
Reference in a new issue