Merge branch 'main' into litellm_fix_httpx_transport

Krish Dholakia 2024-07-02 17:17:43 -07:00 committed by GitHub
commit d38f01e956
189 changed files with 8377 additions and 1087 deletions


@@ -106,7 +106,9 @@ class Router:
def __init__(
self,
model_list: Optional[List[Union[DeploymentTypedDict, Dict]]] = None,
model_list: Optional[
Union[List[DeploymentTypedDict], List[Dict[str, Any]]]
] = None,
## ASSISTANTS API ##
assistants_config: Optional[AssistantsTypedDict] = None,
## CACHING ##
@@ -155,6 +157,7 @@ class Router:
cooldown_time: Optional[
float
] = None, # (seconds) time to cooldown a deployment after failure
disable_cooldowns: Optional[bool] = None,
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
@@ -306,6 +309,7 @@ class Router:
self.allowed_fails = allowed_fails or litellm.allowed_fails
self.cooldown_time = cooldown_time or 60
self.disable_cooldowns = disable_cooldowns
self.failed_calls = (
InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
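The new flag is plumbed straight from the constructor onto the instance, so opting out of cooldowns is a single constructor argument. A minimal sketch of a caller using it (the one-entry model_list below is illustrative, not taken from this commit):

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "gpt-3.5-turbo"},
            }
        ],
        disable_cooldowns=True,  # new in this change: failing deployments are never benched
        cooldown_time=30,        # per the early return below, this has no effect once cooldowns are off
    )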
@@ -2989,6 +2993,8 @@ class Router:
the exception is not one that should be immediately retried (e.g. 401)
"""
if self.disable_cooldowns is True:
return
if deployment is None:
return
@@ -3029,24 +3035,50 @@ class Router:
exception_status = 500
_should_retry = litellm._should_retry(status_code=exception_status)
if updated_fails > allowed_fails or _should_retry == False:
if updated_fails > allowed_fails or _should_retry is False:
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
cached_value = self.cache.get_cache(key=cooldown_key)
cached_value = self.cache.get_cache(
key=cooldown_key
) # [(deployment_id, {last_error_str, last_error_status_code})]
cached_value_deployment_ids = []
if (
cached_value is not None
and isinstance(cached_value, list)
and len(cached_value) > 0
and isinstance(cached_value[0], tuple)
):
cached_value_deployment_ids = [cv[0] for cv in cached_value]
verbose_router_logger.debug(f"adding {deployment} to cooldown models")
# update value
try:
if deployment in cached_value:
if cached_value is not None and len(cached_value_deployment_ids) > 0:
if deployment in cached_value_deployment_ids:
pass
else:
cached_value = cached_value + [deployment]
cached_value = cached_value + [
(
deployment,
{
"Exception Received": str(original_exception),
"Status Code": str(exception_status),
},
)
]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
except:
cached_value = [deployment]
else:
cached_value = [
(
deployment,
{
"Exception Received": str(original_exception),
"Status Code": str(exception_status),
},
)
]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
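With this change the per-minute cooldown cache no longer stores bare deployment ids; each entry is a (deployment_id, debug_dict) tuple. A rough sketch of one cache slot and of how the id list is recovered from it (the values are made up):

    # Illustrative shape of the value stored under f"{current_minute}:cooldown_models".
    cached_value = [
        (
            "1234-azure-chatgpt-v-2",
            {"Exception Received": "AuthenticationError: invalid api key", "Status Code": "401"},
        ),
    ]

    # Callers that only need the ids strip the debug payload, mirroring the router code above.
    cached_value_deployment_ids = [cv[0] for cv in cached_value]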
@@ -3062,7 +3094,33 @@ class Router:
key=deployment, value=updated_fails, ttl=cooldown_time
)
async def _async_get_cooldown_deployments(self):
async def _async_get_cooldown_deployments(self) -> List[str]:
"""
Async implementation of '_get_cooldown_deployments'
"""
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models"
# ----------------------
# Return cooldown models
# ----------------------
cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
cached_value_deployment_ids = []
if (
cooldown_models is not None
and isinstance(cooldown_models, list)
and len(cooldown_models) > 0
and isinstance(cooldown_models[0], tuple)
):
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
return cached_value_deployment_ids
async def _async_get_cooldown_deployments_with_debug_info(self) -> List[tuple]:
"""
Async implementation of '_get_cooldown_deployments'
"""
@@ -3079,7 +3137,7 @@ class Router:
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
return cooldown_models
def _get_cooldown_deployments(self):
def _get_cooldown_deployments(self) -> List[str]:
"""
Get the list of models being cooled down for this minute
"""
@@ -3093,8 +3151,17 @@ class Router:
# ----------------------
cooldown_models = self.cache.get_cache(key=cooldown_key) or []
cached_value_deployment_ids = []
if (
cooldown_models is not None
and isinstance(cooldown_models, list)
and len(cooldown_models) > 0
and isinstance(cooldown_models[0], tuple)
):
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
return cooldown_models
return cached_value_deployment_ids
def _get_healthy_deployments(self, model: str):
_all_deployments: list = []
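Both cooldown getters now return plain deployment-id strings, which keeps downstream filtering simple. A hedged sketch of the kind of filtering a caller such as _get_healthy_deployments can do with that list (the helper below is an assumption for illustration, not code from this file):

    def _filter_cooled_down(deployments: list, cooldown_ids: list) -> list:
        # Keep only deployments whose id is not currently cooling down.
        return [
            d for d in deployments
            if d.get("model_info", {}).get("id", "") not in cooldown_ids
        ]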
@@ -3969,16 +4036,36 @@ class Router:
Augment litellm info with additional params set in `model_info`.
For azure models, ignore the `model:`. Only set max tokens, cost values if base_model is set.
Returns
- ModelInfo - If found -> typed dict with max tokens, input cost, etc.
Raises:
- ValueError -> If model is not mapped yet
"""
## SET MODEL NAME
## GET BASE MODEL
base_model = deployment.get("model_info", {}).get("base_model", None)
if base_model is None:
base_model = deployment.get("litellm_params", {}).get("base_model", None)
model = base_model or deployment.get("litellm_params", {}).get("model", None)
## GET LITELLM MODEL INFO
model = base_model
## GET PROVIDER
_model, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=deployment.get("litellm_params", {}).get("model", ""),
litellm_params=LiteLLM_Params(**deployment.get("litellm_params", {})),
)
## SET MODEL TO 'model=' - if base_model is None + not azure
if custom_llm_provider == "azure" and base_model is None:
verbose_router_logger.error(
"Could not identify azure model. Set azure 'base_model' for accurate max tokens, cost tracking, etc.- https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models"
)
elif custom_llm_provider != "azure":
model = _model
## GET LITELLM MODEL INFO - raises exception, if model is not mapped
model_info = litellm.get_model_info(model=model)
## CHECK USER SET MODEL INFO
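The new error log asks Azure users to set 'base_model' so get_model_info() can resolve max tokens and cost. A hedged example of a deployment entry that satisfies that check (field layout follows the cost-tracking docs linked in the log message; the specific values are illustrative):

    azure_deployment = {
        "model_name": "gpt-4-prod",
        "litellm_params": {
            "model": "azure/my-gpt4-deployment",  # Azure deployment name, not a mapped model
            "api_base": "https://example-resource.openai.azure.com/",
            "api_key": "os.environ/AZURE_API_KEY",
        },
        "model_info": {
            # assumed key per the docs link above: lets the router look up tokens/cost
            "base_model": "azure/gpt-4-1106-preview",
        },
    }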
@@ -4364,7 +4451,7 @@ class Router:
"""
Filter out model in model group, if:
- model context window < message length
- model context window < message length. For azure openai models, requires 'base_model' is set. - https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models
- filter models above rpm limits
- if region given, filter out models not in that region / unknown region
- [TODO] function call and model doesn't support function calling
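These filters only run when pre-call checks are turned on for the router. A minimal sketch of enabling them (the model_list variable is assumed to be defined elsewhere):

    router = Router(
        model_list=model_list,
        enable_pre_call_checks=True,  # filter by context window, rpm limits, and region
    )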
@@ -4381,6 +4468,11 @@ class Router:
try:
input_tokens = litellm.token_counter(messages=messages)
except Exception as e:
verbose_router_logger.error(
"litellm.router.py::_pre_call_checks: failed to count tokens. Returning initial list of deployments. Got - {}".format(
str(e)
)
)
return _returned_deployments
_context_window_error = False
@@ -4424,7 +4516,7 @@ class Router:
)
continue
except Exception as e:
verbose_router_logger.debug("An error occurs - {}".format(str(e)))
verbose_router_logger.error("An error occurs - {}".format(str(e)))
_litellm_params = deployment.get("litellm_params", {})
model_id = deployment.get("model_info", {}).get("id", "")
@@ -4685,7 +4777,7 @@ class Router:
if _allowed_model_region is None:
_allowed_model_region = "n/a"
raise ValueError(
f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}"
f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}, cooldown_list={await self._async_get_cooldown_deployments_with_debug_info()}"
)
if (