[Fix-Router] Don't cooldown when only 1 deployment exists (#5673)

* fix get model list
* fix test custom callback router
* fix embedding fallback test
* fix router retry policy on AuthErrors
* fix router test
* add test for single deployments no cooldown test prod
* add test test_single_deployment_no_cooldowns_test_prod_mock_completion_calls
2025-04-25 18:54:30 +00:00 · 2024-09-12 19:14:58 -07:00 · 2024-09-12 19:14:58 -07:00 · 19a06d7842
commit 19a06d7842
parent 13ba22d6fd
4 changed files with 128 additions and 17 deletions
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1130,7 +1130,7 @@ class Router:
        make_request = False

        while curr_time < end_time:
-            _healthy_deployments = await self._async_get_healthy_deployments(
+            _healthy_deployments, _ = await self._async_get_healthy_deployments(
                model=model
            )
            make_request = await self.scheduler.poll(  ## POLL QUEUE ## - returns 'True' if there's healthy deployments OR if request is at top of queue
@@ -3060,14 +3060,17 @@ class Router:
            Retry Logic
             
            """
-            _healthy_deployments = await self._async_get_healthy_deployments(
-                model=kwargs.get("model") or "",
+            _healthy_deployments, _all_deployments = (
+                await self._async_get_healthy_deployments(
+                    model=kwargs.get("model") or "",
+                )
            )

            # raises an exception if this error should not be retries
            self.should_retry_this_error(
                error=e,
                healthy_deployments=_healthy_deployments,
+                all_deployments=_all_deployments,
                context_window_fallbacks=context_window_fallbacks,
                regular_fallbacks=fallbacks,
                content_policy_fallbacks=content_policy_fallbacks,
@@ -3114,7 +3117,7 @@ class Router:
                    ## LOGGING
                    kwargs = self.log_retry(kwargs=kwargs, e=e)
                    remaining_retries = num_retries - current_attempt
-                    _healthy_deployments = await self._async_get_healthy_deployments(
+                    _healthy_deployments, _ = await self._async_get_healthy_deployments(
                        model=kwargs.get("model"),
                    )
                    _timeout = self._time_to_sleep_before_retry(
@@ -3135,6 +3138,7 @@ class Router:
        self,
        error: Exception,
        healthy_deployments: Optional[List] = None,
+        all_deployments: Optional[List] = None,
        context_window_fallbacks: Optional[List] = None,
        content_policy_fallbacks: Optional[List] = None,
        regular_fallbacks: Optional[List] = None,
@@ -3150,6 +3154,9 @@ class Router:
        _num_healthy_deployments = 0
        if healthy_deployments is not None and isinstance(healthy_deployments, list):
            _num_healthy_deployments = len(healthy_deployments)
+        _num_all_deployments = 0
+        if all_deployments is not None and isinstance(all_deployments, list):
+            _num_all_deployments = len(all_deployments)

        ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR / CONTENT POLICY VIOLATION ERROR w/ fallbacks available / Bad Request Error
        if (
@@ -3180,7 +3187,9 @@ class Router:
            - if other deployments available -> retry
            - else -> raise error
            """
-            if _num_healthy_deployments <= 0:  # if no healthy deployments
+            if (
+                _num_all_deployments <= 1
+            ):  # if there is only 1 deployment for this model group then don't retry
                raise error  # then raise error

        # Do not retry if there are no healthy deployments
@@ -3390,7 +3399,7 @@ class Router:
            current_attempt = None
            original_exception = e
            ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
-            _healthy_deployments = self._get_healthy_deployments(
+            _healthy_deployments, _all_deployments = self._get_healthy_deployments(
                model=kwargs.get("model"),
            )

@@ -3398,6 +3407,7 @@ class Router:
            self.should_retry_this_error(
                error=e,
                healthy_deployments=_healthy_deployments,
+                all_deployments=_all_deployments,
                context_window_fallbacks=context_window_fallbacks,
                regular_fallbacks=fallbacks,
                content_policy_fallbacks=content_policy_fallbacks,
@@ -3428,7 +3438,7 @@ class Router:
                except Exception as e:
                    ## LOGGING
                    kwargs = self.log_retry(kwargs=kwargs, e=e)
-                    _healthy_deployments = self._get_healthy_deployments(
+                    _healthy_deployments, _ = self._get_healthy_deployments(
                        model=kwargs.get("model"),
                    )
                    remaining_retries = num_retries - current_attempt
@@ -3881,7 +3891,7 @@ class Router:
            else:
                healthy_deployments.append(deployment)

-        return healthy_deployments
+        return healthy_deployments, _all_deployments

    async def _async_get_healthy_deployments(self, model: str):
        _all_deployments: list = []
@@ -3901,7 +3911,7 @@ class Router:
                continue
            else:
                healthy_deployments.append(deployment)
-        return healthy_deployments
+        return healthy_deployments, _all_deployments

    def routing_strategy_pre_call_checks(self, deployment: dict):
        """
@@ -4679,10 +4689,7 @@ class Router:
                returned_models += self.model_list

                return returned_models
-
-            for model in self.model_list:
-                returned_models.extend(self._get_all_deployments(model_name=model_name))
-
+            returned_models.extend(self._get_all_deployments(model_name=model_name))
            return returned_models
        return None