diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index a0d03bdf80..c1148fc75b 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -6,11 +6,11 @@ model_list:
       vertex_project: "adroit-crow-413218"
       vertex_location: "us-central1"
       vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
-  - model_name: fake-openai-endpoint
+  - model_name: fake-azure-endpoint
     litellm_params:
-      model: openai/fake
+      model: openai/429
       api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      api_base: https://exampleopenaiendpoint-production.up.railway.app
 
 general_settings:
   master_key: sk-1234
diff --git a/litellm/router.py b/litellm/router.py
index fcfc92fd08..8f603c5616 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1130,7 +1130,7 @@ class Router:
         make_request = False
 
         while curr_time < end_time:
-            _healthy_deployments = await self._async_get_healthy_deployments(
+            _healthy_deployments, _ = await self._async_get_healthy_deployments(
                 model=model
             )
             make_request = await self.scheduler.poll(  ## POLL QUEUE ## - returns 'True' if there's healthy deployments OR if request is at top of queue
@@ -3060,14 +3060,17 @@ class Router:
 
             Retry Logic
             """
-            _healthy_deployments = await self._async_get_healthy_deployments(
-                model=kwargs.get("model") or "",
+            _healthy_deployments, _all_deployments = (
+                await self._async_get_healthy_deployments(
+                    model=kwargs.get("model") or "",
+                )
             )
 
             # raises an exception if this error should not be retries
             self.should_retry_this_error(
                 error=e,
                 healthy_deployments=_healthy_deployments,
+                all_deployments=_all_deployments,
                 context_window_fallbacks=context_window_fallbacks,
                 regular_fallbacks=fallbacks,
                 content_policy_fallbacks=content_policy_fallbacks,
@@ -3114,7 +3117,7 @@ class Router:
                     ## LOGGING
                     kwargs = self.log_retry(kwargs=kwargs, e=e)
                     remaining_retries = num_retries - current_attempt
-                    _healthy_deployments = await self._async_get_healthy_deployments(
+                    _healthy_deployments, _ = await self._async_get_healthy_deployments(
                         model=kwargs.get("model"),
                     )
                     _timeout = self._time_to_sleep_before_retry(
@@ -3135,6 +3138,7 @@ class Router:
         self,
         error: Exception,
         healthy_deployments: Optional[List] = None,
+        all_deployments: Optional[List] = None,
         context_window_fallbacks: Optional[List] = None,
         content_policy_fallbacks: Optional[List] = None,
         regular_fallbacks: Optional[List] = None,
@@ -3150,6 +3154,9 @@ class Router:
         _num_healthy_deployments = 0
         if healthy_deployments is not None and isinstance(healthy_deployments, list):
             _num_healthy_deployments = len(healthy_deployments)
+        _num_all_deployments = 0
+        if all_deployments is not None and isinstance(all_deployments, list):
+            _num_all_deployments = len(all_deployments)
 
         ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR / CONTENT POLICY VIOLATION ERROR w/ fallbacks available / Bad Request Error
         if (
@@ -3180,7 +3187,9 @@ class Router:
             - if other deployments available -> retry
             - else -> raise error
             """
-            if _num_healthy_deployments <= 0:  # if no healthy deployments
+            if (
+                _num_all_deployments <= 1
+            ):  # if there is only 1 deployment for this model group then don't retry
                 raise error  # then raise error
 
         # Do not retry if there are no healthy deployments
@@ -3390,7 +3399,7 @@ class Router:
             current_attempt = None
             original_exception = e
             ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
-            _healthy_deployments = self._get_healthy_deployments(
+            _healthy_deployments, _all_deployments = self._get_healthy_deployments(
                 model=kwargs.get("model"),
             )
 
@@ -3398,6 +3407,7 @@ class Router:
             self.should_retry_this_error(
                 error=e,
                 healthy_deployments=_healthy_deployments,
+                all_deployments=_all_deployments,
                 context_window_fallbacks=context_window_fallbacks,
                 regular_fallbacks=fallbacks,
                 content_policy_fallbacks=content_policy_fallbacks,
@@ -3428,7 +3438,7 @@ class Router:
                 except Exception as e:
                     ## LOGGING
                     kwargs = self.log_retry(kwargs=kwargs, e=e)
-                    _healthy_deployments = self._get_healthy_deployments(
+                    _healthy_deployments, _ = self._get_healthy_deployments(
                         model=kwargs.get("model"),
                     )
                     remaining_retries = num_retries - current_attempt
@@ -3881,7 +3891,7 @@ class Router:
             else:
                 healthy_deployments.append(deployment)
 
-        return healthy_deployments
+        return healthy_deployments, _all_deployments
 
     async def _async_get_healthy_deployments(self, model: str):
         _all_deployments: list = []
@@ -3901,7 +3911,7 @@ class Router:
                 continue
             else:
                 healthy_deployments.append(deployment)
-        return healthy_deployments
+        return healthy_deployments, _all_deployments
 
     def routing_strategy_pre_call_checks(self, deployment: dict):
         """
@@ -4679,10 +4689,7 @@ class Router:
                 returned_models += self.model_list
 
                 return returned_models
-
-            for model in self.model_list:
-                returned_models.extend(self._get_all_deployments(model_name=model_name))
-
+            returned_models.extend(self._get_all_deployments(model_name=model_name))
             return returned_models
         return None
 
diff --git a/litellm/tests/test_custom_callback_router.py b/litellm/tests/test_custom_callback_router.py
index 6ffa97d89e..2dba6df235 100644
--- a/litellm/tests/test_custom_callback_router.py
+++ b/litellm/tests/test_custom_callback_router.py
@@ -533,6 +533,7 @@ async def test_async_chat_azure_with_fallbacks():
     try:
         customHandler_fallbacks = CompletionCustomHandler()
         litellm.callbacks = [customHandler_fallbacks]
+        litellm.set_verbose = True
        # with fallbacks
         model_list = [
             {
@@ -555,7 +556,13 @@ async def test_async_chat_azure_with_fallbacks():
                 "rpm": 1800,
             },
         ]
-        router = Router(model_list=model_list, fallbacks=[{"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}])  # type: ignore
+        router = Router(
+            model_list=model_list,
+            fallbacks=[{"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
+            retry_policy=litellm.router.RetryPolicy(
+                AuthenticationErrorRetries=0,
+            ),
+        )  # type: ignore
         response = await router.acompletion(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
diff --git a/litellm/tests/test_router_cooldowns.py b/litellm/tests/test_router_cooldowns.py
index ac92dfbf07..4287659a35 100644
--- a/litellm/tests/test_router_cooldowns.py
+++ b/litellm/tests/test_router_cooldowns.py
@@ -150,3 +150,100 @@ def test_single_deployment_no_cooldowns(num_deployments):
         mock_client.assert_not_called()
     else:
         mock_client.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_single_deployment_no_cooldowns_test_prod():
+    """
+    Do not cooldown on single deployment.
+
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+            },
+            {
+                "model_name": "gpt-5",
+                "litellm_params": {
+                    "model": "openai/gpt-5",
+                },
+            },
+            {
+                "model_name": "gpt-12",
+                "litellm_params": {
+                    "model": "openai/gpt-12",
+                },
+            },
+        ],
+        allowed_fails=0,
+        num_retries=0,
+    )
+
+    with patch.object(
+        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
+    ) as mock_client:
+        try:
+            await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                mock_response="litellm.RateLimitError",
+            )
+        except litellm.RateLimitError:
+            pass
+
+        await asyncio.sleep(2)
+
+        mock_client.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
+    """
+    Do not cooldown on single deployment.
+
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+            },
+            {
+                "model_name": "gpt-5",
+                "litellm_params": {
+                    "model": "openai/gpt-5",
+                },
+            },
+            {
+                "model_name": "gpt-12",
+                "litellm_params": {
+                    "model": "openai/gpt-12",
+                },
+            },
+        ],
+    )
+
+    for _ in range(20):
+        try:
+            await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                mock_response="litellm.RateLimitError",
+            )
+        except litellm.RateLimitError:
+            pass
+
+    cooldown_list = await router._async_get_cooldown_deployments()
+    assert len(cooldown_list) == 0
+
+    healthy_deployments, _ = await router._async_get_healthy_deployments(
+        model="gpt-3.5-turbo"
+    )
+
+    print("healthy_deployments: ", healthy_deployments)