diff --git a/litellm/router_utils/cooldown_handlers.py b/litellm/router_utils/cooldown_handlers.py
index 8f5c3895a6..64a6d56a77 100644
--- a/litellm/router_utils/cooldown_handlers.py
+++ b/litellm/router_utils/cooldown_handlers.py
@@ -216,7 +216,7 @@ def _should_cooldown_deployment(
             return True
 
         return False
-    elif not is_single_deployment_model_group:
+    else:
         return should_cooldown_based_on_allowed_fails_policy(
             litellm_router_instance=litellm_router_instance,
             deployment=deployment,
diff --git a/tests/local_testing/test_router_cooldowns.py b/tests/local_testing/test_router_cooldowns.py
index 8c907af297..38d4133afd 100644
--- a/tests/local_testing/test_router_cooldowns.py
+++ b/tests/local_testing/test_router_cooldowns.py
@@ -12,7 +12,7 @@ import pytest
 
 sys.path.insert(
     0, os.path.abspath("../..")
-)  # Adds the parent directory to the system path
+)  # Adds the parent directory to the system-path
 
 from unittest.mock import AsyncMock, MagicMock, patch
 
@@ -23,7 +23,11 @@ import litellm
 from litellm import Router
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
-from litellm.types.router import DeploymentTypedDict, LiteLLMParamsTypedDict
+from litellm.types.router import (
+    DeploymentTypedDict,
+    LiteLLMParamsTypedDict,
+    AllowedFailsPolicy,
+)
 
 
 @pytest.mark.asyncio
@@ -134,7 +138,7 @@ def test_single_deployment_no_cooldowns(num_deployments):
         )
         model_list.append(model)
 
-    router = Router(model_list=model_list, allowed_fails=0, num_retries=0)
+    router = Router(model_list=model_list, num_retries=0)
 
     with patch.object(
         router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
@@ -181,7 +185,6 @@ async def test_single_deployment_no_cooldowns_test_prod():
                 },
             },
         ],
-        allowed_fails=0,
         num_retries=0,
     )
 
@@ -202,6 +205,104 @@
         mock_client.assert_not_called()
 
 
+@pytest.mark.asyncio
+async def test_single_deployment_cooldown_with_allowed_fails():
+    """
+    When `allowed_fails` is set, use the allowed_fails to determine cooldown for 1 deployment
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+            },
+            {
+                "model_name": "gpt-5",
+                "litellm_params": {
+                    "model": "openai/gpt-5",
+                },
+            },
+            {
+                "model_name": "gpt-12",
+                "litellm_params": {
+                    "model": "openai/gpt-12",
+                },
+            },
+        ],
+        allowed_fails=1,
+        num_retries=0,
+    )
+
+    with patch.object(
+        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
+    ) as mock_client:
+        for _ in range(2):
+            try:
+                await router.acompletion(
+                    model="gpt-3.5-turbo",
+                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                    timeout=0.0001,
+                )
+            except litellm.Timeout:
+                pass
+
+        await asyncio.sleep(2)
+
+        mock_client.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_single_deployment_cooldown_with_allowed_fail_policy():
+    """
+    When `allowed_fails_policy` is set, use the allowed_fails_policy to determine cooldown for 1 deployment
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+            },
+            {
+                "model_name": "gpt-5",
+                "litellm_params": {
+                    "model": "openai/gpt-5",
+                },
+            },
+            {
+                "model_name": "gpt-12",
+                "litellm_params": {
+                    "model": "openai/gpt-12",
+                },
+            },
+        ],
+        allowed_fails_policy=AllowedFailsPolicy(
+            TimeoutErrorAllowedFails=1,
+        ),
+        num_retries=0,
+    )
+
+    with patch.object(
+        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
+    ) as mock_client:
+        for _ in range(2):
+            try:
+                await router.acompletion(
+                    model="gpt-3.5-turbo",
+                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                    timeout=0.0001,
+                )
+            except litellm.Timeout:
+                pass
+
+        await asyncio.sleep(2)
+
+        mock_client.assert_called_once()
+
+
 @pytest.mark.asyncio
 async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
     """