[Fix-Router] Don't cooldown when only 1 deployment exists (#5673)

* fix get model list

* fix test custom callback router

* fix embedding fallback test

* fix router retry policy on AuthErrors (idea sketched after the diff below)

* fix router test

* add prod-settings test for single-deployment no-cooldown behavior

* add test test_single_deployment_no_cooldowns_test_prod_mock_completion_calls
Ishaan Jaff 2024-09-12 19:14:58 -07:00 committed by GitHub
parent 40c52f9263
commit e7c22f63e7
4 changed files with 128 additions and 17 deletions
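
The router-side change itself is not shown in this excerpt; a minimal sketch of the behavior the new tests pin down might look like the following (the helper name and signature are illustrative assumptions, not litellm's actual internals):

from typing import Dict, List


def should_cooldown_deployment(model_list: List[Dict], model_group: str) -> bool:
    """Illustrative helper, not litellm's real code: only cool a deployment
    down if the model group has at least one other deployment to absorb the
    traffic. Cooling down the only deployment would make the whole group
    unroutable, so failures are surfaced to the caller instead."""
    deployments = [d for d in model_list if d["model_name"] == model_group]
    return len(deployments) > 1


# With a single deployment for the group, no cooldown should happen:
single = [{"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}}]
assert should_cooldown_deployment(single, "gpt-3.5-turbo") is False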

@@ -150,3 +150,100 @@ def test_single_deployment_no_cooldowns(num_deployments):
        mock_client.assert_not_called()
    else:
        mock_client.assert_called_once()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod():
    """
    Do not cooldown on a single deployment.
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
        allowed_fails=0,
        num_retries=0,
    )

    # Patch the cooldown cache so the test can assert that no deployment
    # is ever placed in cooldown, even after a failure.
    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        await asyncio.sleep(2)

        # The only deployment for the model group must not be cooled down.
        mock_client.assert_not_called()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
    """
    Do not cooldown on a single deployment.
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
    )

    # Trigger repeated failures; none of them should cool the deployment down.
    for _ in range(20):
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

    cooldown_list = await router._async_get_cooldown_deployments()
    assert len(cooldown_list) == 0

    healthy_deployments, _ = await router._async_get_healthy_deployments(
        model="gpt-3.5-turbo"
    )
    print("healthy_deployments: ", healthy_deployments)