#### What this tests ####
# This tests the router's deployment cooldown behavior

import asyncio
import os
import sys
import time
import traceback

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

from unittest.mock import AsyncMock, MagicMock, patch

import httpx
import openai

import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.router import DeploymentTypedDict, LiteLLMParamsTypedDict


@pytest.mark.asyncio
async def test_cooldown_badrequest_error():
    """
    Test 1. It SHOULD NOT cooldown a deployment on a BadRequestError
    """

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "azure/chatgpt-v-2",
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                },
            }
        ],
        debug_level="DEBUG",
        set_verbose=True,
        cooldown_time=300,
        num_retries=0,
        allowed_fails=0,
    )
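
    # With allowed_fails=0, num_retries=0 and a 300s cooldown window, any error
    # counted as a deployment failure would cool down the only deployment
    # immediately. The invalid `bad_param` kwarg below is expected to raise a
    # BadRequestError, which should NOT count as a deployment failure.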

    # Act & Assert
    try:
        response = await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "gm"}],
            bad_param=200,
        )
    except Exception:
        pass
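
    # If the BadRequestError had put the deployment into cooldown, the
    # follow-up call below would have no healthy deployment to route to.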

    await asyncio.sleep(3)  # wait, in case a cooldown was (incorrectly) applied

    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "gm"}],
        mock_response="hello",
    )

    assert response is not None

    print(response)


@pytest.mark.asyncio
async def test_dynamic_cooldowns():
    """
    Assert kwargs for completion/embedding have 'cooldown_time' as a litellm_param
    """
    # litellm.set_verbose = True
    tmp_mock = MagicMock()

    litellm.failure_callback = [tmp_mock]
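
    # The mock failure callback receives the request kwargs on failure, so the
    # effective litellm_params (including cooldown_time) can be inspected below.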

    router = Router(
        model_list=[
            {
                "model_name": "my-fake-model",
                "litellm_params": {
                    "model": "openai/gpt-1",
                    "api_key": "my-key",
                    "mock_response": Exception("this is an error"),
                },
            }
        ],
        cooldown_time=60,
    )
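
    # The per-request cooldown_time=0 below is expected to override the
    # router-level cooldown_time=60 and appear in the failure callback's kwargs.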

    try:
        _ = router.completion(
            model="my-fake-model",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
            cooldown_time=0,
            num_retries=0,
        )
    except Exception:
        pass

    tmp_mock.assert_called_once()

    print(tmp_mock.call_count)

    assert "cooldown_time" in tmp_mock.call_args[0][0]["litellm_params"]
    assert tmp_mock.call_args[0][0]["litellm_params"]["cooldown_time"] == 0


@pytest.mark.parametrize("num_deployments", [1, 2])
def test_single_deployment_no_cooldowns(num_deployments):
    """
    Do not cooldown on single deployment.

    Cooldown on multiple deployments.
    """
    model_list = []
    for i in range(num_deployments):
        model = DeploymentTypedDict(
            model_name="gpt-3.5-turbo",
            litellm_params=LiteLLMParamsTypedDict(
                model="gpt-3.5-turbo",
            ),
        )
        model_list.append(model)

    router = Router(model_list=model_list, allowed_fails=0, num_retries=0)
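
    # Patch the cooldown cache so we can observe whether the router tries to
    # put the failing deployment into cooldown, without actually cooling it down.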

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        try:
            router.completion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        if num_deployments == 1:
            mock_client.assert_not_called()
        else:
            mock_client.assert_called_once()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod():
    """
    Do not cooldown a model group's only deployment, even when the router is
    configured with multiple model groups (prod-like setup).
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
        allowed_fails=0,
        num_retries=0,
    )
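
    # Each model group above has exactly one deployment, so a rate limit error
    # on gpt-3.5-turbo should not trigger a cooldown for that deployment.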

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        await asyncio.sleep(2)

        mock_client.assert_not_called()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
    """
    Do not cooldown a model group's only deployment, even after repeated
    failures, in a prod-like multi-model-group setup.
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
    )
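
    # Fire repeated failing calls; with only one deployment in the
    # gpt-3.5-turbo group, none of them should put it into cooldown.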

    for _ in range(20):
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

    cooldown_list = await router._async_get_cooldown_deployments()
    assert len(cooldown_list) == 0

    healthy_deployments, _ = await router._async_get_healthy_deployments(
        model="gpt-3.5-turbo"
    )

    print("healthy_deployments: ", healthy_deployments)