refactor: move all testing to top-level of repo

Closes https://github.com/BerriAI/litellm/issues/486

parent 5403c5828c
commit 3560f0ef2c

213 changed files with 74 additions and 217 deletions
275	tests/local_testing/test_least_busy_routing.py	Normal file
@@ -0,0 +1,275 @@
#### What this tests ####
# This tests the router's ability to identify the least busy deployment

import asyncio
import os
import random
import sys
import time
import traceback

from dotenv import load_dotenv

load_dotenv()

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import pytest

import litellm
from litellm import Router
from litellm.caching import DualCache
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler

### UNIT TESTS FOR LEAST BUSY LOGGING ###

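# Illustrative note: the mechanism these unit tests exercise (a sketch
# inferred from the assertions below, not from the handler's internals) is
# that LeastBusyLoggingHandler tracks in-flight requests per model group in
# the router cache, under the key f"{model_group}_request_count", as a dict
# mapping each deployment's model id to its current request count. Selecting
# the least busy deployment then reduces to a min() over that dict:
#
#     request_counts = {"1": 10, "2": 54, "3": 100}
#     least_busy_id = min(request_counts, key=request_counts.get)  # -> "1"
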
def test_model_added():
    test_cache = DualCache()
    least_busy_logger = LeastBusyLoggingHandler(router_cache=test_cache, model_list=[])
    kwargs = {
        "litellm_params": {
            "metadata": {
                "model_group": "gpt-3.5-turbo",
                "deployment": "azure/chatgpt-v-2",
            },
            "model_info": {"id": "1234"},
        }
    }
    least_busy_logger.log_pre_api_call(model="test", messages=[], kwargs=kwargs)
    request_count_api_key = "gpt-3.5-turbo_request_count"
    assert test_cache.get_cache(key=request_count_api_key) is not None


def test_get_available_deployments():
    test_cache = DualCache()
    least_busy_logger = LeastBusyLoggingHandler(router_cache=test_cache, model_list=[])
    model_group = "gpt-3.5-turbo"
    deployment = "azure/chatgpt-v-2"
    kwargs = {
        "litellm_params": {
            "metadata": {
                "model_group": model_group,
                "deployment": deployment,
            },
            "model_info": {"id": "1234"},
        }
    }
    least_busy_logger.log_pre_api_call(model="test", messages=[], kwargs=kwargs)
    request_count_api_key = f"{model_group}_request_count"
    assert test_cache.get_cache(key=request_count_api_key) is not None


# test_get_available_deployments()

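# Illustrative note: with the request_count_dict seeded below
# ({1: 10, 2: 54, 3: 100}), deployment id 1 carries the fewest in-flight
# requests, so least-busy routing should pick it. The assertion compares
# against the string "1" because the router (we assume) normalizes model ids
# to strings when it builds each deployment's model_info.
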
def test_router_get_available_deployments():
    """
    Tests if 'get_available_deployments' returns the least busy deployment
    """
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 1440,
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 2},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_CANADA_API_KEY",
                "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 3},
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="least-busy",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore

    router.leastbusy_logger.test_flag = True

    model_group = "azure-model"
    deployment = "azure/chatgpt-v-2"
    request_count_dict = {1: 10, 2: 54, 3: 100}
    cache_key = f"{model_group}_request_count"
    router.cache.set_cache(key=cache_key, value=request_count_dict)

    deployment = router.get_available_deployment(model=model_group, messages=None)
    print(f"deployment: {deployment}")
    assert deployment["model_info"]["id"] == "1"

    ## run router completion - assert completion event, no change in 'busy'ness once calls are complete

    router.completion(
        model=model_group,
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )

    return_dict = router.cache.get_cache(key=cache_key)

    # wait 2 seconds
    time.sleep(2)

    assert router.leastbusy_logger.logged_success == 1
    assert return_dict[1] == 10
    assert return_dict[2] == 54
    assert return_dict[3] == 100


## Test with Real calls ##

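# Illustrative note: each streaming test below fires three real calls at a
# model group with three deployments. Under least-busy routing, every new
# request should go to the deployment with the fewest in-flight requests, so
# three staggered calls should spread across all three deployments and every
# value in the f"{model}_request_count" cache dict should end up equal to 1.
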
@pytest.mark.asyncio
async def test_router_atext_completion_streaming():
    prompt = "Hello, can you generate a 500 words poem?"
    model = "azure-model"
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 1440,
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 2},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 3},
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="least-busy",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore

    ### Call the async calls in sequence, so we start 1 call before going to the next.

    ## CALL 1
    await asyncio.sleep(random.uniform(0, 2))
    await router.atext_completion(model=model, prompt=prompt, stream=True)

    ## CALL 2
    await asyncio.sleep(random.uniform(0, 2))
    await router.atext_completion(model=model, prompt=prompt, stream=True)

    ## CALL 3
    await asyncio.sleep(random.uniform(0, 2))
    await router.atext_completion(model=model, prompt=prompt, stream=True)

    cache_key = f"{model}_request_count"
    ## check if calls equally distributed
    cache_dict = router.cache.get_cache(key=cache_key)
    for k, v in cache_dict.items():
        assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"


# asyncio.run(test_router_atext_completion_streaming())

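# Illustrative note: the random asyncio.sleep() before each call staggers the
# start times, so (assuming a streamed request stays counted as in-flight
# until its stream is consumed) each new call still sees the earlier ones as
# busy, and the least-busy strategy steers it to an unused deployment.
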
@pytest.mark.asyncio
async def test_router_completion_streaming():
    litellm.set_verbose = True
    messages = [
        {"role": "user", "content": "Hello, can you generate a 500 words poem?"}
    ]
    model = "azure-model"
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 1440,
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 2},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 3},
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="least-busy",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore

    ### Call the async calls in sequence, so we start 1 call before going to the next.

    ## CALL 1
    await asyncio.sleep(random.uniform(0, 2))
    await router.acompletion(model=model, messages=messages, stream=True)

    ## CALL 2
    await asyncio.sleep(random.uniform(0, 2))
    await router.acompletion(model=model, messages=messages, stream=True)

    ## CALL 3
    await asyncio.sleep(random.uniform(0, 2))
    await router.acompletion(model=model, messages=messages, stream=True)

    cache_key = f"{model}_request_count"
    ## check if calls equally distributed
    cache_dict = router.cache.get_cache(key=cache_key)
    for k, v in cache_dict.items():
        assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"