feat(lowest_latency.py): route by time to first token for streaming requests (when available)

Closes https://github.com/BerriAI/litellm/issues/3574
Krrish Dholakia 2024-05-21 13:08:17 -07:00
parent 620e6db027
commit 2b3da449c8
3 changed files with 232 additions and 18 deletions
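
In short: when a request streams, the latency logger scores a deployment by its time to first token (taken from the completion_start_time litellm attaches to the call's kwargs) rather than by total request latency. A minimal sketch of the idea, using a hypothetical helper rather than the actual lowest_latency.py internals:

from datetime import datetime
from typing import Optional


def latency_for_routing(
    start_time: datetime,
    end_time: datetime,
    completion_start_time: Optional[datetime],
    stream: bool,
) -> float:
    """Hypothetical helper: pick the latency signal used to rank a deployment."""
    if stream and completion_start_time is not None:
        # Streaming request: rank by time to first token.
        return (completion_start_time - start_time).total_seconds()
    # Non-streaming request (or no TTFT recorded): rank by total latency.
    return (end_time - start_time).total_seconds()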


@@ -2,7 +2,7 @@
# This tests the router's ability to pick deployment with lowest latency
import sys, os, asyncio, time, random
from datetime import datetime
from datetime import datetime, timedelta
import traceback
from dotenv import load_dotenv
@@ -16,6 +16,7 @@ import pytest
from litellm import Router
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
from litellm.caching import DualCache
import litellm
### UNIT TESTS FOR LATENCY ROUTING ###
@@ -813,3 +814,143 @@ async def test_lowest_latency_routing_buffer(buffer):
        assert len(selected_deployments.keys()) == 1
    else:
        assert len(selected_deployments.keys()) == 2
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_lowest_latency_routing_time_to_first_token(sync_mode):
    """
    Given a deployment with
    - a fast time to first token, but
    - a slow latency per output token,

    test that:
    - for streaming requests, the deployment with the fastest time to first token is picked
    - for non-streaming requests, the deployment with the fastest overall latency is picked
    """
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
            },
            "model_info": {"id": 2},
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="latency-based-routing",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore
    ## DEPLOYMENT 1 ##
    deployment_id = 1
    start_time = datetime.now()
    one_second_later = start_time + timedelta(seconds=1)
    # Compute 3 and 4 seconds after the current time
    three_seconds_later = start_time + timedelta(seconds=3)
    four_seconds_later = start_time + timedelta(seconds=4)
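    # Timing model derived from the timestamps above and below:
    #   deployment 1: first token after 1s, full response after 4s
    #       -> best time to first token, worst total latency
    #   deployment 2: first token after 3s, full response after 3s
    #       -> worst time to first token, best total latency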
    kwargs = {
        "litellm_params": {
            "metadata": {
                "model_group": "azure-model",
            },
            "model_info": {"id": 1},
        },
        "stream": True,
        "completion_start_time": one_second_later,
    }
    response_obj = litellm.ModelResponse(
        usage=litellm.Usage(completion_tokens=50, total_tokens=50)
    )
    end_time = four_seconds_later
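    # Simulate a completed streaming request so the latency logger records
    # stats for deployment 1 (the same pattern is repeated for deployment 2 below).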
    if sync_mode:
        router.lowestlatency_logger.log_success_event(
            response_obj=response_obj,
            kwargs=kwargs,
            start_time=start_time,
            end_time=end_time,
        )
    else:
        await router.lowestlatency_logger.async_log_success_event(
            response_obj=response_obj,
            kwargs=kwargs,
            start_time=start_time,
            end_time=end_time,
        )
    ## DEPLOYMENT 2 ##
    deployment_id = 2
    kwargs = {
        "litellm_params": {
            "metadata": {
                "model_group": "azure-model",
            },
            "model_info": {"id": 2},
        },
        "stream": True,
        "completion_start_time": three_seconds_later,
    }
    response_obj = litellm.ModelResponse(
        usage=litellm.Usage(completion_tokens=50, total_tokens=50)
    )
    end_time = three_seconds_later
    if sync_mode:
        router.lowestlatency_logger.log_success_event(
            response_obj=response_obj,
            kwargs=kwargs,
            start_time=start_time,
            end_time=end_time,
        )
    else:
        await router.lowestlatency_logger.async_log_success_event(
            response_obj=response_obj,
            kwargs=kwargs,
            start_time=start_time,
            end_time=end_time,
        )
"""
TESTING
- expect deployment 1 to be picked for streaming
- expect deployment 2 to be picked for non-streaming
"""
    selected_deployments = {}
    for _ in range(3):
        ## for non-streaming, expect the deployment with the lowest total latency
        deployment = router.get_available_deployment(model="azure-model")
        print(deployment)
        selected_deployments[deployment["model_info"]["id"]] = 1
    assert len(selected_deployments.keys()) == 1
    assert "2" in list(selected_deployments.keys())
    selected_deployments = {}
    for _ in range(50):
        ## for streaming, expect the deployment with the lowest time to first token
        deployment = router.get_available_deployment(
            model="azure-model", request_kwargs={"stream": True}
        )
        print(deployment)
        selected_deployments[deployment["model_info"]["id"]] = 1
    assert len(selected_deployments.keys()) == 1
    assert "1" in list(selected_deployments.keys())