feat(lowest_latency.py): route by time to first token, for streaming requests (if available)

Closes https://github.com/BerriAI/litellm/issues/3574

parent 620e6db027
commit 2b3da449c8

3 changed files with 232 additions and 18 deletions
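For streaming requests, total request latency penalizes deployments that respond quickly but stream many tokens, so time to first token (TTFT) is the better measure of perceived responsiveness. A minimal sketch of the metric this change routes on, assuming a timestamp is captured when the first chunk arrives; the `response_latency` helper is illustrative, not litellm's internal API:

from datetime import datetime
from typing import Optional


def response_latency(
    start_time: datetime,
    end_time: datetime,
    completion_start_time: Optional[datetime],
    stream: bool,
) -> float:
    # Hypothetical helper, not litellm's internal API: for streaming requests,
    # score the deployment by time to first token (when a first-chunk timestamp
    # was recorded); otherwise fall back to total request latency.
    if stream and completion_start_time is not None:
        return (completion_start_time - start_time).total_seconds()
    return (end_time - start_time).total_seconds()

In the test added below, deployment 1 streams its first token after 1s but takes 4s overall, while deployment 2 takes 3s for both; streaming selection should therefore land on deployment 1 and non-streaming selection on deployment 2.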
@@ -2,7 +2,7 @@
 # This tests the router's ability to pick deployment with lowest latency

 import sys, os, asyncio, time, random
-from datetime import datetime
+from datetime import datetime, timedelta
 import traceback
 from dotenv import load_dotenv

@@ -16,6 +16,7 @@ import pytest
 from litellm import Router
+from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
 from litellm.caching import DualCache
 import litellm

 ### UNIT TESTS FOR LATENCY ROUTING ###

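The import added here is the strategy class exercised below; judging from the test, it exposes sync and async success hooks (`log_success_event` / `async_log_success_event`) that record per-deployment latency in the router's cache. A hedged sketch of constructing it directly, with the constructor arguments inferred from the surrounding imports rather than confirmed against the class definition:

from litellm.caching import DualCache
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler

# Assumption: the handler takes the router's cache plus the model list, as the
# imports above suggest; check the class definition before relying on this.
handler = LowestLatencyLoggingHandler(router_cache=DualCache(), model_list=[])

In the test itself this wiring is done by the Router, which exposes the handler as `router.lowestlatency_logger`.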
@@ -813,3 +814,143 @@ async def test_lowest_latency_routing_buffer(buffer):
         assert len(selected_deployments.keys()) == 1
     else:
         assert len(selected_deployments.keys()) == 2
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_lowest_latency_routing_time_to_first_token(sync_mode):
+    """
+    If a deployment has
+    - a fast time to first token
+    - a slow latency per output token
+
+    test if:
+    - for streaming, the deployment with the fastest time to first token is picked
+    - for non-streaming, the fastest overall deployment is picked
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+            },
+            "model_info": {"id": 1},
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-35-turbo",
+                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+            },
+            "model_info": {"id": 2},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="latency-based-routing",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    start_time = datetime.now()
+    one_second_later = start_time + timedelta(seconds=1)
+
+    # Compute timestamps 3 and 4 seconds after the start time
+    three_seconds_later = start_time + timedelta(seconds=3)
+    four_seconds_later = start_time + timedelta(seconds=4)
+
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 1},
+        },
+        "stream": True,
+        "completion_start_time": one_second_later,
+    }
+
+    response_obj = litellm.ModelResponse(
+        usage=litellm.Usage(completion_tokens=50, total_tokens=50)
+    )
+    end_time = four_seconds_later
+
+    if sync_mode:
+        router.lowestlatency_logger.log_success_event(
+            response_obj=response_obj,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+        )
+    else:
+        await router.lowestlatency_logger.async_log_success_event(
+            response_obj=response_obj,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+        )
+    ## DEPLOYMENT 2 ##
+    deployment_id = 2
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 2},
+        },
+        "stream": True,
+        "completion_start_time": three_seconds_later,
+    }
+    response_obj = litellm.ModelResponse(
+        usage=litellm.Usage(completion_tokens=50, total_tokens=50)
+    )
+    end_time = three_seconds_later
+    if sync_mode:
+        router.lowestlatency_logger.log_success_event(
+            response_obj=response_obj,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+        )
+    else:
+        await router.lowestlatency_logger.async_log_success_event(
+            response_obj=response_obj,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+    """
+    TESTING
+
+    - expect deployment 1 to be picked for streaming
+    - expect deployment 2 to be picked for non-streaming
+    """
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    selected_deployments = {}
+    for _ in range(3):
+        print(router.get_available_deployment(model="azure-model"))
+        ## for non-streaming
+        selected_deployments[
+            router.get_available_deployment(model="azure-model")["model_info"]["id"]
+        ] = 1
+
+    assert len(selected_deployments.keys()) == 1
+    assert "2" in list(selected_deployments.keys())
+
+    selected_deployments = {}
+    for _ in range(50):
+        print(router.get_available_deployment(model="azure-model"))
+        ## for streaming
+        selected_deployments[
+            router.get_available_deployment(
+                model="azure-model", request_kwargs={"stream": True}
+            )["model_info"]["id"]
+        ] = 1
+
+    assert len(selected_deployments.keys()) == 1
+    assert "1" in list(selected_deployments.keys())
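For context, a hedged usage sketch of the behavior the test pins down: callers pass `request_kwargs` with `"stream": True` when asking the router for a deployment, so the latency strategy can apply its TTFT statistics. Model names, keys, and endpoints below are placeholders:

from litellm import Router

# Placeholder deployments; keys and endpoints are illustrative only.
router = Router(
    model_list=[
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "sk-...",
                "api_base": "https://example-a.openai.azure.com",
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "sk-...",
                "api_base": "https://example-b.openai.azure.com",
            },
            "model_info": {"id": 2},
        },
    ],
    routing_strategy="latency-based-routing",
)

# Non-streaming: the strategy compares total request latency.
fastest_overall = router.get_available_deployment(model="azure-model")

# Streaming: the strategy prefers the lowest time to first token, when recorded.
fastest_ttft = router.get_available_deployment(
    model="azure-model", request_kwargs={"stream": True}
)

The only caller-side change is threading `stream` through `request_kwargs`; the "(if available)" in the commit title suggests deployments without recorded TTFT data fall back to overall latency.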