forked from phoenix/litellm-mirror
test - lowest latency logger
This commit is contained in: parent bf6abed808, commit 2e6fc91a75
1 changed file with 78 additions and 0 deletions
@@ -477,3 +477,81 @@ async def test_router_completion_streaming():

# asyncio.run(test_router_completion_streaming())

@pytest.mark.asyncio
async def test_lowest_latency_routing_with_timeouts():
    """
    PROD Test:
    - Endpoint 1: triggers timeout errors (it takes 10+ seconds to respond)
    - Endpoint 2: responds in under 1s
    - Run 4 requests to collect latency data
    - Wait till the cache is filled with that data
    - Run 10 more requests
    - All requests should have been routed to endpoint 2
    """
    import litellm

    litellm.set_verbose = True

    router = Router(
        model_list=[
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "openai/slow-endpoint",
                    "api_base": "https://exampleopenaiendpoint-production-c715.up.railway.app/",  # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)
                    "api_key": "fake-key",
                },
                "model_info": {"id": "slow-endpoint"},
            },
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "openai/fast-endpoint",
                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                    "api_key": "fake-key",
                },
                "model_info": {"id": "fast-endpoint"},
            },
        ],
        routing_strategy="latency-based-routing",
        set_verbose=True,
        debug_level="DEBUG",
        timeout=1,
    )  # type: ignore
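
    # With routing_strategy="latency-based-routing", the router prefers the
    # deployment with the lowest observed latency. timeout=1 makes calls to
    # slow-endpoint (10+ seconds to respond) fail fast, and per the debugging
    # note further down, those timeouts heavily penalize that deployment.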

    # make 4 requests
    for _ in range(4):
        try:
            response = await router.acompletion(
                model="azure-model", messages=[{"role": "user", "content": "hello"}]
            )
            print(response)
        except Exception as e:
            print("got exception", e)

    await asyncio.sleep(1)
    print("done sending initial requests to collect latency")
    """
    Note: for debugging
    - By this point: slow-endpoint should have timed out 3-4 times and should be heavily penalized :)
    - The next 10 requests should all be routed to the fast-endpoint
    """

    deployments = {}
    # make 10 requests
    for _ in range(10):
        response = await router.acompletion(
            model="azure-model", messages=[{"role": "user", "content": "hello"}]
        )
        print(response)
        _picked_model_id = response._hidden_params["model_id"]
        if _picked_model_id not in deployments:
            deployments[_picked_model_id] = 1
        else:
            deployments[_picked_model_id] += 1
    print("deployments", deployments)

    # ALL the requests should have been routed to the fast-endpoint
    assert deployments["fast-endpoint"] == 10
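
For reference, a minimal, illustrative sketch of the idea this test exercises. This is not litellm's implementation, and every name in it (LatencyTracker, record, pick) is hypothetical: track observed latencies per deployment, record timeouts as a large penalty, and route each request to the deployment with the lowest average.

from collections import defaultdict


class LatencyTracker:
    """Toy model of lowest-latency routing (illustrative only, not litellm's)."""

    def __init__(self, timeout_penalty: float = 10.0):
        self.latencies = defaultdict(list)  # deployment id -> observed seconds
        self.timeout_penalty = timeout_penalty

    def record(self, deployment_id, seconds, timed_out=False):
        # A timeout is recorded as a large latency, penalizing the deployment.
        self.latencies[deployment_id].append(
            self.timeout_penalty if timed_out else seconds
        )

    def pick(self, candidates):
        # Route to the candidate with the lowest average observed latency;
        # unseen deployments average to 0.0 and so get tried first.
        def avg(dep):
            vals = self.latencies[dep]
            return sum(vals) / len(vals) if vals else 0.0

        return min(candidates, key=avg)


tracker = LatencyTracker()
tracker.record("slow-endpoint", 1.0, timed_out=True)  # hit the 1s client timeout
tracker.record("fast-endpoint", 0.3)
assert tracker.pick(["slow-endpoint", "fast-endpoint"]) == "fast-endpoint"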