forked from phoenix/litellm-mirror
test(test_azure_perf.py): add perf testing for router streaming
parent 49932ac90a
commit 4fa7f19888
2 changed files with 111 additions and 0 deletions
litellm/proxy/_test_config.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
model_list:
  - model_name: azure-canada
    litellm_params:
      model: "azure/gpt-35-turbo"
      api_key: "73f9a1f564494ce19a26d69afb124219"
      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
      api_version: "2023-07-01-preview"

general_settings:
  master_key: sk-1234
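A rough usage sketch, not part of this commit: once the proxy is started with this config (for example, litellm --config litellm/proxy/_test_config.yaml), it can be queried with the standard OpenAI client, using the master_key as the API key. The local address below is an assumption; point base_url at wherever the proxy is actually listening.

import openai

client = openai.OpenAI(
    api_key="sk-1234",               # the master_key from general_settings
    base_url="http://0.0.0.0:8000",  # assumed local proxy address, adjust to your deployment
)

response = client.chat.completions.create(
    model="azure-canada",            # model_name from the config's model_list
    messages=[{"role": "user", "content": "Hello from the proxy"}],
)
print(response.choices[0].message.content)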
litellm/tests/test_azure_perf.py (new file, 101 lines)
@@ -0,0 +1,101 @@
#### What this tests ####
# This adds perf testing to the router, to ensure its time-to-first-token is never more than 0.5s slower than the azure-openai sdk.
import sys, os, time, inspect, asyncio, traceback
from datetime import datetime
import pytest

sys.path.insert(0, os.path.abspath('../..'))
import openai, litellm, uuid
from openai import AsyncAzureOpenAI

client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_API_BASE"),
    api_version=os.getenv("AZURE_API_VERSION")
)

model_list = [
    {
        "model_name": "azure-test",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION")
        }
    }
]

router = litellm.Router(model_list=model_list)

async def _openai_completion():
    try:
        start_time = time.time()
        response = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        print("OpenAI Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time)
        return time_to_first_token
    except Exception as e:
        print(e)
        return None

async def _router_completion():
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        print("Router Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time - first_token_ts)
        return time_to_first_token
    except Exception as e:
        print(e)
        return None

async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure time to first (non-null) token.
    """
    n = 3  # Number of concurrent tasks
    ## OPENAI AVG. TIME
    tasks = [_openai_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    total_time = 0
    for item in successful_completions:
        total_time += item
    avg_openai_time = total_time / len(successful_completions)
    ## ROUTER AVG. TIME
    tasks = [_router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    total_time = 0
    for item in successful_completions:
        total_time += item
    avg_router_time = total_time / len(successful_completions)
    ## COMPARE
    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5

asyncio.run(test_azure_completion_streaming())
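The file ends with asyncio.run(...), so the comparison executes whenever the module is run directly (for example, python litellm/tests/test_azure_perf.py) or imported during pytest collection. A pytest-native variant, sketched below under the assumption that the pytest-asyncio plugin is installed and the AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION environment variables are set, would wrap the same helpers in a marked coroutine instead of calling asyncio.run at import time.

import asyncio
import pytest

@pytest.mark.asyncio
async def test_router_ttft_overhead():
    # Sketch only, not part of this commit. _openai_completion and _router_completion
    # are the helpers defined in test_azure_perf.py above; each returns the measured
    # time-to-first-token in seconds, or None if the call failed.
    n = 3
    openai_times = [t for t in await asyncio.gather(*[_openai_completion() for _ in range(n)]) if t is not None]
    router_times = [t for t in await asyncio.gather(*[_router_completion() for _ in range(n)]) if t is not None]
    assert openai_times and router_times, "all streaming calls failed"
    avg_openai = sum(openai_times) / len(openai_times)
    avg_router = sum(router_times) / len(router_times)
    # The router should add at most 0.5s to the average time-to-first-token.
    assert avg_router < avg_openai + 0.5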