diff --git a/litellm/proxy/_test_config.yaml b/litellm/proxy/_test_config.yaml
new file mode 100644
index 000000000..18210a0a3
--- /dev/null
+++ b/litellm/proxy/_test_config.yaml
@@ -0,0 +1,10 @@
+model_list:
+  - model_name: azure-canada
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "73f9a1f564494ce19a26d69afb124219"
+      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
+      api_version: "2023-07-01-preview"
+
+general_settings:
+  master_key: sk-1234
\ No newline at end of file
diff --git a/litellm/tests/test_azure_perf.py b/litellm/tests/test_azure_perf.py
new file mode 100644
index 000000000..67cb41863
--- /dev/null
+++ b/litellm/tests/test_azure_perf.py
@@ -0,0 +1,101 @@
+#### What this tests ####
+# This adds perf testing to the router, to ensure it's never > 0.5s slower than the azure-openai sdk.
+import sys, os, time, inspect, asyncio, traceback
+from datetime import datetime
+import pytest
+sys.path.insert(0, os.path.abspath('../..'))
+import openai, litellm, uuid
+from openai import AsyncAzureOpenAI
+
+client = AsyncAzureOpenAI(
+    api_key=os.getenv("AZURE_API_KEY"),
+    azure_endpoint=os.getenv("AZURE_API_BASE"),
+    api_version=os.getenv("AZURE_API_VERSION")
+)
+
+model_list = [
+    {
+        "model_name": "azure-test",
+        "litellm_params": {
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+            "api_version": os.getenv("AZURE_API_VERSION")
+        }
+    }
+]
+
+router = litellm.Router(model_list=model_list)
+
+async def _openai_completion():
+    try:
+        start_time = time.time()
+        response = await client.chat.completions.create(
+            model="chatgpt-v-2",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+            stream=True
+        )
+        time_to_first_token = None
+        first_token_ts = None
+        init_chunk = None
+        async for chunk in response:
+            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
+                first_token_ts = time.time()
+                time_to_first_token = first_token_ts - start_time
+                init_chunk = chunk
+        end_time = time.time()
+        print("OpenAI Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time)
+        return time_to_first_token
+    except Exception as e:
+        print(e)
+        return None
+
+async def _router_completion():
+    try:
+        start_time = time.time()
+        response = await router.acompletion(
+            model="azure-test",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+            stream=True
+        )
+        time_to_first_token = None
+        first_token_ts = None
+        init_chunk = None
+        async for chunk in response:
+            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
+                first_token_ts = time.time()
+                time_to_first_token = first_token_ts - start_time
+                init_chunk = chunk
+        end_time = time.time()
+        print("Router Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time - first_token_ts)
+        return time_to_first_token
+    except Exception as e:
+        print(e)
+        return None
+
+async def test_azure_completion_streaming():
+    """
+    Test azure streaming call - measure time to first (non-null) token.
+    """
+    n = 3  # number of concurrent tasks
+    ## OPENAI AVG. TIME
+    tasks = [_openai_completion() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    total_time = 0
+    for item in successful_completions:
+        total_time += item
+    avg_openai_time = total_time / len(successful_completions)
+    ## ROUTER AVG. TIME
+    tasks = [_router_completion() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    total_time = 0
+    for item in successful_completions:
+        total_time += item
+    avg_router_time = total_time / len(successful_completions)
+    ## COMPARE
+    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
+    assert avg_router_time < avg_openai_time + 0.5
+
+asyncio.run(test_azure_completion_streaming())
\ No newline at end of file
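
Usage note (not part of the diff): litellm/proxy/_test_config.yaml is a proxy config, so the quickest way to exercise it is to start the proxy against that file (for example with "litellm --config litellm/proxy/_test_config.yaml") and then send an OpenAI-style request through it. The snippet below is a minimal sketch only: the "azure-canada" model name and the "sk-1234" master key come from the config above, while the local host/port is an assumption and should be adjusted to whatever address your proxy reports on startup.

import openai

# Sketch only: assumes the litellm proxy was started with _test_config.yaml
# and is listening locally (the port here is an assumption, not from the diff).
client = openai.OpenAI(
    api_key="sk-1234",               # master_key from general_settings
    base_url="http://0.0.0.0:8000"   # assumed local proxy address
)

response = client.chat.completions.create(
    model="azure-canada",            # model_name from model_list
    messages=[{"role": "user", "content": "hello from the proxy test config"}]
)
print(response)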