litellm-mirror/tests/local_testing/test_azure_perf.py
Krish Dholakia 859b47f08b
LiteLLM Minor Fixes & Improvements (11/29/2024) (#6965)
* fix(factory.py): ensure tool call converts image url

Fixes https://github.com/BerriAI/litellm/issues/6953

* fix(transformation.py): support mp4 + pdf urls for vertex ai

Fixes https://github.com/BerriAI/litellm/issues/6936

* fix(http_handler.py): mask gemini api key in error logs

Fixes https://github.com/BerriAI/litellm/issues/6963

* docs(prometheus.md): update prometheus FAQs

* feat(auth_checks.py): ensure specific model access > wildcard model access

if the wildcard model is in the access group but the specific model is not, deny access

* fix(auth_checks.py): handle auth checks for team based model access groups

handles the scenario where a model access group is used for wildcard models

* fix(internal_user_endpoints.py): support adding guardrails on `/user/update`

Fixes https://github.com/BerriAI/litellm/issues/6942

* fix(key_management_endpoints.py): fix prepare_metadata_fields helper

* fix: fix tests

* build(requirements.txt): bump openai dep version

fixes proxies argument

* test: fix tests

* fix(http_handler.py): fix error message masking

* fix(bedrock_guardrails.py): pass in prepped data

* test: fix test

* test: fix nvidia nim test

* fix(http_handler.py): return original response headers

* fix: revert maskedhttpstatuserror

* test: update tests

* test: cleanup test

* fix(key_management_endpoints.py): fix metadata field update logic

* fix(key_management_endpoints.py): maintain initial order of guardrails in key update

* fix(key_management_endpoints.py): handle prepare metadata

* fix: fix linting errors

* fix: fix linting errors

* fix: fix linting errors

* fix: fix key management errors

* fix(key_management_endpoints.py): update metadata

* test: update test

* refactor: add more debug statements

* test: skip flaky test

* test: fix test

* fix: fix test

* fix: fix update metadata logic

* fix: fix test

* ci(config.yml): change db url for e2e ui testing
2024-12-01 05:24:11 -08:00


# #### What this tests ####
# #  This adds perf testing to the router, to ensure its time to first token is never more than 0.5s slower than the azure-openai sdk (the bound asserted below).
# import sys, os, time, inspect, asyncio, traceback
# from datetime import datetime
# import pytest

# sys.path.insert(0, os.path.abspath("../.."))
# import openai, litellm, uuid
# from openai import AsyncAzureOpenAI

# client = AsyncAzureOpenAI(
#     api_key=os.getenv("AZURE_API_KEY"),
#     azure_endpoint=os.getenv("AZURE_API_BASE"),  # type: ignore
#     api_version=os.getenv("AZURE_API_VERSION"),
# )

# model_list = [
#     {
#         "model_name": "azure-test",
#         "litellm_params": {
#             "model": "azure/chatgpt-v-2",
#             "api_key": os.getenv("AZURE_API_KEY"),
#             "api_base": os.getenv("AZURE_API_BASE"),
#             "api_version": os.getenv("AZURE_API_VERSION"),
#         },
#     }
# ]

# router = litellm.Router(model_list=model_list)  # type: ignore

# async def _openai_completion():
#     try:
#         start_time = time.time()
#         response = await client.chat.completions.create(
#             model="chatgpt-v-2",
#             messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
#             stream=True,
#         )
#         time_to_first_token = None
#         first_token_ts = None
#         init_chunk = None
#         async for chunk in response:
#             if (
#                 time_to_first_token is None
#                 and len(chunk.choices) > 0
#                 and chunk.choices[0].delta.content is not None
#             ):
#                 first_token_ts = time.time()
#                 time_to_first_token = first_token_ts - start_time
#                 init_chunk = chunk
#         end_time = time.time()
#         print(
#             "OpenAI Call: ",
#             init_chunk,
#             start_time,
#             first_token_ts,
#             time_to_first_token,
#             end_time,
#         )
#         return time_to_first_token
#     except Exception as e:
#         print(e)
#         return None

# async def _router_completion():
#     try:
#         start_time = time.time()
#         response = await router.acompletion(
#             model="azure-test",
#             messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
#             stream=True,
#         )
#         time_to_first_token = None
#         first_token_ts = None
#         init_chunk = None
#         async for chunk in response:
#             if (
#                 time_to_first_token is None
#                 and len(chunk.choices) > 0
#                 and chunk.choices[0].delta.content is not None
#             ):
#                 first_token_ts = time.time()
#                 time_to_first_token = first_token_ts - start_time
#                 init_chunk = chunk
#         end_time = time.time()
#         print(
#             "Router Call: ",
#             init_chunk,
#             start_time,
#             first_token_ts,
#             time_to_first_token,
#             end_time - first_token_ts,
#         )
#         return time_to_first_token
#     except Exception as e:
#         print(e)
#         return None

# async def test_azure_completion_streaming():
#     """
#     Test azure streaming call - measure time to first (non-null) token.
#     """
#     n = 3  # Number of concurrent tasks
#     ## OPENAI AVG. TIME
#     tasks = [_openai_completion() for _ in range(n)]
#     chat_completions = await asyncio.gather(*tasks)
#     successful_completions = [c for c in chat_completions if c is not None]
#     total_time = 0
#     for item in successful_completions:
#         total_time += item
#     avg_openai_time = total_time / len(successful_completions)  # average only the runs that returned a first token
#     ## ROUTER AVG. TIME
#     tasks = [_router_completion() for _ in range(n)]
#     chat_completions = await asyncio.gather(*tasks)
#     successful_completions = [c for c in chat_completions if c is not None]
#     total_time = 0
#     for item in successful_completions:
#         total_time += item
#     avg_router_time = total_time / len(successful_completions)  # average only the runs that returned a first token
#     ## COMPARE
#     print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
#     assert avg_router_time < avg_openai_time + 0.5

# # asyncio.run(test_azure_completion_streaming())
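
Below is a minimal sketch of how the disabled comparison above could be re-enabled as a live test under the pytest-asyncio plugin. It assumes the commented-out _openai_completion and _router_completion helpers are restored; the test name, the statistics.mean averaging, the skip guard, and the reuse of the 0.5s tolerance are illustrative assumptions, not code that exists in the repository.

import asyncio
import statistics

import pytest


@pytest.mark.asyncio
async def test_router_ttft_overhead():
    # Hypothetical re-enabled version of the commented-out test above.
    n = 3  # number of concurrent streaming calls per client
    openai_ttfts = [
        t
        for t in await asyncio.gather(*[_openai_completion() for _ in range(n)])
        if t is not None
    ]
    router_ttfts = [
        t
        for t in await asyncio.gather(*[_router_completion() for _ in range(n)])
        if t is not None
    ]
    # Skip instead of dividing by zero when every call failed (e.g. missing Azure credentials).
    if not openai_ttfts or not router_ttfts:
        pytest.skip("no successful streaming completions to compare")
    # Router time-to-first-token should stay within 0.5s of the raw AsyncAzureOpenAI SDK call.
    assert statistics.mean(router_ttfts) < statistics.mean(openai_ttfts) + 0.5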