diff --git a/requirements.txt b/requirements.txt index 0ac95fc96..b22edea09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # LITELLM PROXY DEPENDENCIES # anyio==4.4.0 # openai + http req. -openai==1.54.0 # openai req. +openai==1.55.3 # openai req. fastapi==0.111.0 # server dep backoff==2.2.1 # server dep pyyaml==6.0.0 # server dep diff --git a/tests/local_testing/test_azure_perf.py b/tests/local_testing/test_azure_perf.py index 8afc59f92..b7d7abd55 100644 --- a/tests/local_testing/test_azure_perf.py +++ b/tests/local_testing/test_azure_perf.py @@ -1,128 +1,128 @@ -#### What this tests #### -# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk. -import sys, os, time, inspect, asyncio, traceback -from datetime import datetime -import pytest +# #### What this tests #### +# # This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk. +# import sys, os, time, inspect, asyncio, traceback +# from datetime import datetime +# import pytest -sys.path.insert(0, os.path.abspath("../..")) -import openai, litellm, uuid -from openai import AsyncAzureOpenAI +# sys.path.insert(0, os.path.abspath("../..")) +# import openai, litellm, uuid +# from openai import AsyncAzureOpenAI -client = AsyncAzureOpenAI( - api_key=os.getenv("AZURE_API_KEY"), - azure_endpoint=os.getenv("AZURE_API_BASE"), # type: ignore - api_version=os.getenv("AZURE_API_VERSION"), -) +# client = AsyncAzureOpenAI( +# api_key=os.getenv("AZURE_API_KEY"), +# azure_endpoint=os.getenv("AZURE_API_BASE"), # type: ignore +# api_version=os.getenv("AZURE_API_VERSION"), +# ) -model_list = [ - { - "model_name": "azure-test", - "litellm_params": { - "model": "azure/chatgpt-v-2", - "api_key": os.getenv("AZURE_API_KEY"), - "api_base": os.getenv("AZURE_API_BASE"), - "api_version": os.getenv("AZURE_API_VERSION"), - }, - } -] +# model_list = [ +# { +# "model_name": "azure-test", +# "litellm_params": { +# "model": "azure/chatgpt-v-2", +# "api_key": os.getenv("AZURE_API_KEY"), +# "api_base": os.getenv("AZURE_API_BASE"), +# "api_version": os.getenv("AZURE_API_VERSION"), +# }, +# } +# ] -router = litellm.Router(model_list=model_list) # type: ignore +# router = litellm.Router(model_list=model_list) # type: ignore -async def _openai_completion(): - try: - start_time = time.time() - response = await client.chat.completions.create( - model="chatgpt-v-2", - messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], - stream=True, - ) - time_to_first_token = None - first_token_ts = None - init_chunk = None - async for chunk in response: - if ( - time_to_first_token is None - and len(chunk.choices) > 0 - and chunk.choices[0].delta.content is not None - ): - first_token_ts = time.time() - time_to_first_token = first_token_ts - start_time - init_chunk = chunk - end_time = time.time() - print( - "OpenAI Call: ", - init_chunk, - start_time, - first_token_ts, - time_to_first_token, - end_time, - ) - return time_to_first_token - except Exception as e: - print(e) - return None +# async def _openai_completion(): +# try: +# start_time = time.time() +# response = await client.chat.completions.create( +# model="chatgpt-v-2", +# messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], +# stream=True, +# ) +# time_to_first_token = None +# first_token_ts = None +# init_chunk = None +# async for chunk in response: +# if ( +# time_to_first_token is None +# and len(chunk.choices) > 0 +# and chunk.choices[0].delta.content is not None +# ): +# first_token_ts = time.time() +# time_to_first_token = first_token_ts - start_time +# init_chunk = chunk +# end_time = time.time() +# print( +# "OpenAI Call: ", +# init_chunk, +# start_time, +# first_token_ts, +# time_to_first_token, +# end_time, +# ) +# return time_to_first_token +# except Exception as e: +# print(e) +# return None -async def _router_completion(): - try: - start_time = time.time() - response = await router.acompletion( - model="azure-test", - messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], - stream=True, - ) - time_to_first_token = None - first_token_ts = None - init_chunk = None - async for chunk in response: - if ( - time_to_first_token is None - and len(chunk.choices) > 0 - and chunk.choices[0].delta.content is not None - ): - first_token_ts = time.time() - time_to_first_token = first_token_ts - start_time - init_chunk = chunk - end_time = time.time() - print( - "Router Call: ", - init_chunk, - start_time, - first_token_ts, - time_to_first_token, - end_time - first_token_ts, - ) - return time_to_first_token - except Exception as e: - print(e) - return None +# async def _router_completion(): +# try: +# start_time = time.time() +# response = await router.acompletion( +# model="azure-test", +# messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], +# stream=True, +# ) +# time_to_first_token = None +# first_token_ts = None +# init_chunk = None +# async for chunk in response: +# if ( +# time_to_first_token is None +# and len(chunk.choices) > 0 +# and chunk.choices[0].delta.content is not None +# ): +# first_token_ts = time.time() +# time_to_first_token = first_token_ts - start_time +# init_chunk = chunk +# end_time = time.time() +# print( +# "Router Call: ", +# init_chunk, +# start_time, +# first_token_ts, +# time_to_first_token, +# end_time - first_token_ts, +# ) +# return time_to_first_token +# except Exception as e: +# print(e) +# return None -async def test_azure_completion_streaming(): - """ - Test azure streaming call - measure on time to first (non-null) token. - """ - n = 3 # Number of concurrent tasks - ## OPENAI AVG. TIME - tasks = [_openai_completion() for _ in range(n)] - chat_completions = await asyncio.gather(*tasks) - successful_completions = [c for c in chat_completions if c is not None] - total_time = 0 - for item in successful_completions: - total_time += item - avg_openai_time = total_time / 3 - ## ROUTER AVG. TIME - tasks = [_router_completion() for _ in range(n)] - chat_completions = await asyncio.gather(*tasks) - successful_completions = [c for c in chat_completions if c is not None] - total_time = 0 - for item in successful_completions: - total_time += item - avg_router_time = total_time / 3 - ## COMPARE - print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}") - assert avg_router_time < avg_openai_time + 0.5 +# async def test_azure_completion_streaming(): +# """ +# Test azure streaming call - measure on time to first (non-null) token. +# """ +# n = 3 # Number of concurrent tasks +# ## OPENAI AVG. TIME +# tasks = [_openai_completion() for _ in range(n)] +# chat_completions = await asyncio.gather(*tasks) +# successful_completions = [c for c in chat_completions if c is not None] +# total_time = 0 +# for item in successful_completions: +# total_time += item +# avg_openai_time = total_time / 3 +# ## ROUTER AVG. TIME +# tasks = [_router_completion() for _ in range(n)] +# chat_completions = await asyncio.gather(*tasks) +# successful_completions = [c for c in chat_completions if c is not None] +# total_time = 0 +# for item in successful_completions: +# total_time += item +# avg_router_time = total_time / 3 +# ## COMPARE +# print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}") +# assert avg_router_time < avg_openai_time + 0.5 -# asyncio.run(test_azure_completion_streaming()) +# # asyncio.run(test_azure_completion_streaming())