diff --git a/litellm/proxy/_test_config.yaml b/litellm/proxy/_test_config.yaml
deleted file mode 100644
index 18210a0a3..000000000
--- a/litellm/proxy/_test_config.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-model_list:
-  - model_name: azure-canada
-    litellm_params:
-      model: "azure/gpt-35-turbo"
-      api_key: "73f9a1f564494ce19a26d69afb124219"
-      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
-      api_version: "2023-07-01-preview"
-
-general_settings:
-  master_key: sk-1234
\ No newline at end of file
diff --git a/litellm/tests/test_azure_perf.py b/litellm/tests/test_azure_perf.py
deleted file mode 100644
index 67cb41863..000000000
--- a/litellm/tests/test_azure_perf.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#### What this tests ####
-# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
-import sys, os, time, inspect, asyncio, traceback
-from datetime import datetime
-import pytest
-sys.path.insert(0, os.path.abspath('../..'))
-import openai, litellm, uuid
-from openai import AsyncAzureOpenAI
-
-client = AsyncAzureOpenAI(
-    api_key=os.getenv("AZURE_API_KEY"),
-    azure_endpoint=os.getenv("AZURE_API_BASE"),
-    api_version=os.getenv("AZURE_API_VERSION")
-)
-
-model_list = [
-    {
-        "model_name": "azure-test",
-        "litellm_params": {
-            "model": "azure/chatgpt-v-2",
-            "api_key": os.getenv("AZURE_API_KEY"),
-            "api_base": os.getenv("AZURE_API_BASE"),
-            "api_version": os.getenv("AZURE_API_VERSION")
-        }
-    }
-]
-
-router = litellm.Router(model_list=model_list)
-
-async def _openai_completion():
-    try:
-        start_time = time.time()
-        response = await client.chat.completions.create(
-            model="chatgpt-v-2",
-            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
-            stream=True
-        )
-        time_to_first_token = None
-        first_token_ts = None
-        init_chunk = None
-        async for chunk in response:
-            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
-                first_token_ts = time.time()
-                time_to_first_token = first_token_ts - start_time
-                init_chunk = chunk
-        end_time = time.time()
-        print("OpenAI Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time)
-        return time_to_first_token
-    except Exception as e:
-        print(e)
-        return None
-
-async def _router_completion():
-    try:
-        start_time = time.time()
-        response = await router.acompletion(
-            model="azure-test",
-            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
-            stream=True
-        )
-        time_to_first_token = None
-        first_token_ts = None
-        init_chunk = None
-        async for chunk in response:
-            if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
-                first_token_ts = time.time()
-                time_to_first_token = first_token_ts - start_time
-                init_chunk = chunk
-        end_time = time.time()
-        print("Router Call: ", init_chunk, start_time, first_token_ts, time_to_first_token, end_time - first_token_ts)
-        return time_to_first_token
-    except Exception as e:
-        print(e)
-        return None
-
-async def test_azure_completion_streaming():
-    """
-    Test azure streaming call - measure on time to first (non-null) token.
-    """
-    n = 3  # Number of concurrent tasks
-    ## OPENAI AVG. TIME
-    tasks = [_openai_completion() for _ in range(n)]
-    chat_completions = await asyncio.gather(*tasks)
-    successful_completions = [c for c in chat_completions if c is not None]
-    total_time = 0
-    for item in successful_completions:
-        total_time += item
-    avg_openai_time = total_time/3
-    ## ROUTER AVG. TIME
-    tasks = [_router_completion() for _ in range(n)]
-    chat_completions = await asyncio.gather(*tasks)
-    successful_completions = [c for c in chat_completions if c is not None]
-    total_time = 0
-    for item in successful_completions:
-        total_time += item
-    avg_router_time = total_time/3
-    ## COMPARE
-    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
-    assert avg_router_time < avg_openai_time + 0.5
-
-asyncio.run(test_azure_completion_streaming())
\ No newline at end of file