test(test_azure_perf.py): add perf testing for router streaming

2023-12-23 13:16:49 +05:30 · 2023-12-23 13:16:49 +05:30 · 4fa7f19888
commit 4fa7f19888
parent 49932ac90a
2 changed files with 111 additions and 0 deletions
--- a/litellm/proxy/_test_config.yaml
+++ b/litellm/proxy/_test_config.yaml
@ -0,0 +1,10 @@
+# Proxy test config: one Azure deployment plus a placeholder master key.
+model_list:
+  - model_name: azure-canada
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      # Credentials must never be committed. LiteLLM resolves `os.environ/<VAR>`
+      # values from the process environment when the config is loaded.
+      # NOTE(review): the key previously committed here should be treated as
+      # leaked and rotated.
+      api_key: "os.environ/AZURE_CANADA_API_KEY"
+      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
+      api_version: "2023-07-01-preview"
+
+general_settings:
+  master_key: sk-1234
--- a/litellm/tests/test_azure_perf.py
+++ b/litellm/tests/test_azure_perf.py
@ -0,0 +1,101 @@
+#### What this tests ####
+#    This adds perf testing to the router, to ensure its average time-to-first-token is never more than 0.5s (500ms) slower than the azure-openai sdk.
+import sys, os, time, inspect, asyncio, traceback
+from datetime import datetime
+import pytest
+sys.path.insert(0, os.path.abspath('../..'))
+import openai, litellm, uuid
+from openai import AsyncAzureOpenAI
+
+client = AsyncAzureOpenAI(
+    api_key=os.getenv("AZURE_API_KEY"),
+    azure_endpoint=os.getenv("AZURE_API_BASE"),
+    api_version=os.getenv("AZURE_API_VERSION")
+)
+
+model_list = [
+  {
+    "model_name": "azure-test",
+    "litellm_params": {
+      "model": "azure/chatgpt-v-2",
+      "api_key": os.getenv("AZURE_API_KEY"),
+      "api_base": os.getenv("AZURE_API_BASE"),
+      "api_version": os.getenv("AZURE_API_VERSION")
+    }
+  }
+]
+
+router = litellm.Router(model_list=model_list)
+
+async def _openai_completion():
+  try:
+    start_time = time.time()
+    response = await client.chat.completions.create(
+              model="chatgpt-v-2",
+              messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+              stream=True
+          )
+    time_to_first_token = None
+    first_token_ts = None
+    init_chunk = None
+    async for chunk in response: 
+      if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
+        first_token_ts = time.time()
+        time_to_first_token = first_token_ts - start_time
+        init_chunk = chunk
+    end_time = time.time()
+    print("OpenAI Call: ",init_chunk, start_time, first_token_ts, time_to_first_token, end_time)
+    return time_to_first_token
+  except Exception as e:
+    print(e)
+    return None
+
+async def _router_completion():
+  try:
+    start_time = time.time()
+    response = await router.acompletion(
+              model="azure-test",
+              messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+              stream=True
+          )
+    time_to_first_token = None
+    first_token_ts = None
+    init_chunk = None
+    async for chunk in response: 
+      if time_to_first_token is None and len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
+        first_token_ts = time.time()
+        time_to_first_token = first_token_ts - start_time
+        init_chunk = chunk
+    end_time = time.time()
+    print("Router Call: ",init_chunk, start_time, first_token_ts, time_to_first_token, end_time - first_token_ts)
+    return time_to_first_token
+  except Exception as e:
+    print(e)
+    return None
+
+async def test_azure_completion_streaming(): 
+    """
+    Test azure streaming call - measure on time to first (non-null) token. 
+    """
+    n = 3  # Number of concurrent tasks
+    ## OPENAI AVG. TIME
+    tasks = [_openai_completion() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    total_time = 0
+    for item in successful_completions: 
+      total_time += item
+    avg_openai_time = total_time/3
+    ## ROUTER AVG. TIME
+    tasks = [_router_completion() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    total_time = 0
+    for item in successful_completions: 
+      total_time += item
+    avg_router_time = total_time/3
+    ## COMPARE
+    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
+    assert avg_router_time < avg_openai_time + 0.5
+
+asyncio.run(test_azure_completion_streaming())