forked from phoenix/litellm-mirror
refactor: move all testing to top-level of repo
Closes https://github.com/BerriAI/litellm/issues/486
This commit is contained in:
parent 5403c5828c
commit 3560f0ef2c
213 changed files with 74 additions and 217 deletions
128 tests/local_testing/test_azure_perf.py Normal file
@@ -0,0 +1,128 @@
#### What this tests ####
# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
import sys, os, time, inspect, asyncio, traceback
from datetime import datetime
import pytest

sys.path.insert(0, os.path.abspath("../.."))
import openai, litellm, uuid
from openai import AsyncAzureOpenAI

client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_API_BASE"),  # type: ignore
    api_version=os.getenv("AZURE_API_VERSION"),
)

model_list = [
    {
        "model_name": "azure-test",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION"),
        },
    }
]

router = litellm.Router(model_list=model_list)  # type: ignore


async def _openai_completion():
    try:
        start_time = time.time()
        response = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        print(
            "OpenAI Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            end_time,
        )
        return time_to_first_token
    except Exception as e:
        print(e)
        return None


async def _router_completion():
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        print(
            "Router Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            end_time - first_token_ts,
        )
        return time_to_first_token
    except Exception as e:
        print(e)
        return None


async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure on time to first (non-null) token.
    """
    n = 3  # Number of concurrent tasks
    ## OPENAI AVG. TIME
    tasks = [_openai_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    total_time = 0
    for item in successful_completions:
        total_time += item
    avg_openai_time = total_time / 3
    ## ROUTER AVG. TIME
    tasks = [_router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    total_time = 0
    for item in successful_completions:
        total_time += item
    avg_router_time = total_time / 3
    ## COMPARE
    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5


# asyncio.run(test_azure_completion_streaming())
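For reference, since the final asyncio.run(...) call in the new file is left commented out, one way to exercise the test is through pytest with an asyncio plugin. The runner below is a minimal sketch, assuming pytest-asyncio is installed and the AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION environment variables are set; neither assumption is stated in the diff.

# Runner sketch (assumptions: pytest-asyncio is installed; AZURE_* env vars are set).
import sys

import pytest

if __name__ == "__main__":
    # --asyncio-mode=auto lets pytest-asyncio collect the bare async test function;
    # -s keeps the timing print statements visible.
    sys.exit(pytest.main(["tests/local_testing/test_azure_perf.py", "--asyncio-mode=auto", "-s"]))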