[Fix] Performance - use in memory cache when downloading images from a url (#5657)
* fix use in memory cache when getting images
* fix linting
* fix load testing
* fix load test size
* fix load test size
* trigger ci/cd again
This commit is contained in:
parent cdd7cd4d69
commit cd8d7ca915
5 changed files with 249 additions and 38 deletions
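The caching change itself lives in the other changed files, which are not shown in this hunk. As a rough illustration of the idea in the commit title (keep downloaded image bytes in process memory, keyed by URL, so repeated requests do not re-download them), a minimal sketch might look like this; the names _IMAGE_CACHE and fetch_image_cached are hypothetical, not litellm's actual identifiers:

# Illustrative sketch only: cache image bytes per URL in process memory.
# _IMAGE_CACHE and fetch_image_cached are hypothetical names, not litellm's.
import base64

import httpx

_IMAGE_CACHE: dict[str, str] = {}  # url -> base64-encoded image payload


def fetch_image_cached(url: str) -> str:
    """Return the base64 payload for url, downloading it at most once."""
    if url not in _IMAGE_CACHE:
        response = httpx.get(url)
        response.raise_for_status()
        _IMAGE_CACHE[url] = base64.b64encode(response.content).decode("utf-8")
    return _IMAGE_CACHE[url]

This is the access pattern the load test below exercises: the same image URL is sent on every request, so only the first call should pay the download cost.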
tests/load_tests/test_vertex_load_tests.py (new file, 149 lines)
@@ -0,0 +1,149 @@
import sys
import os

sys.path.insert(0, os.path.abspath("../.."))

import asyncio
import litellm
import pytest
import time
import json
import tempfile
from dotenv import load_dotenv


def load_vertex_ai_credentials():
    # Define the path to the vertex_key.json file
    print("loading vertex ai credentials")
    filepath = os.path.dirname(os.path.abspath(__file__))
    vertex_key_path = filepath + "/vertex_key.json"

    # Read the existing content of the file or create an empty dictionary
    try:
        with open(vertex_key_path, "r") as file:
            # Read the file content
            print("Read vertexai file path")
            content = file.read()

            # If the file is empty or not valid JSON, create an empty dictionary
            if not content or not content.strip():
                service_account_key_data = {}
            else:
                # Attempt to load the existing JSON content
                file.seek(0)
                service_account_key_data = json.load(file)
    except FileNotFoundError:
        # If the file doesn't exist, create an empty dictionary
        service_account_key_data = {}

    # Update the service_account_key_data with environment variables
    private_key_id = os.environ.get("VERTEX_AI_PRIVATE_KEY_ID", "")
    private_key = os.environ.get("VERTEX_AI_PRIVATE_KEY", "")
    private_key = private_key.replace("\\n", "\n")
    service_account_key_data["private_key_id"] = private_key_id
    service_account_key_data["private_key"] = private_key

    # Create a temporary file
    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file:
        # Write the updated content to the temporary file
        json.dump(service_account_key_data, temp_file, indent=2)

    # Export the temporary file as GOOGLE_APPLICATION_CREDENTIALS
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(temp_file.name)


@pytest.mark.asyncio
async def test_vertex_load():
    try:
        load_vertex_ai_credentials()
        percentage_diffs = []

        for run in range(3):
            print(f"\nRun {run + 1}:")

            # Test with text-only message
            start_time_text = await make_async_calls(message_type="text")
            print("Done with text-only message test")

            # Test with text + image message
            start_time_image = await make_async_calls(message_type="image")
            print("Done with text + image message test")

            # Compare times and calculate percentage difference
            print(f"Time with text-only message: {start_time_text}")
            print(f"Time with text + image message: {start_time_image}")

            percentage_diff = (
                (start_time_image - start_time_text) / start_time_text * 100
            )
            percentage_diffs.append(percentage_diff)
            print(f"Performance difference: {percentage_diff:.2f}%")

        print("percentage_diffs", percentage_diffs)
        # Calculate average percentage difference
        avg_percentage_diff = sum(percentage_diffs) / len(percentage_diffs)
        print(f"\nAverage performance difference: {avg_percentage_diff:.2f}%")

        # Assert that the average difference is not more than 20%
        assert (
            avg_percentage_diff < 20
        ), f"Average performance difference of {avg_percentage_diff:.2f}% exceeds 20% threshold"

    except litellm.Timeout as e:
        pass
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")


async def make_async_calls(message_type="text"):
    total_tasks = 3
    batch_size = 1
    total_time = 0

    for batch in range(3):
        tasks = [create_async_task(message_type) for _ in range(batch_size)]

        start_time = asyncio.get_event_loop().time()
        responses = await asyncio.gather(*tasks)

        for idx, response in enumerate(responses):
            print(f"Response from Task {batch * batch_size + idx + 1}: {response}")

        await asyncio.sleep(1)

        batch_time = asyncio.get_event_loop().time() - start_time
        total_time += batch_time

    return total_time


def create_async_task(message_type):
    base_url = "https://exampleopenaiendpoint-production.up.railway.app/v1/projects/adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001"

    if message_type == "text":
        messages = [{"role": "user", "content": "hi"}]
    else:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
                        },
                    },
                ],
            }
        ]

    completion_args = {
        "model": "vertex_ai/gemini",
        "messages": messages,
        "max_tokens": 5,
        "temperature": 0.7,
        "timeout": 10,
        "api_base": base_url,
    }
    return asyncio.create_task(litellm.acompletion(**completion_args))
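The test runs under pytest; the @pytest.mark.asyncio marker requires the pytest-asyncio plugin. For reference, a worked example of the percentage math the assertion is based on, with illustrative numbers rather than measured ones:

# Worked example of the test's threshold math (illustrative numbers).
start_time_text = 3.0   # total seconds for the text-only batches
start_time_image = 3.3  # total seconds for the text + image batches
percentage_diff = (start_time_image - start_time_text) / start_time_text * 100
print(f"{percentage_diff:.2f}%")  # 10.00%, under the 20% threshold

Note that each batch time includes the fixed asyncio.sleep(1) that runs between the gather and the timing read, so the measured percentage difference understates the raw request-latency gap.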