[Fix] Performance - use in memory cache when downloading images from a url (#5657)
* fix use in memory cache when getting images
* fix linting
* fix load testing
* fix load test size
* fix load test size
* trigger ci/cd again

parent cdd7cd4d69 · commit cd8d7ca915

5 changed files with 249 additions and 38 deletions
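The change moves URL-to-base64 image conversion into a new module, litellm/llms/prompt_templates/image_handling.py, and fronts both the sync and async helpers with an in-memory cache holding at most 10 images, so repeated requests for the same image URL skip the download entirely. A minimal sketch of the read-through pattern the new helpers follow; InMemoryCache, get_cache, and set_cache all appear in the diff below, while `download` is a hypothetical stand-in for the HTTP fetch plus base64 encoding:

# Sketch of the read-through caching pattern introduced by this commit.
# `download` is a hypothetical stand-in for the fetch + encode step.
from litellm.caching import InMemoryCache

cache = InMemoryCache(max_size_in_memory=10)


def cached_convert(url: str, download) -> str:
    hit = cache.get_cache(url)
    if hit:
        return hit  # later calls for the same URL never touch the network
    result = download(url)  # yields a "data:<mime>;base64,<...>" string
    cache.set_cache(url, result)
    return result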
@@ -37,6 +37,8 @@ from litellm.types.llms.openai import (
 )
 from litellm.types.utils import GenericImageParsingChunk

+from .image_handling import async_convert_url_to_base64, convert_url_to_base64
+

 def default_pt(messages):
     return " ".join(message["content"] for message in messages)
@@ -703,44 +705,6 @@ def construct_tool_use_system_prompt(
     return tool_use_system_prompt


-def convert_url_to_base64(url):
-    import base64
-
-    client = HTTPHandler(concurrent_limit=1)
-    for _ in range(3):
-        try:
-            response = client.get(url)
-            break
-        except:
-            pass
-    if response.status_code == 200:
-        image_bytes = response.content
-        base64_image = base64.b64encode(image_bytes).decode("utf-8")
-
-        image_type = response.headers.get("Content-Type", None)
-        if image_type is not None:
-            img_type = image_type
-        else:
-            img_type = url.split(".")[-1].lower()
-            if img_type == "jpg" or img_type == "jpeg":
-                img_type = "image/jpeg"
-            elif img_type == "png":
-                img_type = "image/png"
-            elif img_type == "gif":
-                img_type = "image/gif"
-            elif img_type == "webp":
-                img_type = "image/webp"
-            else:
-                raise Exception(
-                    f"Error: Unsupported image format. Format={img_type}. Supported types = ['image/jpeg', 'image/png', 'image/gif', 'image/webp']"
-                )
-
-        return f"data:{img_type};base64,{base64_image}"
-    else:
-        raise Exception(f"Error: Unable to fetch image from URL. url={url}")
-
-
 def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
     """
     Input:
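One thing the removed version above got wrong: if all three client.get attempts raised, the loop fell through with `response` never bound, and the status check crashed with a NameError instead of a useful message. The replacement in image_handling.py (next file) returns inside the try and raises an explicit error after the loop. A minimal sketch of that retry shape, where `fetch` is a hypothetical stand-in for the HTTP call plus response processing:

# Retry shape used by the new helpers: return on the first successful attempt,
# raise a clear error once every attempt has failed. `fetch` is a hypothetical
# stand-in for client.get(url) followed by _process_image_response.
def fetch_with_retries(fetch, url: str, attempts: int = 3) -> str:
    for _ in range(attempts):
        try:
            return fetch(url)
        except Exception:
            continue
    raise Exception(
        f"Error: Unable to fetch image from URL after {attempts} attempts. url={url}"
    )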
litellm/llms/prompt_templates/image_handling.py (new file, 84 additions)
@@ -0,0 +1,84 @@
"""
Helper functions to handle images passed in messages
"""

import base64

from httpx import Response

import litellm
from litellm.caching import InMemoryCache
from litellm.llms.custom_httpx.http_handler import (
    _get_httpx_client,
    get_async_httpx_client,
)

MAX_IMGS_IN_MEMORY = 10

in_memory_cache = InMemoryCache(max_size_in_memory=MAX_IMGS_IN_MEMORY)


def _process_image_response(response: Response, url: str) -> str:
    if response.status_code != 200:
        raise Exception(
            f"Error: Unable to fetch image from URL. Status code: {response.status_code}, url={url}"
        )

    image_bytes = response.content
    base64_image = base64.b64encode(image_bytes).decode("utf-8")

    image_type = response.headers.get("Content-Type")
    if image_type is None:
        img_type = url.split(".")[-1].lower()
        _img_type = {
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "png": "image/png",
            "gif": "image/gif",
            "webp": "image/webp",
        }.get(img_type)
        if _img_type is None:
            raise Exception(
                f"Error: Unsupported image format. Format={_img_type}. Supported types = ['image/jpeg', 'image/png', 'image/gif', 'image/webp']"
            )
        img_type = _img_type
    else:
        img_type = image_type

    result = f"data:{img_type};base64,{base64_image}"
    in_memory_cache.set_cache(url, result)
    return result


async def async_convert_url_to_base64(url: str) -> str:
    cached_result = in_memory_cache.get_cache(url)
    if cached_result:
        return cached_result

    client = litellm.module_level_aclient
    for _ in range(3):
        try:
            response = await client.get(url)
            return _process_image_response(response, url)
        except:
            pass
    raise Exception(
        f"Error: Unable to fetch image from URL after 3 attempts. url={url}"
    )


def convert_url_to_base64(url: str) -> str:
    cached_result = in_memory_cache.get_cache(url)
    if cached_result:
        return cached_result

    client = litellm.module_level_client
    for _ in range(3):
        try:
            response = client.get(url)
            return _process_image_response(response, url)
        except:
            pass
    raise Exception(
        f"Error: Unable to fetch image from URL after 3 attempts. url={url}"
    )
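A quick usage sketch of the synchronous helper; the URL is the same illustrative one the load test below uses. The first call fetches and encodes the image and fills the cache; the second returns the cached data URI without a network call:

# Assumes litellm is installed and the URL is reachable; illustrative only.
from litellm.llms.prompt_templates.image_handling import convert_url_to_base64

url = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
first = convert_url_to_base64(url)   # HTTP fetch + base64 encode + cache fill
second = convert_url_to_base64(url)  # served from in_memory_cache, no fetch
assert first == second and first.startswith("data:image/")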
@@ -25,6 +25,7 @@ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.llms.prompt_templates.factory import anthropic_messages_pt

 # litellm.num_retries =3
 litellm.cache = None
 litellm.success_callback = []
 user_message = "Write a short poem about the sky"
tests/load_tests/test_vertex_load_tests.py (new file, 149 additions)
@@ -0,0 +1,149 @@
import sys
import os

sys.path.insert(0, os.path.abspath("../.."))

import asyncio
import litellm
import pytest
import time
import json
import tempfile
from dotenv import load_dotenv


def load_vertex_ai_credentials():
    # Define the path to the vertex_key.json file
    print("loading vertex ai credentials")
    filepath = os.path.dirname(os.path.abspath(__file__))
    vertex_key_path = filepath + "/vertex_key.json"

    # Read the existing content of the file or create an empty dictionary
    try:
        with open(vertex_key_path, "r") as file:
            # Read the file content
            print("Read vertexai file path")
            content = file.read()

            # If the file is empty or not valid JSON, create an empty dictionary
            if not content or not content.strip():
                service_account_key_data = {}
            else:
                # Attempt to load the existing JSON content
                file.seek(0)
                service_account_key_data = json.load(file)
    except FileNotFoundError:
        # If the file doesn't exist, create an empty dictionary
        service_account_key_data = {}

    # Update the service_account_key_data with environment variables
    private_key_id = os.environ.get("VERTEX_AI_PRIVATE_KEY_ID", "")
    private_key = os.environ.get("VERTEX_AI_PRIVATE_KEY", "")
    private_key = private_key.replace("\\n", "\n")
    service_account_key_data["private_key_id"] = private_key_id
    service_account_key_data["private_key"] = private_key

    # Create a temporary file
    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file:
        # Write the updated content to the temporary file
        json.dump(service_account_key_data, temp_file, indent=2)

    # Export the temporary file as GOOGLE_APPLICATION_CREDENTIALS
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(temp_file.name)


@pytest.mark.asyncio
async def test_vertex_load():
    try:
        load_vertex_ai_credentials()
        percentage_diffs = []

        for run in range(3):
            print(f"\nRun {run + 1}:")

            # Test with text-only message
            start_time_text = await make_async_calls(message_type="text")
            print("Done with text-only message test")

            # Test with text + image message
            start_time_image = await make_async_calls(message_type="image")
            print("Done with text + image message test")

            # Compare times and calculate percentage difference
            print(f"Time with text-only message: {start_time_text}")
            print(f"Time with text + image message: {start_time_image}")

            percentage_diff = (
                (start_time_image - start_time_text) / start_time_text * 100
            )
            percentage_diffs.append(percentage_diff)
            print(f"Performance difference: {percentage_diff:.2f}%")

        print("percentage_diffs", percentage_diffs)
        # Calculate average percentage difference
        avg_percentage_diff = sum(percentage_diffs) / len(percentage_diffs)
        print(f"\nAverage performance difference: {avg_percentage_diff:.2f}%")

        # Assert that the average difference is not more than 20%
        assert (
            avg_percentage_diff < 20
        ), f"Average performance difference of {avg_percentage_diff:.2f}% exceeds 20% threshold"

    except litellm.Timeout as e:
        pass
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")


async def make_async_calls(message_type="text"):
    total_tasks = 3
    batch_size = 1
    total_time = 0

    for batch in range(3):
        tasks = [create_async_task(message_type) for _ in range(batch_size)]

        start_time = asyncio.get_event_loop().time()
        responses = await asyncio.gather(*tasks)

        for idx, response in enumerate(responses):
            print(f"Response from Task {batch * batch_size + idx + 1}: {response}")

        await asyncio.sleep(1)

        batch_time = asyncio.get_event_loop().time() - start_time
        total_time += batch_time

    return total_time


def create_async_task(message_type):
    base_url = "https://exampleopenaiendpoint-production.up.railway.app/v1/projects/adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001"

    if message_type == "text":
        messages = [{"role": "user", "content": "hi"}]
    else:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
                        },
                    },
                ],
            }
        ]

    completion_args = {
        "model": "vertex_ai/gemini",
        "messages": messages,
        "max_tokens": 5,
        "temperature": 0.7,
        "timeout": 10,
        "api_base": base_url,
    }
    return asyncio.create_task(litellm.acompletion(**completion_args))
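For concreteness on the pass criterion, a worked instance of the arithmetic in test_vertex_load (the timings are hypothetical):

# If the text-only batches total 2.0s and the text+image batches 2.3s:
start_time_text = 2.0
start_time_image = 2.3
percentage_diff = (start_time_image - start_time_text) / start_time_text * 100
assert percentage_diff < 20  # 15.0% here; the real test averages this over 3 runs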
tests/load_tests/vertex_key.json (new file, 13 additions)
@@ -0,0 +1,13 @@
{
  "type": "service_account",
  "project_id": "adroit-crow-413218",
  "private_key_id": "",
  "private_key": "",
  "client_email": "test-adroit-crow@adroit-crow-413218.iam.gserviceaccount.com",
  "client_id": "104886546564708740969",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test-adroit-crow%40adroit-crow-413218.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}
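The blank private_key_id and private_key fields in this template are intentional: load_vertex_ai_credentials() in the test above reads this file, fills those two fields from the VERTEX_AI_PRIVATE_KEY_ID and VERTEX_AI_PRIVATE_KEY environment variables, writes the result to a temporary file, and points GOOGLE_APPLICATION_CREDENTIALS at it.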