import sys, os
import traceback
from dotenv import load_dotenv

load_dotenv()

sys.path.insert(0, os.path.abspath('../..'))  # Adds the parent directory to the system path
import pytest
import litellm
from litellm import embedding, completion

# set cache to True
litellm.cache = True
litellm.cache_similarity_threshold = 0.5
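# Assumption (not stated in the original file): with these older module-level litellm
# settings, cache = True appears to enable response caching, and cache_similarity_threshold
# presumably controls how similar a new prompt must be to a cached one to count as a hit;
# the identical repeated prompt below should therefore be served from the cache.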

user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]

def test_completion_gpt():
    try:
        # in this test, make the same call twice and measure the response time of each;
        # the 2nd response time should be less than half of the first, ensuring that the cache is working
        import time

        start = time.time()
        response = completion(model="gpt-4", messages=messages)
        end = time.time()
        first_call_time = end - start
        print(f"first call: {first_call_time}")

        start = time.time()
        response = completion(model="gpt-4", messages=messages)
        end = time.time()
        second_call_time = end - start
        print(f"second call: {second_call_time}")

        if second_call_time > first_call_time / 2:
            # the 2nd call should take less than half the time of the first call
            pytest.fail("Cache is not working")
        # Add any assertions here to check the response
        print(response)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
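
# A minimal way to run just this test locally (assuming pytest is installed and an
# OpenAI API key is available via the environment or the .env file loaded above):
#   pytest -s -k test_completion_gpt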