"""Timing-based check that repeating a completion call is served faster with litellm.cache enabled."""
import os
import sys
import time
import traceback

from dotenv import load_dotenv

# Load variables from a .env file before the imports below run.
load_dotenv()

# Puts the directory two levels up (resolved against the current working
# directory) at the front of the import path.
# NOTE(review): presumably so the `litellm` imported below resolves to the
# local checkout rather than an installed copy -- confirm.
sys.path.insert(0, os.path.abspath('../..'))

import pytest
import litellm
from litellm import embedding, completion

# set cache to True
litellm.cache = True
# NOTE(review): the meaning of this threshold is not visible in this file;
# check the litellm caching docs before changing it.
litellm.cache_similarity_threshold = 0.5

user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]


def _timed_completion():
    """Run one gpt-4 completion on ``messages`` and time it.

    Returns:
        A ``(response, elapsed_seconds)`` tuple.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    response = completion(model="gpt-4", messages=messages)
    return response, time.perf_counter() - start


def test_completion_gpt():
    """Make the same completion call twice and compare the response times.

    The test fails if the second call takes more than half as long as the
    first, which is taken as a sign that the cache is not working. Any
    exception raised along the way is reported as a test failure.
    """
    try:
        # Only the timing of the first call matters; its response is discarded.
        _, first_call_time = _timed_completion()
        print(f"first call: {first_call_time}")

        response, second_call_time = _timed_completion()
        print(f"second call: {second_call_time}")

        # the 2nd call should take no more than half of the first call
        if second_call_time > first_call_time / 2:
            pytest.fail("Cache is not working")

        # Add any assertions here to check the response
        print(response)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")