diff --git a/litellm/caching.py b/litellm/caching.py
index 4f22bf2ef..a9f149575 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -30,6 +30,7 @@ class RedisCache():
             # cached_response is in `b{} convert it to ModelResponse
             cached_response = cached_response.decode("utf-8")  # Convert bytes to string
             cached_response = json.loads(cached_response)  # Convert string to dictionary
+            cached_response['cache'] = True  # set cache-hit flag to True
             return cached_response
diff --git a/litellm/main.py b/litellm/main.py
index 008d37362..70be3df0c 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -114,6 +114,7 @@ def completion(
     top_k=40,
     request_timeout=0,  # unused var for old version of OpenAI API
     fallbacks=[],
+    caching = False,
 ) -> ModelResponse:
     args = locals()
     try:
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index a1d97c1e6..5cc455c08 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -12,7 +12,7 @@ import pytest
 import litellm
 from litellm import embedding, completion
 from litellm.caching import Cache
-litellm.set_verbose=True
+# litellm.set_verbose=True
 
 messages = [{"role": "user", "content": "who is ishaan Github? "}]
 # comment
@@ -270,5 +270,26 @@ def test_redis_cache_completion():
 
 # test_redis_cache_completion()
 
+# redis cache with custom keys
+def custom_get_cache_key(*args, **kwargs):
+    # return key to use for your cache:
+    key = kwargs.get("model", "") + str(kwargs.get("messages", "")) + str(kwargs.get("temperature", "")) + str(kwargs.get("logit_bias", ""))
+    print("key for cache", key)
+    return key
+
+def test_custom_redis_cache_with_key():
+    messages = [{"role": "user", "content": "how many stars does litellm have? "}]
+    litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
+    litellm.cache.get_cache_key = custom_get_cache_key
+
+    response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=False)
+
+    print(f"response1: {response1}")
+    print(f"response2: {response2}")
+    print(f"response3: {response3}")
+
+# test_custom_redis_cache_with_key()
diff --git a/litellm/utils.py b/litellm/utils.py
index cb3424cfc..7c7ff2bb5 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -520,12 +520,13 @@ def client(original_function):
         if (litellm.caching or litellm.caching_with_models) and litellm.cache is None:
             litellm.cache = Cache()
 
-        # checking cache
-        if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
-            print_verbose(f"LiteLLM: Checking Cache")
-            cached_result = litellm.cache.get_cache(*args, **kwargs)
-            if cached_result != None:
-                return cached_result
+        if kwargs.get("caching", False):  # allow users to control returning cached responses from the completion function
+            # checking cache
+            if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
+                print_verbose(f"LiteLLM: Checking Cache")
+                cached_result = litellm.cache.get_cache(*args, **kwargs)
+                if cached_result != None:
+                    return cached_result
 
         # MODEL CALL
         result = original_function(*args, **kwargs)
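
Minimal usage sketch of the new per-call `caching` flag (illustrative only, not part of the patch; the model name and messages are placeholders, and it assumes the default Cache() backend that utils.py instantiates when caching is enabled):

# Usage sketch (not part of the patch): exercising the new `caching` parameter.
# Placeholder model/messages; assumes API credentials are configured.
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # same default backend utils.py creates when litellm.caching is set

messages = [{"role": "user", "content": "hello"}]

# caching=True enables the cache lookup added in utils.py
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)

# caching=False (the new default) skips the cache check entirely
response3 = completion(model="gpt-3.5-turbo", messages=messages, caching=False)

# With the Redis backend, cache hits come back tagged with response['cache'] == True
# (see the caching.py change above), so callers can tell cached from fresh responses.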