Mirror of https://github.com/BerriAI/litellm.git

commit 0ab62f13e8: caching updates
parent 47c1f57a24
4 changed files with 31 additions and 7 deletions
```diff
@@ -30,6 +30,7 @@ class RedisCache():
             # cached_response is in `b{}` convert it to ModelResponse
             cached_response = cached_response.decode("utf-8") # Convert bytes to string
             cached_response = json.loads(cached_response) # Convert string to dictionary
+            cached_response['cache'] = True # set cache-hit flag to True
             return cached_response
```
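The added line tags every Redis hit so callers can tell a cached response from a fresh one. A minimal, self-contained sketch of that decode-and-flag round trip, with a plain dict standing in for the Redis client (redis-py returns stored values as bytes, which is why the decode step exists):

```python
import json

store = {}  # plain dict standing in for a Redis client

def set_cache(key, value):
    # redis-py stores and returns values as bytes
    store[key] = json.dumps(value).encode("utf-8")

def get_cache(key):
    cached_response = store.get(key)
    if cached_response is not None:
        cached_response = cached_response.decode("utf-8")  # Convert bytes to string
        cached_response = json.loads(cached_response)      # Convert string to dictionary
        cached_response['cache'] = True                    # set cache-hit flag to True
        return cached_response

set_cache("k", {"choices": [{"message": {"content": "hi"}}]})
print(get_cache("k")["cache"])  # True
```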
```diff
@@ -114,6 +114,7 @@ def completion(
     top_k=40,
     request_timeout=0, # unused var for old version of OpenAI API
     fallbacks=[],
+    caching = False,
 ) -> ModelResponse:
     args = locals()
     try:
```
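With the new `caching` parameter, cache reads become a per-call opt-in rather than a global switch. A sketch of how a caller might use it, assuming an OpenAI key is configured and using the same no-argument `Cache()` constructor that appears in the last hunk below:

```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # default cache; the Redis variant is shown in the tests below

messages = [{"role": "user", "content": "Hey, how's it going?"}]
# First call is a miss and hits the API; the second identical call
# can be served from the cache because caching=True.
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
```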
```diff
@@ -12,7 +12,7 @@ import pytest
 import litellm
 from litellm import embedding, completion
 from litellm.caching import Cache
-litellm.set_verbose=True
+# litellm.set_verbose=True
 
 messages = [{"role": "user", "content": "who is ishaan Github? "}]
 # comment
```
```diff
@@ -270,5 +270,26 @@ def test_redis_cache_completion():
 
 # test_redis_cache_completion()
 
+
+# redis cache with custom keys
+def custom_get_cache_key(*args, **kwargs):
+    # return key to use for your cache:
+    key = kwargs.get("model", "") + str(kwargs.get("messages", "")) + str(kwargs.get("temperature", "")) + str(kwargs.get("logit_bias", ""))
+    print("key for cache", key)
+    return key
+
+def test_custom_redis_cache_with_key():
+    messages = [{"role": "user", "content": "how many stars does litellm have? "}]
+    litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
+    litellm.cache.get_cache_key = custom_get_cache_key
+
+    response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=False)
+
+    print(f"response1: {response1}")
+    print(f"response2: {response2}")
+    print(f"response3: {response3}")
+
+# test_custom_redis_cache_with_key()
```
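The custom key concatenates model, messages, temperature, and logit_bias, so any two calls that agree on those fields resolve to the same cache entry. That property can be checked offline, without Redis or an API key:

```python
def custom_get_cache_key(*args, **kwargs):
    # same key function as in the test above
    return kwargs.get("model", "") + str(kwargs.get("messages", "")) + str(kwargs.get("temperature", "")) + str(kwargs.get("logit_bias", ""))

msgs = [{"role": "user", "content": "how many stars does litellm have? "}]
k1 = custom_get_cache_key(model="gpt-3.5-turbo", messages=msgs, temperature=0.1)
k2 = custom_get_cache_key(model="gpt-3.5-turbo", messages=msgs, temperature=0.1)
k3 = custom_get_cache_key(model="gpt-3.5-turbo", messages=msgs, temperature=0.7)

assert k1 == k2  # identical params -> same key -> second call is a cache hit
assert k1 != k3  # different temperature -> different key -> cache miss
```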
```diff
@@ -520,12 +520,13 @@ def client(original_function):
         if (litellm.caching or litellm.caching_with_models) and litellm.cache is None:
             litellm.cache = Cache()
 
-        # checking cache
-        if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
-            print_verbose(f"LiteLLM: Checking Cache")
-            cached_result = litellm.cache.get_cache(*args, **kwargs)
-            if cached_result != None:
-                return cached_result
+        if kwargs.get("caching", False): # allow users to control returning cached responses from the completion function
+            # checking cache
+            if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
+                print_verbose(f"LiteLLM: Checking Cache")
+                cached_result = litellm.cache.get_cache(*args, **kwargs)
+                if cached_result != None:
+                    return cached_result
 
         # MODEL CALL
         result = original_function(*args, **kwargs)
```
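The wrapper now consults the cache only when the caller passed `caching=True`; otherwise it falls straight through to the model call. A simplified, self-contained sketch of that decorator pattern (the dict cache, the `make_key` helper, and the write-back after the call are illustrative stand-ins, not litellm's API; the real key logic lives in `Cache.get_cache_key`, and the write-back happens elsewhere in `client()`):

```python
import functools

_cache = {}

def make_key(kwargs):
    # hypothetical key helper; litellm uses Cache.get_cache_key instead
    return str(kwargs.get("model")) + str(kwargs.get("messages"))

def with_cache(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if kwargs.get("caching", False):  # per-call opt-in, as in the hunk above
            key = make_key(kwargs)
            if key in _cache:
                return _cache[key]        # cache hit: skip the model call
            result = fn(*args, **kwargs)  # MODEL CALL
            _cache[key] = result          # write-back (simplified)
            return result
        return fn(*args, **kwargs)        # caching disabled: always call through
    return wrapper

@with_cache
def fake_completion(model=None, messages=None, caching=False):
    return {"model": model, "echo": messages}

r1 = fake_completion(model="m", messages="hi", caching=True)  # miss: function runs
r2 = fake_completion(model="m", messages="hi", caching=True)  # hit: same object returned
assert r1 is r2
```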