diff --git a/litellm/caching.py b/litellm/caching.py
index 4f22bf2ef..a9f149575 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -30,6 +30,7 @@ class RedisCache():
             # cached_response is in `b{} convert it to ModelResponse
             cached_response = cached_response.decode("utf-8")  # Convert bytes to string
             cached_response = json.loads(cached_response)  # Convert string to dictionary
+            cached_response['cache'] = True  # set cache-hit flag to True
             return cached_response
diff --git a/litellm/main.py b/litellm/main.py
index 008d37362..70be3df0c 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -114,6 +114,7 @@ def completion(
     top_k=40,
     request_timeout=0,  # unused var for old version of OpenAI API
     fallbacks=[],
+    caching = False,
 ) -> ModelResponse:
     args = locals()
     try:
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index a1d97c1e6..5cc455c08 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -12,7 +12,7 @@ import pytest
 import litellm
 from litellm import embedding, completion
 from litellm.caching import Cache
-litellm.set_verbose=True
+# litellm.set_verbose=True
 
 messages = [{"role": "user", "content": "who is ishaan Github? "}]
 # comment
@@ -270,5 +270,26 @@ def test_redis_cache_completion():
 
 # test_redis_cache_completion()
 
+# redis cache with custom keys
+def custom_get_cache_key(*args, **kwargs):
+    # return key to use for your cache:
+    key = kwargs.get("model", "") + str(kwargs.get("messages", "")) + str(kwargs.get("temperature", "")) + str(kwargs.get("logit_bias", ""))
+    print("key for cache", key)
+    return key
+
+def test_custom_redis_cache_with_key():
+    messages = [{"role": "user", "content": "how many stars does litellm have? "}]
+    litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
+    litellm.cache.get_cache_key = custom_get_cache_key
+
+    response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
+    response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=False)
+
+    print(f"response1: {response1}")
+    print(f"response2: {response2}")
+    print(f"response3: {response3}")
+
+# test_custom_redis_cache_with_key()
diff --git a/litellm/utils.py b/litellm/utils.py
index cb3424cfc..7c7ff2bb5 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -520,12 +520,13 @@ def client(original_function):
         if (litellm.caching or litellm.caching_with_models) and litellm.cache is None:
             litellm.cache = Cache()
 
-        # checking cache
-        if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
-            print_verbose(f"LiteLLM: Checking Cache")
-            cached_result = litellm.cache.get_cache(*args, **kwargs)
-            if cached_result != None:
-                return cached_result
+        if kwargs.get("caching", False):  # allow users to control returning cached responses from the completion function
+            # checking cache
+            if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
+                print_verbose(f"LiteLLM: Checking Cache")
+                cached_result = litellm.cache.get_cache(*args, **kwargs)
+                if cached_result != None:
+                    return cached_result
 
         # MODEL CALL
         result = original_function(*args, **kwargs)
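
Minimal usage sketch of the new per-call `caching` flag (illustrative only, not part of the patch; the model name and messages are placeholders, and it assumes the default Cache() backend that utils.py instantiates when caching is enabled):

# Usage sketch (not part of the patch): exercising the new `caching` parameter.
# Placeholder model/messages; assumes API credentials are configured.
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # same default backend utils.py creates when litellm.caching is set

messages = [{"role": "user", "content": "hello"}]

# caching=True enables the cache lookup added in utils.py
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)

# caching=False (the new default) skips the cache check entirely
response3 = completion(model="gpt-3.5-turbo", messages=messages, caching=False)

# With the Redis backend, cache hits come back tagged with response['cache'] == True
# (see the caching.py change above), so callers can tell cached from fresh responses.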