From d4a799a3ca97ce84f8491a8b372eaea7651292e5 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 5 Feb 2024 12:28:21 -0800
Subject: [PATCH 01/21] (feat) add semantic cache

---
 litellm/caching.py            | 102 +++++++++++++++++++++++++++++++++-
 litellm/tests/test_caching.py |  25 +++++++++
 2 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/litellm/caching.py b/litellm/caching.py
index d0721fe9a..e1ef95dc3 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -83,7 +83,6 @@ class InMemoryCache(BaseCache):
         self.cache_dict.clear()
         self.ttl_dict.clear()
 
-
     async def disconnect(self):
         pass
 
@@ -217,7 +216,6 @@ class RedisCache(BaseCache):
     def flush_cache(self):
         self.redis_client.flushall()
 
-
    async def disconnect(self):
        pass
 
@@ -225,6 +223,102 @@
         self.redis_client.delete(key)
 
 
+class RedisSemanticCache(RedisCache):
+    def __init__(self, host, port, password, **kwargs):
+        super().__init__()
+
+        # from redis.commands.search.field import TagField, TextField, NumericField, VectorField
+        # from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+        # from redis.commands.search.query import Query
+
+        # INDEX_NAME = 'idx:litellm_completion_response_vss'
+        # DOC_PREFIX = 'bikes:'
+
+        # try:
+        #     # check to see if index exists
+        #     client.ft(INDEX_NAME).info()
+        #     print('Index already exists!')
+        # except:
+        #     # schema
+        #     schema = (
+        #         TextField('$.model', no_stem=True, as_name='model'),
+        #         TextField('$.brand', no_stem=True, as_name='brand'),
+        #         NumericField('$.price', as_name='price'),
+        #         TagField('$.type', as_name='type'),
+        #         TextField('$.description', as_name='description'),
+        #         VectorField('$.description_embeddings',
+        #             'FLAT', {
+        #                 'TYPE': 'FLOAT32',
+        #                 'DIM': VECTOR_DIMENSION,
+        #                 'DISTANCE_METRIC': 'COSINE',
+        #             }, as_name='vector'
+        #         ),
+        #     )
+
+        #     # index Definition
+        #     definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON)
+
+        #     # create Index
+        #     client.ft(INDEX_NAME).create_index(fields=schema, definition=definition)
+
+    def set_cache(self, key, value, **kwargs):
+        ttl = kwargs.get("ttl", None)
+        print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
+        try:
+            # get text response
+            # print("in redis semantic cache: value: ", value)
+            llm_response = value["response"]
+
+            # if llm_response is a string, convert it to a dictionary
+            if isinstance(llm_response, str):
+                llm_response = json.loads(llm_response)
+
+            # print("converted llm_response: ", llm_response)
+            response = llm_response["choices"][0]["message"]["content"]
+
+            # create embedding response
+
+            embedding_response = litellm.embedding(
+                model="text-embedding-ada-002",
+                input=response,
+                cache={"no-store": True},
+            )
+
+            raw_embedding = embedding_response["data"][0]["embedding"]
+            raw_embedding_dimension = len(raw_embedding)
+
+            # print("embedding: ", raw_embedding)
+            key = "litellm-semantic:" + key
+            self.redis_client.json().set(
+                name=key,
+                path="$",
+                obj=json.dumps(
+                    {
+                        "response": response,
+                        "embedding": raw_embedding,
+                        "dimension": raw_embedding_dimension,
+                    }
+                ),
+            )
+
+            stored_redis_value = self.redis_client.json().get(name=key)
+
+            # print("Stored Redis Value: ", stored_redis_value)
+
+        except Exception as e:
+            # print("Error occurred: ", e)
+            # NON blocking - notify users Redis is throwing an exception
+            logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
+
+    def get_cache(self, key, **kwargs):
+        pass
+
+    async def async_set_cache(self, key, value, **kwargs):
+        pass
+
+    async def async_get_cache(self, key, **kwargs):
+        pass
+
 
 class S3Cache(BaseCache):
     def __init__(
@@ -429,7 +523,7 @@ class DualCache(BaseCache):
 class Cache:
     def __init__(
         self,
-        type: Optional[Literal["local", "redis", "s3"]] = "local",
+        type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
         host: Optional[str] = None,
         port: Optional[str] = None,
         password: Optional[str] = None,
@@ -468,6 +562,8 @@ class Cache:
         """
         if type == "redis":
             self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
+        elif type == "redis-semantic":
+            self.cache = RedisSemanticCache(host, port, password, **kwargs)
         elif type == "local":
             self.cache = InMemoryCache()
         elif type == "s3":
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 468ab6f80..32904ab78 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -987,3 +987,28 @@ def test_cache_context_managers():
 
 
 # test_cache_context_managers()
+
+
+def test_redis_semantic_cache_completion():
+    litellm.set_verbose = False
+
+    random_number = random.randint(
+        1, 100000
+    )  # add a random number to ensure it's always adding / reading from cache
+    messages = [
+        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
+    ]
+    litellm.cache = Cache(
+        type="redis-semantic",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
+    print("test2 for Redis Caching - non streaming")
+    response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20)
+    # response2 = completion(
+    #     model="gpt-3.5-turbo", messages=messages,max_tokens=20
+    # )
+
+
+# test_redis_cache_completion()
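Before the next patch rewrites this class against `redisvl`, here is a minimal usage sketch of the interface PATCH 01 introduces. This is illustrative only and not part of the series: it assumes a reachable Redis Stack instance and `OPENAI_API_KEY` in the environment, and `get_cache()` is still a stub at this point, so lookups only start working in PATCH 02.

```python
# Hypothetical wiring of the new "redis-semantic" cache type as of PATCH 01.
# Assumes REDIS_HOST/REDIS_PORT/REDIS_PASSWORD and OPENAI_API_KEY are set.
import os

import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
)

# set_cache() embeds the response text with text-embedding-ada-002 and stores
# it under a "litellm-semantic:" key; reads are a no-op until the next patch.
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about: 42"}],
    max_tokens=20,
)
```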
From 1b39454a08539ca47aeec72bb6b48a9d4e366cb8 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 5 Feb 2024 17:58:12 -0800
Subject: [PATCH 02/21] (feat) working - sync semantic caching

---
 litellm/caching.py | 227 ++++++++++++++++++++++++++++++---------------
 1 file changed, 152 insertions(+), 75 deletions(-)

diff --git a/litellm/caching.py b/litellm/caching.py
index e1ef95dc3..0a1046f0d 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -223,94 +223,161 @@
         self.redis_client.delete(key)
 
 
-class RedisSemanticCache(RedisCache):
-    def __init__(self, host, port, password, **kwargs):
-        super().__init__()
+class RedisSemanticCache(BaseCache):
+    def __init__(
+        self,
+        host=None,
+        port=None,
+        password=None,
+        redis_url=None,
+        similarity_threshold=None,
+        **kwargs,
+    ):
+        from redisvl.index import SearchIndex
+        from redisvl.query import VectorQuery
+
+        print_verbose(
+            "redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
+        )
+        if similarity_threshold is None:
+            raise Exception("similarity_threshold must be provided, passed None")
+        self.similarity_threshold = similarity_threshold
+        schema = {
+            "index": {
+                "name": "litellm_semantic_cache_index",
+                "prefix": "litellm",
+                "storage_type": "hash",
+            },
+            "fields": {
+                "text": [{"name": "response"},
+                         {"name": "prompt"}],
+                "vector": [
+                    {
+                        "name": "litellm_embedding",
+                        "dims": 1536,
+                        "distance_metric": "cosine",
+                        "algorithm": "flat",
+                        "datatype": "float32",
+                    }
+                ],
+            },
+        }
+        self.index = SearchIndex.from_dict(schema)
+        if redis_url is None:
+            # if no url passed, check if host, port and password are
passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} 
+ ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. 
**kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From cf4bd1cf4ef8f588f3b8404bcee864636b9510ba Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 03/21] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab78..3ac812cf3 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 81f8ac00b2ed1d95a52affb13ada3311763053e4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 04/21] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf3..4b47614cc 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # 
test_redis_cache_completion() From ccc94128d302719bb43adff58c3cdb66f3a62a9b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 05/21] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d..877f935fa 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 76def20ffee3aefe68552b77b0564e952b081a87 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 06/21] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fa..ad37f2077 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await 
litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! 
+ return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From a125ffe190fd12fe7dd34f842bd88eac750cfff1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 07/21] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cc..a1a42ff65 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 6249a970980b889103b46a2d41fba8d80c6f5d60 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 08/21] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077..a7958d074 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is 
None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From a3b1e3bc843fe0443e34decb6cea4993d5ee635c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 09/21] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8df027b87..d0aded4e5 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2534,6 +2534,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 70a895329ef8f77bc3c3d501ba71c9ef1f8ef53f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 10/21] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a75..41c3b4182 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,7 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = 
[proxy_handler_instance] From c4e73768cf9159a19682a8ced7537b2cd00db5eb Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 11/21] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff..4730fc28b 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c9bd0e511..c58eda09a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.1.0 # for vertex ai calls From 751fb1af892d99f10dc5b8f4694d0655a0bec7d6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 12/21] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074..133d1db6d 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 05f379234dd104e75ed53172af2cb8c55dc4d3a2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 13/21] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6d..6bf53ea45 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + 
[m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From a1fc1e49c734514c7378e47932297a812c552b58 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 14/21] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b4182..326544f41 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From bdc209183804ee48b4259be827d02c788782d70d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 15/21] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed3..3f2687824 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
  ```shell
  REDIS_URL = ""        # REDIS_URL='redis://username:password@hostname:port/database'
  ## OR ##
  REDIS_HOST = ""       # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
  REDIS_PORT = ""       # REDIS_PORT='18841'
  REDIS_PASSWORD = ""   # REDIS_PASSWORD='liteLlmIsAmazing'
  ```

**Additional kwargs**
You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this:
```shell
REDIS_<redis-kwarg-name> = ""
```

### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
From 2732c47b70c78139a349f2aa5513d317b2e24d9f Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:35:21 -0800
Subject: [PATCH 16/21] (feat) redis-semantic cache on proxy

---
 litellm/proxy/proxy_server.py | 6 ++++--
 requirements.txt              | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 0501ec746..70e602e99 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1135,7 +1135,7 @@ class ProxyConfig:
 
             verbose_proxy_logger.debug(f"passed cache type={cache_type}")
 
-            if cache_type == "redis":
+            if cache_type == "redis" or cache_type == "redis-semantic":
                 cache_host = litellm.get_secret("REDIS_HOST", None)
                 cache_port = litellm.get_secret("REDIS_PORT", None)
                 cache_password = litellm.get_secret("REDIS_PASSWORD", None)
@@ -1162,6 +1162,9 @@ class ProxyConfig:
                     f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}"
                 )
                 print()  # noqa
+                if cache_type == "redis-semantic":
+                    # by default this should always be async
+                    cache_params.update({"redis_semantic_cache_use_async": True})
 
             # users can pass os.environ/ variables on the proxy - we should read them from the env
             for key, value in cache_params.items():
@@ -4067,7 +4070,6 @@ def _has_user_setup_sso():
 async def shutdown_event():
     global prisma_client, master_key, user_custom_auth, user_custom_key_generate
     if prisma_client:
-        verbose_proxy_logger.debug("Disconnecting from Prisma")
         await prisma_client.disconnect()
 
diff --git a/requirements.txt b/requirements.txt
index c58eda09a..6b82c993a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep
 boto3==1.28.58 # aws bedrock/sagemaker calls
 redis==4.6.0 # caching
 redisvl==0.0.7 # semantic caching
+numpy==1.24.3 # semantic caching
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
 google-generativeai==0.1.0 # for vertex ai calls
From c8a83bb745a69810b2932bce6d42e0fe34406819 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:39:44 -0800
Subject: [PATCH 17/21] (fix) test-semantic caching

---
 litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index a1a42ff65..cc18dda16 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion():
     )
     print(f"response1: {response1}")
 
-    assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ"
-    # assert response1.choices[0].message == 1
+    random_number = random.randint(1, 100000)
+
+    response2 = completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ],
+        max_tokens=20,
+    )
+    print(f"response2: {response2}")
+    assert response1.id == response2.id
 
 
 # test_redis_cache_completion()
@@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion():
                 "content": f"write a one sentence poem about: {random_number}",
             }
         ],
-        max_tokens=20,
+        max_tokens=5,
     )
     print(f"response1: {response1}")
 
-    assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5"
+    random_number = random.randint(1, 100000)
+    response2 = await litellm.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ],
+        max_tokens=5,
+    )
+    print(f"response2: {response2}")
+    assert response1.id == response2.id
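The `response1.id == response2.id` assertions above pass only when the second prompt clears the similarity check inside `get_cache`. As a reference for how that check behaves, a small self-contained sketch of the hit test with made-up two-dimensional vectors (the real cache works on 1536-dimensional `text-embedding-ada-002` embeddings returned by Redis):

```python
# Sketch of the RedisSemanticCache hit test. Redis returns a cosine
# *distance*; the cache converts it to a similarity and compares it
# against the configured threshold (similarity = 1 - vector_distance).
import numpy as np

def is_cache_hit(query_vec, cached_vec, similarity_threshold=0.8):
    q = np.array(query_vec, dtype=np.float32)
    c = np.array(cached_vec, dtype=np.float32)
    cosine_distance = 1 - (q @ c) / (np.linalg.norm(q) * np.linalg.norm(c))
    similarity = 1 - cosine_distance
    return similarity > similarity_threshold

print(is_cache_hit([1.0, 0.0], [0.9, 0.1]))  # True: nearly parallel prompts
print(is_cache_hit([1.0, 0.0], [0.0, 1.0]))  # False: orthogonal prompts
```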
From 93504915d7bcde97137c03038e3d11dcf1a32c0c Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:53:28 -0800
Subject: [PATCH 18/21] (docs) redis cache

---
 docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++--
 1 file changed, 64 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md
index 8a580f087..7b21d35b6 100644
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@@ -1,11 +1,11 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-# Caching - In-Memory, Redis, s3
+# Caching - In-Memory, Redis, s3, Redis Semantic Cache
 
 [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
 
-## Initialize Cache - In Memory, Redis, s3 Bucket
+## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
 
 
 
@@ -18,7 +18,7 @@ pip install redis
 ```
 For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
 
-### Quick Start
+
 ```python
 import litellm
 from litellm import completion
@@ -55,7 +55,7 @@ Set AWS environment variables
 AWS_ACCESS_KEY_ID = "AKI*******"
 AWS_SECRET_ACCESS_KEY = "WOl*****"
 ```
-### Quick Start
+
 ```python
 import litellm
 from litellm import completion
@@ -80,6 +80,66 @@ response2 = completion(
 
 
 
+
+Install redis
+```shell
+pip install redisvl==0.0.7
+```
+
+For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
+
+```python
+import litellm
+from litellm import completion
+from litellm.caching import Cache
+
+random_number = random.randint(
+    1, 100000
+)  # add a random number to ensure it's always adding / reading from cache
+
+print("testing semantic caching")
+litellm.cache = Cache(
+    type="redis-semantic",
+    host=os.environ["REDIS_HOST"],
+    port=os.environ["REDIS_PORT"],
+    password=os.environ["REDIS_PASSWORD"],
+    similarity_threshold=0.8,
+    redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
+)
+response1 = completion(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": f"write a one sentence poem about: {random_number}",
+        }
+    ],
+    max_tokens=20,
+)
+print(f"response1: {response1}")
+
+random_number = random.randint(1, 100000)
+
+response2 = completion(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": f"write a one sentence poem about: {random_number}",
+        }
+    ],
+    max_tokens=20,
+)
+print(f"response2: {response2}")
+assert response1.id == response2.id
+# response1 == response2, response 1 is cached
+```
+
+
+
 ### Quick Start
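The semantic-cache example added in the docs above uses `os` and `random` without importing them; a self-contained variant for copy-paste reference (illustrative, not part of the patch):

```python
# Self-contained variant of the PATCH 18 docs example; the snippet in the
# diff assumes `os` and `random` are already imported.
import os
import random

import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
)

random_number = random.randint(1, 100000)
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": f"write a one sentence poem about: {random_number}"}],
    max_tokens=20,
)

# a different random number: the second prompt is not identical, but it is
# semantically close enough to clear the 0.8 similarity threshold
random_number = random.randint(1, 100000)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": f"write a one sentence poem about: {random_number}"}],
    max_tokens=20,
)
assert response1.id == response2.id  # response2 came from the semantic cache
```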
From 54c920c299dc913c488b664087d5a167d0097c99 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:54:55 -0800
Subject: [PATCH 19/21] (docs) litellm semantic caching

---
 docs/my-website/docs/caching/redis_cache.md | 2 +-
 docs/my-website/docs/proxy/caching.md       | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md
index 7b21d35b6..75e1db955 100644
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@@ -104,7 +104,7 @@ litellm.cache = Cache(
     host=os.environ["REDIS_HOST"],
     port=os.environ["REDIS_PORT"],
     password=os.environ["REDIS_PASSWORD"],
-    similarity_threshold=0.8,
+    similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 == exact match, 0.5 == 50% similarity
     redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
 )
 response1 = completion(
diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 3f2687824..d5b589e5c 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -7,6 +7,7 @@ Cache LLM Responses
 LiteLLM supports:
 - In Memory Cache
 - Redis Cache
+- Redis Semantic Cache
 - s3 Bucket Cache
 
 ## Quick Start - Redis, s3 Cache, Semantic Cache
From 1afdf5cf365b5e6b54465c6457e096b8056c1798 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:55:15 -0800
Subject: [PATCH 20/21] (fix) semantic caching

---
 litellm/tests/test_caching.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index cc18dda16..96fd8eb9d 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion():
         port=os.environ["REDIS_PORT"],
         password=os.environ["REDIS_PASSWORD"],
         similarity_threshold=0.8,
+        redis_semantic_cache_embedding_model="text-embedding-ada-002",
     )
     response1 = completion(
         model="gpt-3.5-turbo",
From 8175fb4deb27e35f1c0575cf68928147a459ebc4 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 11:04:19 -0800
Subject: [PATCH 21/21] (fix) mark semantic caching as beta test

---
 litellm/tests/test_caching.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 96fd8eb9d..6cb5b974a 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -989,6 +989,7 @@ def test_cache_context_managers():
 # test_cache_context_managers()
 
 
+@pytest.mark.skip(reason="beta test - new redis semantic cache")
 def test_redis_semantic_cache_completion():
     litellm.set_verbose = True
     import logging
@@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion():
 # test_redis_cache_completion()
 
 
+@pytest.mark.skip(reason="beta test - new redis semantic cache")
 @pytest.mark.asyncio
 async def test_redis_semantic_cache_acompletion():
     litellm.set_verbose = True
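Taken together, the series ends with semantic caching usable from the SDK, the proxy, and asyncio code paths. A closing sketch of the final async API, mirroring the (now skipped) beta test; it assumes Redis credentials and `OPENAI_API_KEY` in the environment:

```python
# End-state usage after PATCH 21: async semantic caching with a
# configurable embedding model (sketch mirroring the beta test).
import asyncio
import os
import random

import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
    redis_semantic_cache_use_async=True,
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
)

async def main():
    n = random.randint(1, 100000)
    messages = [{"role": "user", "content": f"write a one sentence poem about: {n}"}]
    response1 = await litellm.acompletion(
        model="gpt-3.5-turbo", messages=messages, max_tokens=5
    )
    response2 = await litellm.acompletion(
        model="gpt-3.5-turbo", messages=messages, max_tokens=5
    )
    assert response1.id == response2.id  # second call is a semantic cache hit

asyncio.run(main())
```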