From d4a799a3ca97ce84f8491a8b372eaea7651292e5 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 5 Feb 2024 12:28:21 -0800
Subject: [PATCH 01/21] (feat) add semantic cache

---
 litellm/caching.py            | 102 +++++++++++++++++++++++++++++++++-
 litellm/tests/test_caching.py |  25 +++++++++
 2 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/litellm/caching.py b/litellm/caching.py
index d0721fe9a..e1ef95dc3 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -83,7 +83,6 @@ class InMemoryCache(BaseCache):
         self.cache_dict.clear()
         self.ttl_dict.clear()
 
-
     async def disconnect(self):
         pass
 
@@ -217,7 +216,6 @@ class RedisCache(BaseCache):
     def flush_cache(self):
         self.redis_client.flushall()
 
-
    async def disconnect(self):
        pass
 
@@ -225,6 +223,102 @@
         self.redis_client.delete(key)
 
 
+class RedisSemanticCache(RedisCache):
+    def __init__(self, host, port, password, **kwargs):
+        super().__init__()
+
+        # from redis.commands.search.field import TagField, TextField, NumericField, VectorField
+        # from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+        # from redis.commands.search.query import Query
+
+        # INDEX_NAME = 'idx:litellm_completion_response_vss'
+        # DOC_PREFIX = 'bikes:'
+
+        # try:
+        #     # check to see if index exists
+        #     client.ft(INDEX_NAME).info()
+        #     print('Index already exists!')
+        # except:
+        #     # schema
+        #     schema = (
+        #         TextField('$.model', no_stem=True, as_name='model'),
+        #         TextField('$.brand', no_stem=True, as_name='brand'),
+        #         NumericField('$.price', as_name='price'),
+        #         TagField('$.type', as_name='type'),
+        #         TextField('$.description', as_name='description'),
+        #         VectorField('$.description_embeddings',
+        #             'FLAT', {
+        #                 'TYPE': 'FLOAT32',
+        #                 'DIM': VECTOR_DIMENSION,
+        #                 'DISTANCE_METRIC': 'COSINE',
+        #             }, as_name='vector'
+        #         ),
+        #     )
+
+        #     # index Definition
+        #     definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON)
+
+        #     # create Index
+        #     client.ft(INDEX_NAME).create_index(fields=schema, definition=definition)
+
+    def set_cache(self, key, value, **kwargs):
+        ttl = kwargs.get("ttl", None)
+        print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
+        try:
+            # get text response
+            # print("in redis semantic cache: value: ", value)
+            llm_response = value["response"]
+
+            # if llm_response is a string, convert it to a dictionary
+            if isinstance(llm_response, str):
+                llm_response = json.loads(llm_response)
+
+            # print("converted llm_response: ", llm_response)
+            response = llm_response["choices"][0]["message"]["content"]
+
+            # create embedding response
+
+            embedding_response = litellm.embedding(
+                model="text-embedding-ada-002",
+                input=response,
+                cache={"no-store": True},
+            )
+
+            raw_embedding = embedding_response["data"][0]["embedding"]
+            raw_embedding_dimension = len(raw_embedding)
+
+            # print("embedding: ", raw_embedding)
+            key = "litellm-semantic:" + key
+            self.redis_client.json().set(
+                name=key,
+                path="$",
+                obj=json.dumps(
+                    {
+                        "response": response,
+                        "embedding": raw_embedding,
+                        "dimension": raw_embedding_dimension,
+                    }
+                ),
+            )
+
+            stored_redis_value = self.redis_client.json().get(name=key)
+
+            # print("Stored Redis Value: ", stored_redis_value)
+
+        except Exception as e:
+            # print("Error occurred: ", e)
+            # NON blocking - notify users Redis is throwing an exception
+            logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
+
+    def get_cache(self, key, **kwargs):
+        pass
+
+    async def async_set_cache(self, key, value, **kwargs):
+        pass
+
+    async def async_get_cache(self, key, **kwargs):
+        pass
+
 
 class S3Cache(BaseCache):
     def __init__(
@@ -429,7 +523,7 @@ class DualCache(BaseCache):
 class Cache:
     def __init__(
         self,
-        type: Optional[Literal["local", "redis", "s3"]] = "local",
+        type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
         host: Optional[str] = None,
         port: Optional[str] = None,
         password: Optional[str] = None,
@@ -468,6 +562,8 @@ class Cache:
         """
         if type == "redis":
             self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
+        elif type == "redis-semantic":
+            self.cache = RedisSemanticCache(host, port, password, **kwargs)
         elif type == "local":
             self.cache = InMemoryCache()
         elif type == "s3":
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 468ab6f80..32904ab78 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -987,3 +987,28 @@ def test_cache_context_managers():
 
 
 # test_cache_context_managers()
+
+
+def test_redis_semantic_cache_completion():
+    litellm.set_verbose = False
+
+    random_number = random.randint(
+        1, 100000
+    )  # add a random number to ensure it's always adding / reading from cache
+    messages = [
+        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
+    ]
+    litellm.cache = Cache(
+        type="redis-semantic",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
+    print("test2 for Redis Caching - non streaming")
+    response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20)
+    # response2 = completion(
+    #     model="gpt-3.5-turbo", messages=messages,max_tokens=20
+    # )
+
+
+# test_redis_cache_completion()
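Before the next patch rewrites this class against `redisvl`, here is a minimal usage sketch of the interface PATCH 01 introduces. This is illustrative only and not part of the series: it assumes a reachable Redis Stack instance and `OPENAI_API_KEY` in the environment, and `get_cache()` is still a stub at this point, so lookups only start working in PATCH 02.

```python
# Hypothetical wiring of the new "redis-semantic" cache type as of PATCH 01.
# Assumes REDIS_HOST/REDIS_PORT/REDIS_PASSWORD and OPENAI_API_KEY are set.
import os

import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
)

# set_cache() embeds the response text with text-embedding-ada-002 and stores
# it under a "litellm-semantic:" key; reads are a no-op until the next patch.
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about: 42"}],
    max_tokens=20,
)
```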
From 1b39454a08539ca47aeec72bb6b48a9d4e366cb8 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 5 Feb 2024 17:58:12 -0800
Subject: [PATCH 02/21] (feat) working - sync semantic caching

---
 litellm/caching.py | 227 ++++++++++++++++++++++++++++++---------------
 1 file changed, 152 insertions(+), 75 deletions(-)

diff --git a/litellm/caching.py b/litellm/caching.py
index e1ef95dc3..0a1046f0d 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -223,94 +223,161 @@
         self.redis_client.delete(key)
 
 
-class RedisSemanticCache(RedisCache):
-    def __init__(self, host, port, password, **kwargs):
-        super().__init__()
+class RedisSemanticCache(BaseCache):
+    def __init__(
+        self,
+        host=None,
+        port=None,
+        password=None,
+        redis_url=None,
+        similarity_threshold=None,
+        **kwargs,
+    ):
+        from redisvl.index import SearchIndex
+        from redisvl.query import VectorQuery
+
+        print_verbose(
+            "redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
+        )
+        if similarity_threshold is None:
+            raise Exception("similarity_threshold must be provided, passed None")
+        self.similarity_threshold = similarity_threshold
+        schema = {
+            "index": {
+                "name": "litellm_semantic_cache_index",
+                "prefix": "litellm",
+                "storage_type": "hash",
+            },
+            "fields": {
+                "text": [{"name": "response"},
+                         {"name": "prompt"}],
+                "vector": [
+                    {
+                        "name": "litellm_embedding",
+                        "dims": 1536,
+                        "distance_metric": "cosine",
+                        "algorithm": "flat",
+                        "datatype": "float32",
+                    }
+                ],
+            },
+        }
+        self.index = SearchIndex.from_dict(schema)
+        if redis_url is None:
+            # if no url passed, check if host, port and password are
passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} 
+ ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. 
**kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From cf4bd1cf4ef8f588f3b8404bcee864636b9510ba Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 03/21] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab78..3ac812cf3 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 81f8ac00b2ed1d95a52affb13ada3311763053e4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 04/21] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf3..4b47614cc 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # 
test_redis_cache_completion() From ccc94128d302719bb43adff58c3cdb66f3a62a9b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 05/21] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d..877f935fa 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 76def20ffee3aefe68552b77b0564e952b081a87 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 06/21] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fa..ad37f2077 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await 
litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! 
+ return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From a125ffe190fd12fe7dd34f842bd88eac750cfff1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 07/21] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cc..a1a42ff65 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 6249a970980b889103b46a2d41fba8d80c6f5d60 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 08/21] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077..a7958d074 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is 
None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From a3b1e3bc843fe0443e34decb6cea4993d5ee635c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 09/21] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8df027b87..d0aded4e5 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2534,6 +2534,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 70a895329ef8f77bc3c3d501ba71c9ef1f8ef53f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 10/21] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a75..41c3b4182 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,7 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = 
[proxy_handler_instance] From c4e73768cf9159a19682a8ced7537b2cd00db5eb Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 11/21] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff..4730fc28b 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c9bd0e511..c58eda09a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.1.0 # for vertex ai calls From 751fb1af892d99f10dc5b8f4694d0655a0bec7d6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 12/21] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074..133d1db6d 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 05f379234dd104e75ed53172af2cb8c55dc4d3a2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 13/21] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6d..6bf53ea45 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + 
[m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From a1fc1e49c734514c7378e47932297a812c552b58 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 14/21] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b4182..326544f41 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From bdc209183804ee48b4259be827d02c788782d70d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 15/21] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed3..3f2687824 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
  ```shell
  REDIS_URL = ""        # REDIS_URL='redis://username:password@hostname:port/database'
  ## OR ##
  REDIS_HOST = ""       # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
  REDIS_PORT = ""       # REDIS_PORT='18841'
  REDIS_PASSWORD = ""   # REDIS_PASSWORD='liteLlmIsAmazing'
  ```

**Additional kwargs**
You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this:
```shell
REDIS_<redis-kwarg-name> = ""
```

### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
From 2732c47b70c78139a349f2aa5513d317b2e24d9f Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:35:21 -0800
Subject: [PATCH 16/21] (feat) redis-semantic cache on proxy

---
 litellm/proxy/proxy_server.py | 6 ++++--
 requirements.txt              | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 0501ec746..70e602e99 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1135,7 +1135,7 @@ class ProxyConfig:
 
             verbose_proxy_logger.debug(f"passed cache type={cache_type}")
 
-            if cache_type == "redis":
+            if cache_type == "redis" or cache_type == "redis-semantic":
                 cache_host = litellm.get_secret("REDIS_HOST", None)
                 cache_port = litellm.get_secret("REDIS_PORT", None)
                 cache_password = litellm.get_secret("REDIS_PASSWORD", None)
@@ -1162,6 +1162,9 @@ class ProxyConfig:
                     f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}"
                 )
                 print()  # noqa
+                if cache_type == "redis-semantic":
+                    # by default this should always be async
+                    cache_params.update({"redis_semantic_cache_use_async": True})
 
             # users can pass os.environ/ variables on the proxy - we should read them from the env
             for key, value in cache_params.items():
@@ -4067,7 +4070,6 @@ def _has_user_setup_sso():
 async def shutdown_event():
     global prisma_client, master_key, user_custom_auth, user_custom_key_generate
     if prisma_client:
-        verbose_proxy_logger.debug("Disconnecting from Prisma")
         await prisma_client.disconnect()
 
diff --git a/requirements.txt b/requirements.txt
index c58eda09a..6b82c993a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep
 boto3==1.28.58 # aws bedrock/sagemaker calls
 redis==4.6.0 # caching
 redisvl==0.0.7 # semantic caching
+numpy==1.24.3 # semantic caching
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
 google-generativeai==0.1.0 # for vertex ai calls
From c8a83bb745a69810b2932bce6d42e0fe34406819 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:39:44 -0800
Subject: [PATCH 17/21] (fix) test-semantic caching

---
 litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index a1a42ff65..cc18dda16 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion():
     )
     print(f"response1: {response1}")
 
-    assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ"
-    # assert response1.choices[0].message == 1
+    random_number = random.randint(1, 100000)
+
+    response2 = completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ],
+        max_tokens=20,
+    )
+    print(f"response2: {response2}")
+    assert response1.id == response2.id
 
 
 # test_redis_cache_completion()
@@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion():
                 "content": f"write a one sentence poem about: {random_number}",
             }
         ],
-        max_tokens=20,
+        max_tokens=5,
     )
     print(f"response1: {response1}")
 
-    assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5"
+    random_number = random.randint(1, 100000)
+    response2 = await litellm.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ],
+        max_tokens=5,
+    )
+    print(f"response2: {response2}")
+    assert response1.id == response2.id
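The `response1.id == response2.id` assertions above pass only when the second prompt clears the similarity check inside `get_cache`. As a reference for how that check behaves, a small self-contained sketch of the hit test with made-up two-dimensional vectors (the real cache works on 1536-dimensional `text-embedding-ada-002` embeddings returned by Redis):

```python
# Sketch of the RedisSemanticCache hit test. Redis returns a cosine
# *distance*; the cache converts it to a similarity and compares it
# against the configured threshold (similarity = 1 - vector_distance).
import numpy as np

def is_cache_hit(query_vec, cached_vec, similarity_threshold=0.8):
    q = np.array(query_vec, dtype=np.float32)
    c = np.array(cached_vec, dtype=np.float32)
    cosine_distance = 1 - (q @ c) / (np.linalg.norm(q) * np.linalg.norm(c))
    similarity = 1 - cosine_distance
    return similarity > similarity_threshold

print(is_cache_hit([1.0, 0.0], [0.9, 0.1]))  # True: nearly parallel prompts
print(is_cache_hit([1.0, 0.0], [0.0, 1.0]))  # False: orthogonal prompts
```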
From 93504915d7bcde97137c03038e3d11dcf1a32c0c Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:53:28 -0800
Subject: [PATCH 18/21] (docs) redis cache

---
 docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++--
 1 file changed, 64 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md
index 8a580f087..7b21d35b6 100644
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@@ -1,11 +1,11 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-# Caching - In-Memory, Redis, s3
+# Caching - In-Memory, Redis, s3, Redis Semantic Cache
 
 [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
 
-## Initialize Cache - In Memory, Redis, s3 Bucket
+## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
 
 
 
@@ -18,7 +18,7 @@ pip install redis
 ```
 For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
 
-### Quick Start
+
 ```python
 import litellm
 from litellm import completion
@@ -55,7 +55,7 @@ Set AWS environment variables
 AWS_ACCESS_KEY_ID = "AKI*******"
 AWS_SECRET_ACCESS_KEY = "WOl*****"
 ```
-### Quick Start
+
 ```python
 import litellm
 from litellm import completion
@@ -80,6 +80,66 @@ response2 = completion(
 
 
 
+
+Install redis
+```shell
+pip install redisvl==0.0.7
+```
+
+For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
+
+```python
+import litellm
+from litellm import completion
+from litellm.caching import Cache
+
+random_number = random.randint(
+    1, 100000
+)  # add a random number to ensure it's always adding / reading from cache
+
+print("testing semantic caching")
+litellm.cache = Cache(
+    type="redis-semantic",
+    host=os.environ["REDIS_HOST"],
+    port=os.environ["REDIS_PORT"],
+    password=os.environ["REDIS_PASSWORD"],
+    similarity_threshold=0.8,
+    redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
+)
+response1 = completion(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": f"write a one sentence poem about: {random_number}",
+        }
+    ],
+    max_tokens=20,
+)
+print(f"response1: {response1}")
+
+random_number = random.randint(1, 100000)
+
+response2 = completion(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": f"write a one sentence poem about: {random_number}",
+        }
+    ],
+    max_tokens=20,
+)
+print(f"response2: {response2}")
+assert response1.id == response2.id
+# response1 == response2, response 1 is cached
+```
+
+
+
 ### Quick Start
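The semantic-cache example added in the docs above uses `os` and `random` without importing them; a self-contained variant for copy-paste reference (illustrative, not part of the patch):

```python
# Self-contained variant of the PATCH 18 docs example; the snippet in the
# diff assumes `os` and `random` are already imported.
import os
import random

import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
)

random_number = random.randint(1, 100000)
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": f"write a one sentence poem about: {random_number}"}],
    max_tokens=20,
)

# a different random number: the second prompt is not identical, but it is
# semantically close enough to clear the 0.8 similarity threshold
random_number = random.randint(1, 100000)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": f"write a one sentence poem about: {random_number}"}],
    max_tokens=20,
)
assert response1.id == response2.id  # response2 came from the semantic cache
```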
From 54c920c299dc913c488b664087d5a167d0097c99 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:54:55 -0800
Subject: [PATCH 19/21] (docs) litellm semantic caching

---
 docs/my-website/docs/caching/redis_cache.md | 2 +-
 docs/my-website/docs/proxy/caching.md       | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md
index 7b21d35b6..75e1db955 100644
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@@ -104,7 +104,7 @@ litellm.cache = Cache(
     host=os.environ["REDIS_HOST"],
     port=os.environ["REDIS_PORT"],
     password=os.environ["REDIS_PASSWORD"],
-    similarity_threshold=0.8,
+    similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 == exact match, 0.5 == 50% similarity
     redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
 )
 response1 = completion(
diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 3f2687824..d5b589e5c 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -7,6 +7,7 @@ Cache LLM Responses
 LiteLLM supports:
 - In Memory Cache
 - Redis Cache
+- Redis Semantic Cache
 - s3 Bucket Cache
 
 ## Quick Start - Redis, s3 Cache, Semantic Cache
From 1afdf5cf365b5e6b54465c6457e096b8056c1798 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 10:55:15 -0800
Subject: [PATCH 20/21] (fix) semantic caching

---
 litellm/tests/test_caching.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index cc18dda16..96fd8eb9d 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion():
         port=os.environ["REDIS_PORT"],
         password=os.environ["REDIS_PASSWORD"],
         similarity_threshold=0.8,
+        redis_semantic_cache_embedding_model="text-embedding-ada-002",
     )
     response1 = completion(
         model="gpt-3.5-turbo",
From 8175fb4deb27e35f1c0575cf68928147a459ebc4 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 6 Feb 2024 11:04:19 -0800
Subject: [PATCH 21/21] (fix) mark semantic caching as beta test

---
 litellm/tests/test_caching.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 96fd8eb9d..6cb5b974a 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -989,6 +989,7 @@ def test_cache_context_managers():
 # test_cache_context_managers()
 
 
+@pytest.mark.skip(reason="beta test - new redis semantic cache")
 def test_redis_semantic_cache_completion():
     litellm.set_verbose = True
     import logging
@@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion():
 # test_redis_cache_completion()
 
 
+@pytest.mark.skip(reason="beta test - new redis semantic cache")
 @pytest.mark.asyncio
 async def test_redis_semantic_cache_acompletion():
     litellm.set_verbose = True
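Taken together, the series ends with semantic caching usable from the SDK, the proxy, and asyncio code paths. A closing sketch of the final async API, mirroring the (now skipped) beta test; it assumes Redis credentials and `OPENAI_API_KEY` in the environment:

```python
# End-state usage after PATCH 21: async semantic caching with a
# configurable embedding model (sketch mirroring the beta test).
import asyncio
import os
import random

import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
    redis_semantic_cache_use_async=True,
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
)

async def main():
    n = random.randint(1, 100000)
    messages = [{"role": "user", "content": f"write a one sentence poem about: {n}"}]
    response1 = await litellm.acompletion(
        model="gpt-3.5-turbo", messages=messages, max_tokens=5
    )
    response2 = await litellm.acompletion(
        model="gpt-3.5-turbo", messages=messages, max_tokens=5
    )
    assert response1.id == response2.id  # second call is a semantic cache hit

asyncio.run(main())
```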