From 1689d5790fc176dac9877a5b46b5e63e53d79bcc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 001/148] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def 
async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From 1b975bbe28d5bba1d6690e92cb62576e82afe316 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:16:15 -0800 Subject: [PATCH 002/148] fix(langfuse.py): support logging failed llm api calls to langfuse --- litellm/integrations/langfuse.py | 198 +++++++++++++++++++------------ litellm/utils.py | 58 ++++----- 2 files changed, 151 insertions(+), 105 deletions(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index e62dccdc47..82de333660 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -55,8 +55,21 @@ class LangFuseLogger: else: self.upstream_langfuse = None + # def log_error(kwargs, response_obj, start_time, end_time): + # generation = trace.generation( + # level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR + # status_message='error' # can be any string (e.g. 
stringified stack trace or error body) + # ) def log_event( - self, kwargs, response_obj, start_time, end_time, user_id, print_verbose + self, + kwargs, + response_obj, + start_time, + end_time, + user_id, + print_verbose, + level="DEFAULT", + status_message=None, ): # Method definition @@ -84,37 +97,49 @@ class LangFuseLogger: pass # end of processing langfuse ######################## - if kwargs.get("call_type", None) == "embedding" or isinstance( - response_obj, litellm.EmbeddingResponse + if ( + level == "ERROR" + and status_message is not None + and isinstance(status_message, str) + ): + input = prompt + output = status_message + elif response_obj is not None and ( + kwargs.get("call_type", None) == "embedding" + or isinstance(response_obj, litellm.EmbeddingResponse) ): input = prompt output = response_obj["data"] - else: + elif response_obj is not None: input = prompt output = response_obj["choices"][0]["message"].json() - print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") - self._log_langfuse_v2( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - print_verbose, - ) if self._is_langfuse_v2() else self._log_langfuse_v1( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - ) + print(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") + if self._is_langfuse_v2(): + self._log_langfuse_v2( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + level, + print_verbose, + ) + elif response_obj is not None: + self._log_langfuse_v1( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + ) self.Langfuse.flush() print_verbose( @@ -123,15 +148,15 @@ class LangFuseLogger: verbose_logger.info(f"Langfuse Layer Logging - logging success") except: traceback.print_exc() - print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") + print(f"Langfuse Layer Error - {traceback.format_exc()}") pass async def _async_log_event( self, kwargs, response_obj, start_time, end_time, user_id, print_verbose ): - self.log_event( - kwargs, response_obj, start_time, end_time, user_id, print_verbose - ) + """ + TODO: support async calls when langfuse is truly async + """ def _is_langfuse_v2(self): import langfuse @@ -193,57 +218,78 @@ class LangFuseLogger: optional_params, input, response_obj, + level, print_verbose, ): import langfuse - tags = [] - supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") - supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") + try: + tags = [] + supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") + supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") - print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") + print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") - generation_name = metadata.get("generation_name", None) - if generation_name is None: - # just log `litellm-{call_type}` as the generation name - generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" + generation_name = metadata.get("generation_name", None) + if generation_name is None: + # just log `litellm-{call_type}` as the generation name + generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" - trace_params = { - "name": generation_name, - "input": input, - "output": output, - "user_id": 
metadata.get("trace_user_id", user_id), - "id": metadata.get("trace_id", None), - "session_id": metadata.get("session_id", None), - } - cost = kwargs["response_cost"] - print_verbose(f"trace: {cost}") - if supports_tags: - for key, value in metadata.items(): - tags.append(f"{key}:{value}") - if "cache_hit" in kwargs: - tags.append(f"cache_hit:{kwargs['cache_hit']}") - trace_params.update({"tags": tags}) + trace_params = { + "name": generation_name, + "input": input, + "user_id": metadata.get("trace_user_id", user_id), + "id": metadata.get("trace_id", None), + "session_id": metadata.get("session_id", None), + } - trace = self.Langfuse.trace(**trace_params) + if level == "ERROR": + trace_params["status_message"] = output + else: + trace_params["output"] = output - # get generation_id - generation_id = None - if response_obj.get("id", None) is not None: - generation_id = litellm.utils.get_logging_id(start_time, response_obj) - trace.generation( - name=generation_name, - id=metadata.get("generation_id", generation_id), - startTime=start_time, - endTime=end_time, - model=kwargs["model"], - modelParameters=optional_params, - input=input, - output=output, - usage={ - "prompt_tokens": response_obj["usage"]["prompt_tokens"], - "completion_tokens": response_obj["usage"]["completion_tokens"], - "total_cost": cost if supports_costs else None, - }, - metadata=metadata, - ) + cost = kwargs.get("response_cost", None) + print_verbose(f"trace: {cost}") + if supports_tags: + for key, value in metadata.items(): + tags.append(f"{key}:{value}") + if "cache_hit" in kwargs: + tags.append(f"cache_hit:{kwargs['cache_hit']}") + trace_params.update({"tags": tags}) + + trace = self.Langfuse.trace(**trace_params) + + if level == "ERROR": + trace.generation( + level="ERROR", # can be any of DEBUG, DEFAULT, WARNING or ERROR + status_message=output, # can be any string (e.g. stringified stack trace or error body) + ) + print(f"SUCCESSFULLY LOGGED ERROR") + else: + # get generation_id + generation_id = None + if ( + response_obj is not None + and response_obj.get("id", None) is not None + ): + generation_id = litellm.utils.get_logging_id( + start_time, response_obj + ) + trace.generation( + name=generation_name, + id=metadata.get("generation_id", generation_id), + startTime=start_time, + endTime=end_time, + model=kwargs["model"], + modelParameters=optional_params, + input=input, + output=output, + usage={ + "prompt_tokens": response_obj["usage"]["prompt_tokens"], + "completion_tokens": response_obj["usage"]["completion_tokens"], + "total_cost": cost if supports_costs else None, + }, + metadata=metadata, + ) + except Exception as e: + print(f"Langfuse Layer Error - {traceback.format_exc()}") diff --git a/litellm/utils.py b/litellm/utils.py index e56ba879f8..1e83a319f4 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1636,34 +1636,6 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) - if callback == "langfuse": - global langFuseLogger - print_verbose("reaches Async langfuse for logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - # this only logs streaming once, complete_streaming_response exists i.e when stream ends - if self.stream: - if "complete_streaming_response" not in kwargs: - return - else: - print_verbose( - "reaches Async langfuse for streaming logging!" 
- ) - result = kwargs["complete_streaming_response"] - if langFuseLogger is None: - langFuseLogger = LangFuseLogger() - await langFuseLogger._async_log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) except: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" @@ -1788,9 +1760,37 @@ class Logging: response_obj=result, kwargs=self.model_call_details, ) + elif callback == "langfuse": + global langFuseLogger + verbose_logger.debug("reaches langfuse for logging!") + kwargs = {} + for k, v in self.model_call_details.items(): + if ( + k != "original_response" + ): # copy.deepcopy raises errors as this could be a coroutine + kwargs[k] = v + # this only logs streaming once, complete_streaming_response exists i.e when stream ends + if langFuseLogger is None or ( + self.langfuse_public_key != langFuseLogger.public_key + and self.langfuse_secret != langFuseLogger.secret_key + ): + langFuseLogger = LangFuseLogger( + langfuse_public_key=self.langfuse_public_key, + langfuse_secret=self.langfuse_secret, + ) + langFuseLogger.log_event( + start_time=start_time, + end_time=end_time, + response_obj=None, + user_id=kwargs.get("user", None), + print_verbose=print_verbose, + status_message=str(exception), + level="ERROR", + kwargs=self.model_call_details, + ) except Exception as e: print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}" + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {str(e)}" ) print_verbose( f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}" From 3d2a7b68bee2496cb2e229b06a8ea55f14abeaca Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 5 Feb 2024 17:07:57 -0800 Subject: [PATCH 003/148] Update model_prices_and_context_window.json --- model_prices_and_context_window.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index b6ded001c9..4c28bdbe8b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From ab5956022f0ffd1b62ce9562f4c631cdd8038009 Mon Sep 17 00:00:00 2001 From: John HU Date: Mon, 5 Feb 2024 17:30:39 -0800 Subject: [PATCH 004/148] Fix admin UI title and description --- ui/litellm-dashboard/src/app/layout.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/litellm-dashboard/src/app/layout.tsx b/ui/litellm-dashboard/src/app/layout.tsx index 3314e4780a..a04a0d66ed 100644 --- a/ui/litellm-dashboard/src/app/layout.tsx +++ b/ui/litellm-dashboard/src/app/layout.tsx @@ -5,8 +5,8 @@ import "./globals.css"; const inter = Inter({ subsets: ["latin"] }); export const metadata: Metadata = { - title: "Create Next App", - description: "Generated by create next app", + title: "🚅 LiteLLM", + description: "LiteLLM Proxy Admin UI", }; export default function RootLayout({ From 2955f8ed391cb0fd818c9967e838c441adf0a5a1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 
005/148] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = 
kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! 
+ cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. 
""" try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 3ee691b9d81834f0cead36493b4f9b1b7d5bb51b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 006/148] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 6c6db6f1e6aed9dba389b5dc8a4dfa1ff3b2d74e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 007/148] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From eaad671e40ffeeb09c64bd6338bd999f173aa93e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 008/148] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis 
semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 75b892835ccef169f5c0c2b810ed31809aac4024 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 21:44:27 -0800 Subject: [PATCH 009/148] test(test_key_generate_dynamodb.py): fix test --- litellm/tests/test_key_generate_dynamodb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_key_generate_dynamodb.py b/litellm/tests/test_key_generate_dynamodb.py index 61d0ff6a66..e77dc74723 100644 --- a/litellm/tests/test_key_generate_dynamodb.py +++ b/litellm/tests/test_key_generate_dynamodb.py @@ -490,8 +490,13 @@ def test_dynamo_db_migration(custom_db_client): try: async def test(): + request = GenerateKeyRequest(max_budget=1) + key = await generate_key_fn(request) + print(key) + + generated_key = key.key bearer_token = ( - "Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg" + "Bearer " + generated_key ) # this works with ishaan's db, it's a never expiring key request = Request(scope={"type": "http"}) From 756eb290f66d3fc8013332381334d99e0d510d27 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:37:05 -0800 Subject: [PATCH 010/148] (docs) upperbound_key_generate_params --- docs/my-website/docs/proxy/virtual_keys.md | 16 ++++++++++++++++ .../model_prices_and_context_window_backup.json | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index dd5edc6da8..c51bfc0ac9 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -352,6 +352,22 @@ Request Params: } ``` +## Upperbound /key/generate params +Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. + +Set `litellm_settings:upperbound_key_generate_params`: +```yaml +litellm_settings: + upperbound_key_generate_params: + max_budget: 100 # upperbound of $100, for all /key/generate requests + duration: "30d" # upperbound of 30 days for all /key/generate requests +``` + +** Expected Behavior ** + +- Send a `/key/generate` request with `max_budget=200` +- Key will be created with `max_budget=100` since 100 is the upper bound + ## Default /key/generate params Use this, if you need to control the default `max_budget` or any `key/generate` param per key. 
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index b6ded001c9..4c28bdbe8b 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 0ca4f962d9615b1fd64979e843855d7fa1ef0477 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:38:47 -0800 Subject: [PATCH 011/148] (feat) upperbound_key_generate_params --- litellm/__init__.py | 1 + litellm/proxy/proxy_server.py | 69 +++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 3f2a1e4b4d..26b761c64a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -146,6 +146,7 @@ suppress_debug_info = False dynamodb_table_name: Optional[str] = None s3_callback_params: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None +upperbound_key_generate_params: Optional[Dict] = None default_team_settings: Optional[List] = None #### RELIABILITY #### request_timeout: Optional[float] = 6000 diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 289a36cb2b..494c874147 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1391,6 +1391,26 @@ class ProxyConfig: proxy_config = ProxyConfig() +def _duration_in_seconds(duration: str): + match = re.match(r"(\d+)([smhd]?)", duration) + if not match: + raise ValueError("Invalid duration format") + + value, unit = match.groups() + value = int(value) + + if unit == "s": + return value + elif unit == "m": + return value * 60 + elif unit == "h": + return value * 3600 + elif unit == "d": + return value * 86400 + else: + raise ValueError("Unsupported duration unit") + + async def generate_key_helper_fn( duration: Optional[str], models: list, @@ -1425,25 +1445,6 @@ async def generate_key_helper_fn( if token is None: token = f"sk-{secrets.token_urlsafe(16)}" - def _duration_in_seconds(duration: str): - match = re.match(r"(\d+)([smhd]?)", duration) - if not match: - raise ValueError("Invalid duration format") - - value, unit = match.groups() - value = int(value) - - if unit == "s": - return value - elif unit == "m": - return value * 60 - elif unit == "h": - return value * 3600 - elif unit == "d": - return value * 86400 - else: - raise ValueError("Unsupported duration unit") - if duration is None: # allow tokens that never expire expires = None else: @@ -2660,6 +2661,36 @@ async def generate_key_fn( elif key == "metadata" and value == {}: setattr(data, key, litellm.default_key_generate_params.get(key, {})) + # check if user set default key/generate params on config.yaml + if litellm.upperbound_key_generate_params is not None: + for elem in data: + # if key in litellm.upperbound_key_generate_params, use the min of value and litellm.upperbound_key_generate_params[key] + key, value = elem + if value is not None and key in litellm.upperbound_key_generate_params: + # if value is float/int + if key in [ + "max_budget", + "max_parallel_requests", + "tpm_limit", + "rpm_limit", + ]: + if value > litellm.upperbound_key_generate_params[key]: + # directly compare floats/ints + setattr( + data, key, litellm.upperbound_key_generate_params[key] + 
) + elif key == "budget_duration": + # budgets are in 1s, 1m, 1h, 1d, 1m (30s, 30m, 30h, 30d, 30m) + # compare the duration in seconds and max duration in seconds + upperbound_budget_duration = _duration_in_seconds( + duration=litellm.upperbound_key_generate_params[key] + ) + user_set_budget_duration = _duration_in_seconds(duration=value) + if user_set_budget_duration > upperbound_budget_duration: + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + data_json = data.json() # type: ignore # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users From a8bb3a4adac0d6dab8fbf2796c00ff2b95e801bc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:39:36 -0800 Subject: [PATCH 012/148] (test) test_upperbound_key_params --- litellm/tests/test_key_generate_prisma.py | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index de26168591..b4c86afb25 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -1279,6 +1279,40 @@ async def test_default_key_params(prisma_client): pytest.fail(f"Got exception {e}") +@pytest.mark.asyncio() +async def test_upperbound_key_params(prisma_client): + """ + - create key + - get key info + - assert key_name is not null + """ + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + litellm.upperbound_key_generate_params = { + "max_budget": 0.001, + "budget_duration": "1m", + } + await litellm.proxy.proxy_server.prisma_client.connect() + try: + request = GenerateKeyRequest( + max_budget=200000, + budget_duration="30d", + ) + key = await generate_key_fn(request) + generated_key = key.key + + result = await info_key_fn(key=generated_key) + key_info = result["info"] + # assert it used the upper bound for max_budget, and budget_duration + assert key_info["max_budget"] == 0.001 + assert key_info["budget_duration"] == "1m" + + print(result) + except Exception as e: + print("Got Exception", e) + pytest.fail(f"Got exception {e}") + + def test_get_bearer_token(): from litellm.proxy.proxy_server import _get_bearer_token From 255440ed36f2013ce2c82992f3cd08fecea93a9a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:40:52 -0800 Subject: [PATCH 013/148] (feat) proxy - upperbound params /key/generate --- litellm/proxy/proxy_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a752..bd844bd7ba 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,6 +73,9 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None + upperbound_key_generate_params: + max_budget: 100 + duration: "30d" # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 77273205d6fc89a6c20da65685158b0c90e928df Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:51:08 -0800 Subject: [PATCH 014/148] (fix) proxy startup test --- .../test_configs/test_config_no_auth.yaml | 95 ------------------- 1 file changed, 95 deletions(-) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index 8441018e35..ccebe016db 100644 --- 
a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,21 +9,11 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model -- litellm_params: - api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 - api_key: os.environ/AZURE_API_KEY - model: azure/chatgpt-v-2 - model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -36,93 +26,8 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 - model_name: test_openai_models -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - mode: embedding - model_name: azure-embedding-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 55848c55-4162-40f9-a6e2-9a722b9ef404 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34339b1e-e030-4bcc-a531-c48559f10ce4 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: f6f74e14-ac64-4403-9365-319e584dcdc5 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 9b1ef341-322c-410a-8992-903987fef439 - model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings -- litellm_params: - model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 - model_info: - mode: embedding - model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) -- litellm_params: - model: dall-e-3 - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_SWEDEN_API_BASE - api_key: os.environ/AZURE_SWEDEN_API_KEY - api_version: 2023-12-01-preview - model: azure/dall-e-3-test - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-06-01-preview - model: azure/ - model_info: - mode: image_generation - model_name: dall-e-2 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - base_model: text-embedding-ada-002 - mode: embedding - model_name: text-embedding-ada-002 -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 - model_name: test_openai_models From 
fe18c842745022599388b51c23416a636f538c7c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:53:31 -0800 Subject: [PATCH 015/148] (ci/cd) print debug info for test_proxy_gunicorn_startup_config_dict --- litellm/tests/test_proxy_startup.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_proxy_startup.py index 650e2f8a7a..a846c9f4a3 100644 --- a/litellm/tests/test_proxy_startup.py +++ b/litellm/tests/test_proxy_startup.py @@ -33,6 +33,11 @@ def test_proxy_gunicorn_startup_direct_config(): Test both approaches """ try: + from litellm._logging import verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -48,6 +53,11 @@ def test_proxy_gunicorn_startup_direct_config(): def test_proxy_gunicorn_startup_config_dict(): try: + from litellm._logging import verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" From de4d36cf969c29f6e77510928437ce133e11eed8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 06:46:49 -0800 Subject: [PATCH 016/148] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 34dc0e3b57..528bb19d2a 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -379,6 +379,7 @@ async def test_normal_router_tpm_limit(): ) except Exception as e: + print("Exception on test_normal_router_tpm_limit", e) assert e.status_code == 429 From 9a526b6cd4a0dd09b13ff002ae2f86ddeaf00804 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:22:54 -0800 Subject: [PATCH 017/148] fix(ollama_chat.py): fix ollama chat completion token counting --- litellm/llms/ollama_chat.py | 8 ++++++-- litellm/utils.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 95ff8dfaa3..3628ae2903 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -320,11 +320,15 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): model_response["choices"][0]["message"] = message else: model_response["choices"][0]["message"] = response_json["message"] + model_response["created"] = int(time.time()) - model_response["model"] = "ollama/" + data["model"] + model_response["model"] = "ollama_chat/" + data["model"] prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore completion_tokens = response_json.get( - "eval_count", litellm.token_counter(text=response_json["message"]) + "eval_count", + litellm.token_counter( + text=response_json["message"]["content"], count_response_tokens=True + ), ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, diff --git a/litellm/utils.py b/litellm/utils.py index 1e83a319f4..8491a1d5e1 100644 --- 
a/litellm/utils.py +++ b/litellm/utils.py @@ -983,9 +983,6 @@ class Logging: verbose_logger.debug( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) - verbose_logger.debug( - f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}" - ) if self.logger_fn and callable(self.logger_fn): try: self.logger_fn( From 9b2a2e6c8b925514fade0b793737db2161f687b3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:26:13 -0800 Subject: [PATCH 018/148] fix(utils.py): use print_verbose for statements, so debug can be seen when running sdk --- litellm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8491a1d5e1..5ccb85ef05 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -980,7 +980,7 @@ class Logging: self.model_call_details["log_event_type"] = "post_api_call" # User Logging -> if you pass in a custom logging function - verbose_logger.debug( + print_verbose( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) if self.logger_fn and callable(self.logger_fn): From d100bed3be7b69d8482b4e16d756b3bc28568a96 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 019/148] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..768e8dff3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From 50fb54883bb9656788fd93f421febadaaa6bfda7 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:43:47 -0800 Subject: [PATCH 020/148] fix(ollama_chat.py): explicitly state if ollama call is streaming or not --- litellm/llms/ollama_chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 3628ae2903..d1a439398b 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,7 +146,12 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "messages": messages, "options": optional_params} + data = { + "model": model, + "messages": messages, + "options": optional_params, + "stream": stream, + } ## LOGGING logging_obj.pre_call( input=None, From d1be2e008a4291962c056311b9c1efc8f91beb45 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:44:04 -0800 Subject: [PATCH 021/148] =?UTF-8?q?bump:=20version=201.22.6=20=E2=86=92=20?= =?UTF-8?q?1.22.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06dedbed63..be8c8966be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.6" +version = "1.22.7" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ 
requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.6" +version = "1.22.7" version_files = [ "pyproject.toml:^version" ] From 5be26109f5c96e01c778bf4bf1739d5d85744311 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 022/148] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the 
messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 33f5ab8ba57a638f3aeeb601418b70b7880cbc76 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 023/148] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + 
type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 0ddcebbf52370fd2bc9ca2f45385b24a9644e2fc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 024/148] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From 986a84a4f774b9020fe86569c7e5ee9a13cc1a00 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 025/148] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8df027b874..d0aded4e55 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( 
AuthenticationError, BadRequestError, @@ -2534,6 +2534,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 16a64e5c4beb229cebe500a602d77af671f2286e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 026/148] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a752..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,7 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 32639bf398ac076c0829a4fa8da5ca65230b6391 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 09:21:03 -0800 Subject: [PATCH 027/148] fix(utils.py): return finish reason for last vertex ai chunk --- litellm/proxy/proxy_server.py | 31 ++++++++++++++++++++++++++++--- litellm/utils.py | 34 ++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147..5c336ea91e 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1746,7 +1746,33 @@ async def async_data_generator(response, user_api_key_dict): done_message = "[DONE]" yield f"data: {done_message}\n\n" except Exception as e: - yield f"data: {str(e)}\n\n" + traceback.print_exc() + await proxy_logging_obj.post_call_failure_hook( + user_api_key_dict=user_api_key_dict, original_exception=e + ) + verbose_proxy_logger.debug( + f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. 
`litellm --model gpt-3.5-turbo --debug`" + ) + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] + ) + if user_debug: + traceback.print_exc() + + if isinstance(e, HTTPException): + raise e + else: + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}\n\n{error_traceback}" + + raise ProxyException( + message=getattr(e, "message", error_msg), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", 500), + ) def select_data_generator(response, user_api_key_dict): @@ -1754,7 +1780,7 @@ def select_data_generator(response, user_api_key_dict): # since boto3 - sagemaker does not support async calls, we should use a sync data_generator if hasattr( response, "custom_llm_provider" - ) and response.custom_llm_provider in ["sagemaker", "together_ai"]: + ) and response.custom_llm_provider in ["sagemaker"]: return data_generator( response=response, ) @@ -2239,7 +2265,6 @@ async def chat_completion( selected_data_generator = select_data_generator( response=response, user_api_key_dict=user_api_key_dict ) - return StreamingResponse( selected_data_generator, media_type="text/event-stream", diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..31eeaacab4 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -169,6 +169,8 @@ def map_finish_reason( return "stop" elif finish_reason == "SAFETY": # vertex ai return "content_filter" + elif finish_reason == "STOP": # vertex ai + return "stop" return finish_reason @@ -1305,7 +1307,7 @@ class Logging: ) if callback == "langfuse": global langFuseLogger - verbose_logger.debug("reaches langfuse for logging!") + verbose_logger.debug("reaches langfuse for success logging!") kwargs = {} for k, v in self.model_call_details.items(): if ( @@ -6706,7 +6708,13 @@ def exception_type( message=f"VertexAIException - {error_str}", model=model, llm_provider="vertex_ai", - response=original_exception.response, + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), ) elif ( "429 Quota exceeded" in error_str @@ -8341,13 +8349,20 @@ class CustomStreamWrapper: completion_obj["content"] = chunk.text elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"): try: - # print(chunk) - if hasattr(chunk, "text"): - # vertexAI chunks return - # MultiCandidateTextGenerationResponse(text=' ```python\n# This Python code says "Hi" 100 times.\n\n# Create', _prediction_response=Prediction(predictions=[{'candidates': [{'content': ' ```python\n# This Python code says "Hi" 100 times.\n\n# Create', 'author': '1'}], 'citationMetadata': [{'citations': None}], 'safetyAttributes': [{'blocked': False, 'scores': None, 'categories': None}]}], deployed_model_id='', model_version_id=None, model_resource_name=None, explanations=None), is_blocked=False, safety_attributes={}, candidates=[ ```python - # This Python code says "Hi" 100 times. - # Create]) - completion_obj["content"] = chunk.text + if hasattr(chunk, "candidates") == True: + try: + completion_obj["content"] = chunk.text + if hasattr(chunk.candidates[0], "finish_reason"): + model_response.choices[ + 0 + ].finish_reason = map_finish_reason( + chunk.candidates[0].finish_reason.name + ) + except: + if chunk.candidates[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. 
{str(chunk)}" + ) else: completion_obj["content"] = str(chunk) except StopIteration as e: @@ -8636,7 +8651,6 @@ class CustomStreamWrapper: or self.custom_llm_provider == "ollama_chat" or self.custom_llm_provider == "vertex_ai" ): - print_verbose(f"INSIDE ASYNC STREAMING!!!") print_verbose( f"value of async completion stream: {self.completion_stream}" ) From 974c0c2fd32b23dbc8bf5c180f5493c9e4f7705d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 028/148] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..c58eda09a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.1.0 # for vertex ai calls From 617716752e34043362d3c5497da472a5041fcdf6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 029/148] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 220a90527f053f9e5f782d005373e02740696559 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 030/148] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c..9339deb78d 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b..0311931b13 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From 51713765f7663d13730bc5abfdef62185cae1e20 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 031/148] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=20?= =?UTF-8?q?1.22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966be..17d80ae8ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From 3c71eb1e71d68e7c9a5f54bf9017fcedb206ac13 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 032/148] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + 
self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From af4c02fbb3ede629e3e3a0826e4878f4b545f1a1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 033/148] (fix) use semantic cache on proxy --- 
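Note: the previous patch (032) made the semantic cache's embedding model configurable and, on the async cache paths used by the proxy, dispatches the embedding call through the proxy router whenever the configured model name matches a router deployment. A condensed sketch of that dispatch, assuming the same call signatures shown in the caching.py hunks above; `_embed_prompt` is an illustrative name only and does not exist in litellm:

import litellm

async def _embed_prompt(prompt: str, embedding_model: str, llm_router, llm_model_list):
    # Embed via the proxy router when the configured embedding model is one of its
    # deployments (matched on model_name), so that deployment's credentials and
    # routing apply; otherwise fall back to a direct litellm call.
    router_model_names = (
        [m["model_name"] for m in llm_model_list] if llm_model_list is not None else []
    )
    if llm_router is not None and embedding_model in router_model_names:
        return await llm_router.aembedding(
            model=embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        )
    return await litellm.aembedding(
        model=embedding_model,
        input=prompt,
        cache={"no-store": True, "no-cache": True},
    )

This is why the one-line config change below points redis_semantic_cache_embedding_model at azure-embedding-model, an embedding deployment already defined in the proxy's model_list.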
litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From b845f437b73e9121d558c3959576f66137e33536 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 034/148] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
+ + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From 3b869d7cd1ff7f04310b23bd9d0b23402bffad08 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 035/148] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 6 ++++-- requirements.txt | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 0501ec7460..70e602e999 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1135,7 +1135,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1162,6 +1162,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): @@ -4067,7 +4070,6 @@ def _has_user_setup_sso(): async def shutdown_event(): global prisma_client, master_key, user_custom_auth, user_custom_key_generate if prisma_client: - verbose_proxy_logger.debug("Disconnecting from Prisma") await prisma_client.disconnect() diff --git a/requirements.txt b/requirements.txt index c58eda09a2..6b82c993a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.1.0 # for vertex ai calls From 167b60ace73eaea13e1a6cdd64c1911527e0b8e1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 036/148] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async 
def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From bac4ff7ea75dc2f19d0b86d51faf913b8a338f91 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 037/148] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 1e744558e2ade7c13ae02f4896faec9f3c0afda1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 038/148] (docs) litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files 
changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From 2df1872e5d406751dd57663397694c9df55c0cb6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 039/148] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From e008f676373e828fdc029f0299fb26a348ce4949 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 040/148] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 01f2ae4e54625cbed486349cbdf93d39c6c8c3f6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 041/148] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def 
test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From fe2f59b8e354765c0f0f03f2ba12a9be0031c3ef Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 042/148] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 5898e61b325d1d3632576fc1c4a3bb9fdce554e1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 043/148] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From b546971039993684a3e5f728054caa60f7868931 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 044/148] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ 
b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From 04a2e1f61047e18b6e3650116269d261a67a35eb Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 045/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From c8b2f0fd5d50da28228a0f3f82ed1bd02c92efc6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 046/148] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. 
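Note: this debug fix and the test fix in the next patch both rely on the limiter keying its counters by API key and calendar minute, with a 60-second TTL, so per-minute usage expires as the minute rolls over. A small illustrative sketch of that key scheme; the date and hour format strings are assumptions, only the minute format "%M" and the "::request_count" suffix are visible in these diffs:

from datetime import datetime

def request_count_cache_key(api_key: str) -> str:
    now = datetime.now()
    # "%Y-%m-%d" and "%H" are assumed formats here; "%M" matches the test diff below.
    precise_minute = f"{now.strftime('%Y-%m-%d')}-{now.strftime('%H')}-{now.strftime('%M')}"
    return f"{api_key}::{precise_minute}::request_count"

Two requests made in the same minute resolve to the same key and increment the same per-minute request/token counters; the next minute starts a fresh cache entry, and the ttl=60 used above keeps stale minutes from accumulating.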
From f33815aaf84fbfbb2e799e53b817a453da026ce9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 047/148] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From 29a9bd26ea96e93d3660150173bce9c724f7fe87 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 048/148] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: 
gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 5c71590b84021f52d4f457de0682749fe530cbc6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 049/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 97514b6bedaa306a81ab4dd7919d56e96890fe6c Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 12:57:05 -0800 Subject: [PATCH 050/148] fix(proxy_server.py): do a health check on db before returning if proxy ready (if db connected) --- litellm/proxy/proxy_server.py | 18 +++++++++--------- litellm/proxy/utils.py | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147..bd5b43f5f5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,16 +4051,16 @@ async def health_readiness(): cache_type = litellm.cache.type if prisma_client is not None: # if db passed in, check if it's connected - if prisma_client.db.is_connected() == True: - response_object = {"db": "connected"} 
+ await prisma_client.health_check() # test the db connection + response_object = {"db": "connected"} - return { - "status": "healthy", - "db": "connected", - "cache": cache_type, - "litellm_version": version, - "success_callbacks": litellm.success_callback, - } + return { + "status": "healthy", + "db": "connected", + "cache": cache_type, + "litellm_version": version, + "success_callbacks": litellm.success_callback, + } else: return { "status": "healthy", diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d7265..5c5b5b7727 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -472,8 +472,6 @@ class PrismaClient: reset_at: Optional[datetime] = None, ): try: - print_verbose("PrismaClient: get_data") - response: Any = None if token is not None or (table_name is not None and table_name == "key"): # check if plain text or hash @@ -885,6 +883,21 @@ class PrismaClient: ) raise e + async def health_check(self): + """ + Health check endpoint for the prisma client + """ + sql_query = """ + SELECT 1 + FROM "LiteLLM_VerificationToken" + LIMIT 1 + """ + + # Execute the raw query + # The asterisk before `user_id_list` unpacks the list into separate arguments + response = await self.db.query_raw(sql_query) + return response + class DBClient: """ From 1e0f7fe167300ee9735b9020e645dd641a20909f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 051/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From a8d81b91a7199f15b318e132da7662f3bf0a35c8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 052/148] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From dccd72fea116323078b9c9e46616c7adaa40e717 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 053/148] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def 
test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..fdca57e51f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 8c19e8f2eb5660a14510f3e935a09bcb3fe5f5ff Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 054/148] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From f57c054920994a57460a29e74d2d51b8eef53a8a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 055/148] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 59ea06d9c98858e210939c5dd23b5f3a337d9256 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 056/148] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 
1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 8d9c51b50ef6b2f0632b26c2c5224c968d5e9ca5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 057/148] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From ddfcccda38a8df37e0539511ffffa6b6a218dc88 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 058/148] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From f81f73f3b5341c073f91b9a16bdd57408133bdcf Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 059/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ 
b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 86a44c4a3dc7436ca0df69a78579393ecf41386d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 060/148] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. From d00ed06744a8e4fe22e5448ff280fa1255bdae9f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 061/148] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From d359465cccb5bdd7d4c5e5e45596c655f505adf9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 062/148] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: 
https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 3fc1ff0c73467ad04210a1e9d6242fd8b93bd7d0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 063/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From f2b56be491cc8b879f5fdaeec9ed32fc3535daf1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 064/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 50efa6a76dccec068a26fe4facbad9b0440d89c0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 065/148] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..fdca57e51f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From c97bb22f909a6e76b277fbc1a471abe617fd0f67 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 066/148] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = 
"litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From 1f4b2e34b92c47034b4552bb334f5dad234261a1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 067/148] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..768e8dff3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From 87a92aa65ebefe7752c2601a2de4954df815fc0e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 068/148] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c..9339deb78d 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b..0311931b13 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From 705f968136a61a0b25413dfdc5bda65d3e11d72f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 069/148] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=20?= =?UTF-8?q?1.22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966be..17d80ae8ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From 
f5f44e8bb9a246d2632c3246f0057810cab9ce66 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 070/148] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 34937c23aed53bf062455c08835f2f3c75ed62f4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 071/148] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 69064b033b925616671c3b574546520f2668e57a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 072/148] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml 
config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From a804fb7db8854d86981c5a027f50fbdfd2a2e7ce Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 073/148] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From ef27d1293eab59915748dcdc1f6853b684e5cf68 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 074/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 29a6f8b44963c376213d53f66956d2328e36914b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 075/148] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. 
From 61a4f4f948bf015084d30e5d3d78782df2c112f6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 076/148] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From e6fb8250557bdf68bf4478c0467ac5edff06a978 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 077/148] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: 
gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 9bef2a94d0e2ec65dfd3891f73250b430cd77013 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 078/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From ee0f5793dc52fb97281c5879edb5126a80fac14a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 079/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 659a460923b83156c398d1e83915ee251c1bf3df Mon Sep 17 00:00:00 2001 From: 
Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 080/148] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 31eeaacab4..62315b3d97 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2350,7 +2350,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 955dbb179c152b05b53019becba7e222ee30c24f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 081/148] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From be53cbc45ffbb99e69b6e72a445f76e540669a24 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 082/148] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 84a60c7957688070b84f434b665aa4178ddcb35b Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 083/148] test(test_completion.py): fix test --- 
docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From bdb1f596d5aa56783f9123dc5c9c2050601c8e5d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 084/148] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From bf020fcf33ba05a1b48b971e364d3266c769c51c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 085/148] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import 
IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a 
random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From dcd091164d945c001614438ef4e0465edc75d364 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 086/148] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # 
VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + 
num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. 
""" try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From d7116c4c5c3321dc33ddc8c9c68197fa9d349c03 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 087/148] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From ab4b31d45baaafe93128beaafb9f7da28f920c0e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 088/148] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 705531da10cfb7da496b5b97e4d562ec06357287 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 089/148] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis 
semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 30a209223bb74efa914c88eae7a717424e913367 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 090/148] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def 
async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From c9c3dbf3d43d768e2bae7c7b7c2a4bdae3adc2a4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 091/148] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + 
random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 6d7909685282bff82ba12fbd12f1ef9ee8d1af95 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 092/148] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From 268cec0db111fca9af514a2cddb3667f0dc43948 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 093/148] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index fdca57e51f..c25572c03c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import 
OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2533,6 +2533,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 4fd38dd9448b7cced755f13fb712d6a3c1c0cbb6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 094/148] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From af75d1076e6d80492bbcd6ef50bd61b887b0eba2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 095/148] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From d0bc5f984c82e6770fe4891a7b000427529b926e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 096/148] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = 
results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From d5b500c0f17dee2c8b54b814458d686440bbdf7c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 097/148] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache 
get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From 157c8d05429a74c482f1a4942210ab91dab26440 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 098/148] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 51e0dd3471db39c694d4bf37190a889fe68e16ec Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 099/148] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to 
True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. + + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From 48be4a2695e48ced9277db509723108d71d74941 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 100/148] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index bd5b43f5f5..046bc71b05 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 7df0a10361e13702e029a3980361180af45abc9b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 101/148] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) 
+ print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From 014a7833418fbd02590bf7d4f9a83bddb8476690 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 102/148] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 71cb2af495bfab66075e172c576f8ea274014c04 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 103/148] (docs) 
litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From 0162a3e9f4afb1cf0076202c9cee76c28362165f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 104/148] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From f727f987d2120ea8c1465dde47eb90357da0cf50 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 105/148] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From 23c684496e418d07cfb515f6a1457f8f1ac8fab2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 106/148] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading 
from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 01c46ce192fc3bf3b5190d312b94e9937e793a89 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 107/148] test(test_completion.py): fix test --- docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From a13a45896a87ed9018c4fa2e5b14d4355d427cc0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 108/148] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From bac5b40248b5af0441d44bda89b166325e05cd27 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 109/148] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 
+223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py 
index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From 8c49cf0bbecbb31b7ecea4b057e948243c617a2a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 110/148] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # 
print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + 
model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. 
""" try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From ff70e0ea66ad2df9ab773752f5ab40b90f490531 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 111/148] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 2adf88df240c5efe1a8dc05c586a206a07019987 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 112/148] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 2084d0e0403dc6119a1348bdf8a106b64624df58 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 113/148] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis 
semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From bc450487d07e1a959449c15157fe13950822b8a6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 114/148] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def 
async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From f706b42926d1ce8933fbfe46b74a99bee4d88bff Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 115/148] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + 
random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 7166d63d8700ed203cb43d33b793434c3ad50806 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 116/148] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From e2ccdb7a1b22ba804f0d40968baaa49569084412 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 117/148] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 62315b3d97..b37c68d655 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import 
OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2535,6 +2535,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From e8fbbd0722f0a8f660725e9d97a22975525e4a91 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 118/148] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 6b83d0219e720d1be129c64f952f72c7a8262352 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 119/148] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From a900b0128be71b3f6b71f8366b661096a083ca84 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 120/148] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = 
results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 1ac003e8fe11983f87e02c791938acb60861fcf6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 121/148] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache 
get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From d92f2d777f813f00208dbbd6c7b45b6f8e1823e7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 122/148] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 0e7fe751a556b5d3ac2ec2df28b744a6241b1a48 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 123/148] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to 
True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. + + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From f8472fe3cfd0ead1aa3c302bd37a16f18d353d62 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 124/148] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 5c336ea91e..30233fc137 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From d6a76da74ebdf57450e8538fe5cd87c6ed9afa20 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 125/148] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) 
+ print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From e96c494c5b20b36c885ae68f27ba8a38cf128e2c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 126/148] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 842b0fd9cc72801beb088e0a332a60b7dee77911 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 127/148] (docs) 
litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From 583cae96b2a4e10c256de7fcf3a99f707ce468dc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 128/148] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From bca633d3235791cd82570d081c17679508ed5886 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 129/148] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From e564cf6869a517481fa9baaa72fd388e30bccc7a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 130/148] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading 
from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 672ba8fb12fe9f48cb0f7015e9d071dddff91e91 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 131/148] test(test_completion.py): fix test --- docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From ab4e7f2be99499aad1d77a06779dfbc7afc4f621 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:35:25 -0800 Subject: [PATCH 132/148] (feat) show semantic-cache on health/readiness --- litellm/caching.py | 3 +++ litellm/proxy/proxy_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/litellm/caching.py b/litellm/caching.py index 6bf53ea451..f996a58735 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -531,6 +531,9 @@ class RedisSemanticCache(BaseCache): return None pass + async def _index_info(self): + return await self.index.ainfo() + class S3Cache(BaseCache): def __init__( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 661e932f37..427bb88a9c 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,8 +4051,18 @@ async def health_readiness(): cache_type = None if litellm.cache is not None: + from litellm.caching import RedisSemanticCache + cache_type = litellm.cache.type + if isinstance(litellm.cache.cache, RedisSemanticCache): + # ping the cache + try: + index_info = await litellm.cache.cache._index_info() + except Exception as e: + index_info = "index does not exist - error: " + str(e) + cache_type = {"type": cache_type, "index_info": index_info} + if prisma_client is not None: # if db passed in, check if it's connected if prisma_client.db.is_connected() == True: response_object = {"db": "connected"} From d20abfb0b6af53030e459138bc3e126d632c1514 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:36:35 -0800 Subject: [PATCH 133/148] (fix) dockerfile requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f2bff2680b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==4.6.0 # caching +redis==5.0.0 # caching redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db From 83d5b2a8842caab3bc5e0fb34367b229d88d39e7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:41:22 -0800 Subject: [PATCH 134/148] (fix) langfuse show semantic-similarity in tags --- litellm/integrations/langfuse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 3c3e793dfb..3031868ec7 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -255,6 +255,7 @@ class LangFuseLogger: if key in [ "user_api_key", "user_api_key_user_id", + "semantic-similarity", ]: tags.append(f"{key}:{value}") if "cache_hit" in kwargs: From ffa15a8a45cf36781ab4d24278a2ad25c48199d7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:52:32 -0800 Subject: [PATCH 135/148] (fix) redisvl requirements.txt issue --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f2bff2680b..55c5f14568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==5.0.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From 2381e025ade34c29ce039314abe2c5a49cc1fe80 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:55:51 -0800 Subject: [PATCH 136/148] refactor(main.py): trigger deploy n --- litellm/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index 384dadc32d..b18221607f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,7 +10,6 @@ import os, openai, sys, json, inspect, uuid, datetime, threading from typing import Any, Literal, Union from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx From 71da1b1e6964e7a3fda14fd7d72a3dfd500b7bde Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 137/148] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 326544f41e..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_embedding_model: azure-embedding-model + redis_semantic_cache_use_async: True # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 6b631a6c3a18acb5969338a7a5f02478d76af96c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 138/148] (fix) add redisvl==0.0.7 --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ 
gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching -numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From a3fac3db12832c7da82dc4d9e7eb7c0361902dff Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 139/148] (feat) redis-semantic cache on proxy --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 9bf57170e72e252508bc2388d27dc7a881cfa31c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:35:25 -0800 Subject: [PATCH 140/148] (feat) show semantic-cache on health/readiness --- litellm/caching.py | 3 +++ litellm/proxy/proxy_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/litellm/caching.py b/litellm/caching.py index 6bf53ea451..f996a58735 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -531,6 +531,9 @@ class RedisSemanticCache(BaseCache): return None pass + async def _index_info(self): + return await self.index.ainfo() + class S3Cache(BaseCache): def __init__( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 046bc71b05..7ac1521ba5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,8 +4051,18 @@ async def health_readiness(): cache_type = None if litellm.cache is not None: + from litellm.caching import RedisSemanticCache + cache_type = litellm.cache.type + if isinstance(litellm.cache.cache, RedisSemanticCache): + # ping the cache + try: + index_info = await litellm.cache.cache._index_info() + except Exception as e: + index_info = "index does not exist - error: " + str(e) + cache_type = {"type": cache_type, "index_info": index_info} + if prisma_client is not None: # if db passed in, check if it's connected await prisma_client.health_check() # test the db connection response_object = {"db": "connected"} From 17185976c42ed1ee2c7b2a100c291b2d6e2b5ed7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:36:35 -0800 Subject: [PATCH 141/148] (fix) dockerfile requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f2bff2680b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==4.6.0 # caching +redis==5.0.0 # caching redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db From 737111e620b6c0d6177e6b21d1f31a2138b8264a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:41:22 -0800 Subject: [PATCH 142/148] (fix) langfuse show semantic-similarity in tags --- litellm/integrations/langfuse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 3c3e793dfb..3031868ec7 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py 
@@ -255,6 +255,7 @@ class LangFuseLogger: if key in [ "user_api_key", "user_api_key_user_id", + "semantic-similarity", ]: tags.append(f"{key}:{value}") if "cache_hit" in kwargs: From 22a65638695dccb5727bf70965f374e26c7e434a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:52:32 -0800 Subject: [PATCH 143/148] (fix) redisvl requirements.txt issue --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f2bff2680b..55c5f14568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==5.0.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From 2888b11a06b6022ad59a151bb79834f44bc0b6d9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:55:51 -0800 Subject: [PATCH 144/148] refactor(main.py): trigger deploy n --- litellm/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index 384dadc32d..b18221607f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,7 +10,6 @@ import os, openai, sys, json, inspect, uuid, datetime, threading from typing import Any, Literal, Union from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx From 7e46156da2cdeda82179c39cb494b7a9e635501d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 145/148] (fix) add redisvl==0.0.7 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 55c5f14568..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,8 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==5.0.0 # caching -numpy==1.24.3 # semantic caching +redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 540a6068d6f93f9d7e8181231f5f0428d62142b2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 146/148] (feat) redis-semantic cache on proxy --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From ad4b6be3ee9731642f02fceefb4b4c1d9e834d74 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 14:00:27 -0800 Subject: [PATCH 147/148] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index b075e48190..80a4372a57 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check 
the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 9ebcf3c94496435fce2f4154f3c70616a47b35d3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 14:10:56 -0800 Subject: [PATCH 148/148] build(requirements.txt): fix dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f78d766ee3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions
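
The core behavior added by this series is a similarity-threshold check over prompt embeddings. As a rough, self-contained sketch of that decision (not the shipped `RedisSemanticCache` code, which embeds the prompt via the configurable `embedding_model` and queries a `redisvl` vector index for `vector_distance`), the snippet below uses a toy in-memory store and a hand-rolled cosine similarity; the helper names, the toy store, and the example vectors are assumptions made for illustration only.

```python
# Illustrative sketch only: it mirrors the similarity-threshold decision made by
# RedisSemanticCache, using a toy in-memory store instead of a redisvl vector index.
import math
from typing import List, Optional, Tuple


def cosine_similarity(a: List[float], b: List[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


def semantic_lookup(
    prompt_embedding: List[float],
    store: List[Tuple[List[float], str]],  # (embedding, cached response) pairs
    similarity_threshold: float = 0.8,  # same knob as cache_params.similarity_threshold
) -> Optional[str]:
    """Return the closest cached response, but only if it clears the threshold;
    anything below the threshold is treated as a cache miss."""
    if not store:
        return None
    best_embedding, best_response = max(
        store, key=lambda item: cosine_similarity(prompt_embedding, item[0])
    )
    similarity = cosine_similarity(prompt_embedding, best_embedding)
    # mirrors: if similarity > self.similarity_threshold:  # cache hit !
    return best_response if similarity > similarity_threshold else None


if __name__ == "__main__":
    store = [([1.0, 0.0], "cached one sentence poem")]
    print(semantic_lookup([0.9, 0.1], store))  # near-duplicate prompt -> cache hit
    print(semantic_lookup([0.0, 1.0], store))  # unrelated prompt -> None (cache miss)
```

Raising `similarity_threshold` toward 1.0 makes the cache behave closer to exact-match caching, while lower values return cached responses for looser paraphrases; that is the trade-off behind the `similarity_threshold: 0.8` value used in the docs changes above.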