From 8cee267a5b46635e46df30d0c940cb4fbdc07d66 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Wed, 3 Jan 2024 12:42:30 +0530
Subject: [PATCH] fix(caching.py): support ttl, s-max-age, and no-cache cache controls

https://github.com/BerriAI/litellm/issues/1306
---
 docs/my-website/docs/proxy/caching.md | 150 +++++++++++++++-----------
 litellm/caching.py | 42 +++++++-
 litellm/main.py | 3 +
 litellm/tests/test_caching.py | 42 +++++++-
 litellm/utils.py | 21 ++--
 5 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index bb8399f1e..b33ed235d 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Caching
 Cache LLM Responses
 
@@ -41,7 +44,13 @@ REDIS_ = ""
 $ litellm --config /path/to/config.yaml
 ```
 
+
+
 ## Using Caching - /chat/completions
+
+
+
+
 Send the same request twice:
 ```shell
 curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +69,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
     "temperature": 0.7
    }'
 ```
+
+
 
-## Using Caching - /embeddings
 Send the same request twice:
 ```shell
 curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +88,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
   "input": ["write a litellm poem"]
   }'
 ```
+
+
 
 ## Advanced
 ### Set Cache Params on config.yaml
@@ -103,78 +115,86 @@ litellm_settings:
   supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
 ```
 
-### Cache-Controls on requests
+### Turn on / off caching per request
 
-Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
+The proxy supports 3 cache controls:
 
-Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
+- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
+- `s-max-age`: Will only accept cached responses that are no older than the user-defined limit (in seconds).
+- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
 
-```javascript
-const { OpenAI } = require('openai');
+**Turn off caching**
 
-const openai = new OpenAI({
-  apiKey: "sk-1234", // This is the default and can be omitted
-  baseURL: "http://0.0.0.0:8000"
-});
+```python
+import os
+from openai import OpenAI
 
-async function main() {
-  const chatCompletion = await openai.chat.completions.create({
-    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'gpt-3.5-turbo',
-  }, {"headers": {
-    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
-  }});
-}
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
 
-main();
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "no-cache": True # will not return a cached response
+    }
+)
 ```
 
-### Override caching per `chat/completions` request
-Caching can be switched on/off per `/chat/completions` request
-- Caching **on** for individual completion - pass `caching=True`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": true
-    }'
-  ```
-- Caching **off** for individual completion - pass `caching=False`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": false
-    }'
-  ```
+**Turn on caching**
+```python
+import os
+from openai import OpenAI
 
-### Override caching per `/embeddings` request
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
 
-Caching can be switched on/off per `/embeddings` request
-- Caching **on** for embedding - pass `caching=True`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-  --header 'Content-Type: application/json' \
-  --data ' {
-  "model": "text-embedding-ada-002",
-  "input": ["write a litellm poem"],
-  "caching": true
-  }'
-  ```
-- Caching **off** for completion - pass `caching=False`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-  --header 'Content-Type: application/json' \
-  --data ' {
-  "model": "text-embedding-ada-002",
-  "input": ["write a litellm poem"],
-  "caching": false
-  }'
-  ```
\ No newline at end of file
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "ttl": 600 # caches response for 10 minutes
+    }
+)
+```
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "s-max-age": 600 # only get responses cached within last 10 minutes
+    }
+)
+```
\ No newline at end of file
diff --git a/litellm/caching.py b/litellm/caching.py
index ce3930550..b4072bb8b 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -342,7 +342,38 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
+                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
kwargs.get("cache", {}).get("s-max-age", float("inf")) cached_result = self.cache.get_cache(cache_key) + # Check if a timestamp was stored with the cached response + if ( + cached_result is not None + and isinstance(cached_result, dict) + and "timestamp" in cached_result + and max_age is not None + ): + timestamp = cached_result["timestamp"] + current_time = time.time() + + # Calculate age of the cached response + response_age = current_time - timestamp + + # Check if the cached response is older than the max-age + if response_age > max_age: + print_verbose( + f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s" + ) + return None # Cached response is too old + + # If the response is fresh, or there's no max-age requirement, return the cached response + # cached_response is in `b{} convert it to ModelResponse + cached_response = cached_result.get("response") + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response return cached_result except Exception as e: logging.debug(f"An exception occurred: {traceback.format_exc()}") @@ -367,7 +398,16 @@ class Cache: if cache_key is not None: if isinstance(result, litellm.ModelResponse): result = result.model_dump_json() - self.cache.set_cache(cache_key, result, **kwargs) + + ## Get Cache-Controls ## + if kwargs.get("cache", None) is not None and isinstance( + kwargs.get("cache"), dict + ): + for k, v in kwargs.get("cache").items(): + if k == "ttl": + kwargs["ttl"] = v + cached_data = {"timestamp": time.time(), "response": result} + self.cache.set_cache(cache_key, cached_data, **kwargs) except Exception as e: print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}") traceback.print_exc() diff --git a/litellm/main.py b/litellm/main.py index a487563ba..c5340e975 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -468,6 +468,7 @@ def completion( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { @@ -2209,6 +2210,7 @@ def embedding( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { @@ -2904,6 +2906,7 @@ def image_generation( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 45983557c..2b9c472af 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1,4 +1,4 @@ -import sys, os +import sys, os, uuid import time import traceback from dotenv import load_dotenv @@ -81,6 +81,46 @@ def test_caching_with_ttl(): pytest.fail(f"Error occurred: {e}") +def test_caching_with_cache_controls(): + try: + litellm.set_verbose = True + litellm.cache = Cache() + message = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}] + ## TTL = 0 + response1 = completion( + model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0} + ) + response2 = completion( + model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10} + ) + print(f"response1: {response1}") + print(f"response2: {response2}") + assert ( + response2["choices"][0]["message"]["content"] + != response1["choices"][0]["message"]["content"] + ) + message = [{"role": "user", "content": f"Hey, how's it going? 
+        ## TTL = 5
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            == response1["choices"][0]["message"]["content"]
+        )
+    except Exception as e:
+        print(f"error occurred: {traceback.format_exc()}")
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_caching_with_cache_controls()
+
+
 def test_caching_with_models_v2():
     messages = [
         {"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}
diff --git a/litellm/utils.py b/litellm/utils.py
index e9afbfb1e..f62c79c22 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1971,12 +1971,12 @@ def client(original_function):
            print_verbose(
                f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
            )
-            # if caching is false, don't run this
+            # if caching is false or cache["no-cache"]==True, don't run this
            if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (kwargs.get("cache", None) is not None and kwargs.get("cache").get("no-cache", False) != True)
+            ):  # allow users to control returning cached responses from the completion function
                # checking cache
                print_verbose(f"INSIDE CHECKING CACHE")
                if (
@@ -2148,10 +2148,13 @@ def client(original_function):
            )
            # if caching is false, don't run this
            if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (
+                    kwargs.get("cache", None) is not None
+                    and kwargs.get("cache").get("no-cache", False) != True
+                )
+            ):  # allow users to control returning cached responses from the completion function
                # checking cache
                print_verbose(f"INSIDE CHECKING CACHE")
                if (
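
For readers following the change, the core idea is: each cache entry is stored as `{"timestamp": ..., "response": ...}`, and the per-request `cache` dict is consulted on reads (`s-max-age`, `no-cache`) and on writes (`ttl`). Below is a minimal, self-contained sketch of that gating logic against a plain in-memory dict. The names `_store`, `add_cache`, and `get_cache` are illustrative only, not litellm's API; the real implementation in `litellm/caching.py` additionally serializes `ModelResponse` objects and forwards `ttl` to the configured local/Redis backend.

```python
import time

# Illustrative in-memory store; litellm's real backends (local dict / Redis) differ,
# but the handling of the per-request cache controls mirrors what this patch adds.
_store: dict = {}


def add_cache(key, response, cache_controls=None):
    # Store the response together with a timestamp so `s-max-age` can be
    # enforced on later reads. A `ttl` control would normally become a backend
    # expiry (e.g. Redis SETEX); here it is only recorded for illustration.
    cache_controls = cache_controls or {}
    _store[key] = {
        "timestamp": time.time(),
        "response": response,
        "ttl": cache_controls.get("ttl"),
    }


def get_cache(key, cache_controls=None):
    # `no-cache` bypasses the cache entirely; `s-max-age` rejects entries
    # older than the caller's limit (in seconds).
    cache_controls = cache_controls or {}
    if cache_controls.get("no-cache", False):
        return None
    entry = _store.get(key)
    if entry is None:
        return None
    max_age = cache_controls.get("s-max-age", float("inf"))
    if time.time() - entry["timestamp"] > max_age:
        return None  # cached response is too old for this caller
    return entry["response"]


# Example: a fresh entry is served within the age limit, while `no-cache`
# (or an `s-max-age` smaller than the entry's age) forces a real call.
add_cache("prompt-key", {"choices": ["hi"]}, cache_controls={"ttl": 600})
assert get_cache("prompt-key", cache_controls={"s-max-age": 600}) is not None
assert get_cache("prompt-key", cache_controls={"no-cache": True}) is None
```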