From 8cee267a5b46635e46df30d0c940cb4fbdc07d66 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Wed, 3 Jan 2024 12:42:30 +0530
Subject: [PATCH] fix(caching.py): support ttl, s-max-age, and no-cache cache controls

https://github.com/BerriAI/litellm/issues/1306
---
 docs/my-website/docs/proxy/caching.md | 150 +++++++++++++++-----------
 litellm/caching.py | 42 +++++++-
 litellm/main.py | 3 +
 litellm/tests/test_caching.py | 42 +++++++-
 litellm/utils.py | 21 ++--
 5 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index bb8399f1e..b33ed235d 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Caching
 Cache LLM Responses
 
@@ -41,7 +44,13 @@ REDIS_ = ""
 $ litellm --config /path/to/config.yaml
 ```
 
+
+
 ## Using Caching - /chat/completions
+
+
+
+
 Send the same request twice:
 ```shell
 curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +69,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
     "temperature": 0.7
    }'
 ```
+
+
 
-## Using Caching - /embeddings
 Send the same request twice:
 ```shell
 curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +88,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
   "input": ["write a litellm poem"]
   }'
 ```
+
+
 
 ## Advanced
 ### Set Cache Params on config.yaml
@@ -103,78 +115,86 @@ litellm_settings:
   supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
 ```
 
-### Cache-Controls on requests
+### Turn on / off caching per request
 
-Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
+The proxy supports 3 cache controls:
 
-Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
+- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
+- `s-max-age`: Will only accept cached responses that are no older than the user-defined limit (in seconds).
+- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
 
-```javascript
-const { OpenAI } = require('openai');
+**Turn off caching**
 
-const openai = new OpenAI({
-  apiKey: "sk-1234", // This is the default and can be omitted
-  baseURL: "http://0.0.0.0:8000"
-});
+```python
+import os
+from openai import OpenAI
 
-async function main() {
-  const chatCompletion = await openai.chat.completions.create({
-    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'gpt-3.5-turbo',
-  }, {"headers": {
-    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
-  }});
-}
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
 
-main();
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "no-cache": True # will not return a cached response
+    }
+)
 ```
 
-### Override caching per `chat/completions` request
-Caching can be switched on/off per `/chat/completions` request
-- Caching **on** for individual completion - pass `caching=True`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": true
-    }'
-  ```
-- Caching **off** for individual completion - pass `caching=False`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": false
-    }'
-  ```
+**Turn on caching**
+```python
+import os
+from openai import OpenAI
 
-### Override caching per `/embeddings` request
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
 
-Caching can be switched on/off per `/embeddings` request
-- Caching **on** for embedding - pass `caching=True`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-  --header 'Content-Type: application/json' \
-  --data ' {
-  "model": "text-embedding-ada-002",
-  "input": ["write a litellm poem"],
-  "caching": true
-  }'
-  ```
-- Caching **off** for completion - pass `caching=False`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-  --header 'Content-Type: application/json' \
-  --data ' {
-  "model": "text-embedding-ada-002",
-  "input": ["write a litellm poem"],
-  "caching": false
-  }'
-  ```
\ No newline at end of file
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "ttl": 600 # caches response for 10 minutes
+    }
+)
+```
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "s-max-age": 600 # only get responses cached within last 10 minutes
+    }
+)
+```
\ No newline at end of file
diff --git a/litellm/caching.py b/litellm/caching.py
index ce3930550..b4072bb8b 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -342,7 +342,38 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
+                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
kwargs.get("cache", {}).get("s-max-age", float("inf")) cached_result = self.cache.get_cache(cache_key) + # Check if a timestamp was stored with the cached response + if ( + cached_result is not None + and isinstance(cached_result, dict) + and "timestamp" in cached_result + and max_age is not None + ): + timestamp = cached_result["timestamp"] + current_time = time.time() + + # Calculate age of the cached response + response_age = current_time - timestamp + + # Check if the cached response is older than the max-age + if response_age > max_age: + print_verbose( + f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s" + ) + return None # Cached response is too old + + # If the response is fresh, or there's no max-age requirement, return the cached response + # cached_response is in `b{} convert it to ModelResponse + cached_response = cached_result.get("response") + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response return cached_result except Exception as e: logging.debug(f"An exception occurred: {traceback.format_exc()}") @@ -367,7 +398,16 @@ class Cache: if cache_key is not None: if isinstance(result, litellm.ModelResponse): result = result.model_dump_json() - self.cache.set_cache(cache_key, result, **kwargs) + + ## Get Cache-Controls ## + if kwargs.get("cache", None) is not None and isinstance( + kwargs.get("cache"), dict + ): + for k, v in kwargs.get("cache").items(): + if k == "ttl": + kwargs["ttl"] = v + cached_data = {"timestamp": time.time(), "response": result} + self.cache.set_cache(cache_key, cached_data, **kwargs) except Exception as e: print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}") traceback.print_exc() diff --git a/litellm/main.py b/litellm/main.py index a487563ba..c5340e975 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -468,6 +468,7 @@ def completion( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { @@ -2209,6 +2210,7 @@ def embedding( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { @@ -2904,6 +2906,7 @@ def image_generation( "preset_cache_key", "caching_groups", "ttl", + "cache", ] default_params = openai_params + litellm_params non_default_params = { diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 45983557c..2b9c472af 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1,4 +1,4 @@ -import sys, os +import sys, os, uuid import time import traceback from dotenv import load_dotenv @@ -81,6 +81,46 @@ def test_caching_with_ttl(): pytest.fail(f"Error occurred: {e}") +def test_caching_with_cache_controls(): + try: + litellm.set_verbose = True + litellm.cache = Cache() + message = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}] + ## TTL = 0 + response1 = completion( + model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0} + ) + response2 = completion( + model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10} + ) + print(f"response1: {response1}") + print(f"response2: {response2}") + assert ( + response2["choices"][0]["message"]["content"] + != response1["choices"][0]["message"]["content"] + ) + message = [{"role": "user", "content": f"Hey, how's it going? 
+        ## TTL = 5
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            == response1["choices"][0]["message"]["content"]
+        )
+    except Exception as e:
+        print(f"error occurred: {traceback.format_exc()}")
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_caching_with_cache_controls()
+
+
 def test_caching_with_models_v2():
     messages = [
         {"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}
diff --git a/litellm/utils.py b/litellm/utils.py
index e9afbfb1e..f62c79c22 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1971,12 +1971,12 @@ def client(original_function):
            print_verbose(
                f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
            )
-            # if caching is false, don't run this
+            # if caching is false or cache["no-cache"]==True, don't run this
            if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (kwargs.get("cache", None) is not None and kwargs.get("cache").get("no-cache", False) != True)
+            ):  # allow users to control returning cached responses from the completion function
                # checking cache
                print_verbose(f"INSIDE CHECKING CACHE")
                if (
@@ -2148,10 +2148,13 @@ def client(original_function):
            )
            # if caching is false, don't run this
            if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (
+                    kwargs.get("cache", None) is not None
+                    and kwargs.get("cache").get("no-cache", False) != True
+                )
+            ):  # allow users to control returning cached responses from the completion function
                # checking cache
                print_verbose(f"INSIDE CHECKING CACHE")
                if (
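
For readers following the change, the core idea is: each cache entry is stored as `{"timestamp": ..., "response": ...}`, and the per-request `cache` dict is consulted on reads (`s-max-age`, `no-cache`) and on writes (`ttl`). Below is a minimal, self-contained sketch of that gating logic against a plain in-memory dict. The names `_store`, `add_cache`, and `get_cache` are illustrative only, not litellm's API; the real implementation in `litellm/caching.py` additionally serializes `ModelResponse` objects and forwards `ttl` to the configured local/Redis backend.

```python
import time

# Illustrative in-memory store; litellm's real backends (local dict / Redis) differ,
# but the handling of the per-request cache controls mirrors what this patch adds.
_store: dict = {}


def add_cache(key, response, cache_controls=None):
    # Store the response together with a timestamp so `s-max-age` can be
    # enforced on later reads. A `ttl` control would normally become a backend
    # expiry (e.g. Redis SETEX); here it is only recorded for illustration.
    cache_controls = cache_controls or {}
    _store[key] = {
        "timestamp": time.time(),
        "response": response,
        "ttl": cache_controls.get("ttl"),
    }


def get_cache(key, cache_controls=None):
    # `no-cache` bypasses the cache entirely; `s-max-age` rejects entries
    # older than the caller's limit (in seconds).
    cache_controls = cache_controls or {}
    if cache_controls.get("no-cache", False):
        return None
    entry = _store.get(key)
    if entry is None:
        return None
    max_age = cache_controls.get("s-max-age", float("inf"))
    if time.time() - entry["timestamp"] > max_age:
        return None  # cached response is too old for this caller
    return entry["response"]


# Example: a fresh entry is served within the age limit, while `no-cache`
# (or an `s-max-age` smaller than the entry's age) forces a real call.
add_cache("prompt-key", {"choices": ["hi"]}, cache_controls={"ttl": 600})
assert get_cache("prompt-key", cache_controls={"s-max-age": 600}) is not None
assert get_cache("prompt-key", cache_controls={"no-cache": True}) is None
```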