From b0827a87b2a66bb2e375b50de36eb48e43239e53 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 4 Jan 2024 11:41:23 +0530
Subject: [PATCH] fix(caching.py): support s-maxage param for cache controls

---
 docs/my-website/docs/proxy/caching.md | 4 ++--
 litellm/caching.py                    | 8 ++++++--
 litellm/tests/test_caching.py         | 8 +++++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 77743e77c..9132854e9 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -161,7 +161,7 @@ litellm_settings:
 The proxy support 3 cache-controls:
 
 - `ttl`: Will cache the response for the user-defined amount of time (in seconds).
-- `s-max-age`: Will only accept cached responses that are within user-defined range (in seconds).
+- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
 - `no-cache`: Will not return a cached response, but instead call the actual endpoint.
 
 [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
@@ -237,7 +237,7 @@ chat_completion = client.chat.completions.create(
     ],
     model="gpt-3.5-turbo",
     cache={
-        "s-max-age": 600 # only get responses cached within last 10 minutes
+        "s-maxage": 600 # only get responses cached within last 10 minutes
     }
 )
 ```
diff --git a/litellm/caching.py b/litellm/caching.py
index 0b1e18e46..67d57b6e8 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -11,6 +11,7 @@ import litellm
 import time, logging
 import json, traceback, ast, hashlib
 from typing import Optional, Literal, List, Union, Any
+from openai._models import BaseModel as OpenAIObject
 
 
 def print_verbose(print_statement):
@@ -472,7 +473,10 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
-                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
+                cache_control_args = kwargs.get("cache", {})
+                max_age = cache_control_args.get(
+                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
+                )
                 cached_result = self.cache.get_cache(cache_key)
                 # Check if a timestamp was stored with the cached response
                 if (
@@ -529,7 +533,7 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
-                if isinstance(result, litellm.ModelResponse):
+                if isinstance(result, OpenAIObject):
                     result = result.model_dump_json()
 
                 ## Get Cache-Controls ##
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index c894331ba..3b7b1b37c 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -91,7 +91,7 @@ def test_caching_with_cache_controls():
             model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
         )
         response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10}
+            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 10}
         )
         print(f"response1: {response1}")
         print(f"response2: {response2}")
@@ -105,7 +105,7 @@ def test_caching_with_cache_controls():
            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
         )
         response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 5}
         )
         print(f"response1: {response1}")
         print(f"response2: {response2}")
@@ -167,6 +167,8 @@ small text
 def test_embedding_caching():
     import time
 
+    # litellm.set_verbose = True
+
     litellm.cache = Cache()
     text_to_embed = [embedding_large_text]
     start_time = time.time()
@@ -182,7 +184,7 @@ def test_embedding_caching():
         model="text-embedding-ada-002", input=text_to_embed, caching=True
     )
     end_time = time.time()
-    print(f"embedding2: {embedding2}")
+    # print(f"embedding2: {embedding2}")
     print(f"Embedding 2 response time: {end_time - start_time} seconds")
 
     litellm.cache = None
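
For quick reference, a minimal usage sketch of the renamed cache control, mirroring the updated test in litellm/tests/test_caching.py. This is an illustration under assumptions, not part of the patch: the message content and the 600-second window are made up, and OPENAI_API_KEY is assumed to be set.

# Minimal sketch of the s-maxage cache control (mirrors the updated tests; values are illustrative).
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # in-memory cache, as used in the tests

messages = [{"role": "user", "content": "Hello, how are you?"}]

# First call populates the cache.
response1 = completion(model="gpt-3.5-turbo", messages=messages)

# Second call only accepts a cached entry written within the last 10 minutes.
# After this patch, both the old "s-max-age" spelling and "s-maxage" are honored.
response2 = completion(
    model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 600}
)

litellm.cache = None  # disable caching again, as the tests do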