From b0827a87b2a66bb2e375b50de36eb48e43239e53 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 4 Jan 2024 11:41:23 +0530
Subject: [PATCH] fix(caching.py): support s-maxage param for cache controls

---
 docs/my-website/docs/proxy/caching.md | 4 ++--
 litellm/caching.py                    | 8 ++++++--
 litellm/tests/test_caching.py         | 8 +++++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 77743e77c..9132854e9 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -161,7 +161,7 @@ litellm_settings:
 The proxy support 3 cache-controls:
 
 - `ttl`: Will cache the response for the user-defined amount of time (in seconds).
-- `s-max-age`: Will only accept cached responses that are within user-defined range (in seconds).
+- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
 - `no-cache`: Will not return a cached response, but instead call the actual endpoint.
 
 [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
@@ -237,7 +237,7 @@ chat_completion = client.chat.completions.create(
     ],
     model="gpt-3.5-turbo",
     cache={
-        "s-max-age": 600 # only get responses cached within last 10 minutes
+        "s-maxage": 600 # only get responses cached within last 10 minutes
     }
 )
 ```
diff --git a/litellm/caching.py b/litellm/caching.py
index 0b1e18e46..67d57b6e8 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -11,6 +11,7 @@ import litellm
 import time, logging
 import json, traceback, ast, hashlib
 from typing import Optional, Literal, List, Union, Any
+from openai._models import BaseModel as OpenAIObject
 
 
 def print_verbose(print_statement):
@@ -472,7 +473,10 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
-                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
+                cache_control_args = kwargs.get("cache", {})
+                max_age = cache_control_args.get(
+                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
+                )
                 cached_result = self.cache.get_cache(cache_key)
                 # Check if a timestamp was stored with the cached response
                 if (
@@ -529,7 +533,7 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
-                if isinstance(result, litellm.ModelResponse):
+                if isinstance(result, OpenAIObject):
                     result = result.model_dump_json()
 
                 ## Get Cache-Controls ##
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index c894331ba..3b7b1b37c 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -91,7 +91,7 @@ def test_caching_with_cache_controls():
             model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
         )
         response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10}
+            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 10}
         )
         print(f"response1: {response1}")
         print(f"response2: {response2}")
@@ -105,7 +105,7 @@ def test_caching_with_cache_controls():
            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
         )
         response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 5}
         )
         print(f"response1: {response1}")
         print(f"response2: {response2}")
@@ -167,6 +167,8 @@ small text
 def test_embedding_caching():
     import time
 
+    # litellm.set_verbose = True
+
     litellm.cache = Cache()
     text_to_embed = [embedding_large_text]
     start_time = time.time()
@@ -182,7 +184,7 @@ def test_embedding_caching():
         model="text-embedding-ada-002", input=text_to_embed, caching=True
     )
     end_time = time.time()
-    print(f"embedding2: {embedding2}")
+    # print(f"embedding2: {embedding2}")
     print(f"Embedding 2 response time: {end_time - start_time} seconds")
 
     litellm.cache = None
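
For quick reference, a minimal usage sketch of the renamed cache control, mirroring the updated test in litellm/tests/test_caching.py. This is an illustration under assumptions, not part of the patch: the message content and the 600-second window are made up, and OPENAI_API_KEY is assumed to be set.

# Minimal sketch of the s-maxage cache control (mirrors the updated tests; values are illustrative).
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # in-memory cache, as used in the tests

messages = [{"role": "user", "content": "Hello, how are you?"}]

# First call populates the cache.
response1 = completion(model="gpt-3.5-turbo", messages=messages)

# Second call only accepts a cached entry written within the last 10 minutes.
# After this patch, both the old "s-max-age" spelling and "s-maxage" are honored.
response2 = completion(
    model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 600}
)

litellm.cache = None  # disable caching again, as the tests do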