fix(caching.py): support ttl, s-max-age, and no-cache cache controls

https://github.com/BerriAI/litellm/issues/1306
Krrish Dholakia 2024-01-03 12:42:30 +05:30
parent 8772d87947
commit 8cee267a5b
5 changed files with 182 additions and 76 deletions
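
For orientation, a minimal sketch of how the three new controls are passed from the caller's side, mirroring the docs and tests changed below (assumes `litellm.cache` is set to an in-memory `Cache()` and `OPENAI_API_KEY` is configured; this is illustration, not part of the diff):

```python
# Sketch: exercising the three cache controls this commit adds.
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # default in-memory cache

msgs = [{"role": "user", "content": "Hey, how's it going?"}]

# ttl: store this response in the cache for 600 seconds
completion(model="gpt-3.5-turbo", messages=msgs, cache={"ttl": 600})

# s-max-age: only accept a cached response that is at most 600 seconds old
completion(model="gpt-3.5-turbo", messages=msgs, cache={"s-max-age": 600})

# no-cache: skip the cache lookup and always call the model
completion(model="gpt-3.5-turbo", messages=msgs, cache={"no-cache": True})
```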


@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 # Caching
 Cache LLM Responses
@@ -41,7 +44,13 @@ REDIS_<redis-kwarg-name> = ""
 $ litellm --config /path/to/config.yaml
 ```
 ## Using Caching - /chat/completions
+<Tabs>
+<TabItem value="chat_completions" label="/chat/completions">
 Send the same request twice:
 ```shell
 curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +69,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
     "temperature": 0.7
   }'
 ```
-## Using Caching - /embeddings
+</TabItem>
+<TabItem value="embeddings" label="/embeddings">
 Send the same request twice:
 ```shell
 curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +88,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
     "input": ["write a litellm poem"]
   }'
 ```
+</TabItem>
+</Tabs>
 ## Advanced
 ### Set Cache Params on config.yaml
@@ -103,78 +115,86 @@ litellm_settings:
     supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
 ```
-### Cache-Controls on requests
-Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
-Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
-```javascript
-const { OpenAI } = require('openai');
-const openai = new OpenAI({
-  apiKey: "sk-1234", // This is the default and can be omitted
-  baseURL: "http://0.0.0.0:8000"
-});
-async function main() {
-  const chatCompletion = await openai.chat.completions.create({
-    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'gpt-3.5-turbo',
-  }, {"headers": {
-    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
-  }});
-}
-main();
-```
-### Override caching per `chat/completions` request
-Caching can be switched on/off per `/chat/completions` request
-- Caching **on** for individual completion - pass `caching=True`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": true
-    }'
-  ```
-- Caching **off** for individual completion - pass `caching=False`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": false
-    }'
-  ```
-### Override caching per `/embeddings` request
-Caching can be switched on/off per `/embeddings` request
-- Caching **on** for embedding - pass `caching=True`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-    --header 'Content-Type: application/json' \
-    --data ' {
-      "model": "text-embedding-ada-002",
-      "input": ["write a litellm poem"],
-      "caching": true
-    }'
-  ```
-- Caching **off** for completion - pass `caching=False`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-    --header 'Content-Type: application/json' \
-    --data ' {
-      "model": "text-embedding-ada-002",
-      "input": ["write a litellm poem"],
-      "caching": false
-    }'
-  ```
+### Turn on / off caching per request
+The proxy supports 3 cache-controls:
+- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
+- `s-max-age`: Will only accept cached responses that are within the user-defined range (in seconds).
+- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
+**Turn off caching**
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "no-cache": True  # will not return a cached response
+    }
+)
+```
+**Turn on caching**
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "ttl": 600  # caches response for 10 minutes
+    }
+)
+```
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "s-max-age": 600  # only get responses cached within the last 10 minutes
+    }
+)
+```


@@ -342,7 +342,38 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
+                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
                 cached_result = self.cache.get_cache(cache_key)
+                # Check if a timestamp was stored with the cached response
+                if (
+                    cached_result is not None
+                    and isinstance(cached_result, dict)
+                    and "timestamp" in cached_result
+                    and max_age is not None
+                ):
+                    timestamp = cached_result["timestamp"]
+                    current_time = time.time()
+                    # Calculate age of the cached response
+                    response_age = current_time - timestamp
+                    # Check if the cached response is older than the max-age
+                    if response_age > max_age:
+                        print_verbose(
+                            f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s"
+                        )
+                        return None  # Cached response is too old
+                    # If the response is fresh, or there's no max-age requirement, return the cached response
+                    # cached_response is stored as a serialized string; convert it back to a dict
+                    cached_response = cached_result.get("response")
+                    try:
+                        cached_response = json.loads(
+                            cached_response
+                        )  # Convert string to dictionary
+                    except:
+                        cached_response = ast.literal_eval(cached_response)
+                    return cached_response
                 return cached_result
         except Exception as e:
             logging.debug(f"An exception occurred: {traceback.format_exc()}")
@@ -367,7 +398,16 @@ class Cache:
             if cache_key is not None:
                 if isinstance(result, litellm.ModelResponse):
                     result = result.model_dump_json()
-                self.cache.set_cache(cache_key, result, **kwargs)
+                ## Get Cache-Controls ##
+                if kwargs.get("cache", None) is not None and isinstance(
+                    kwargs.get("cache"), dict
+                ):
+                    for k, v in kwargs.get("cache").items():
+                        if k == "ttl":
+                            kwargs["ttl"] = v
+
+                cached_data = {"timestamp": time.time(), "response": result}
+                self.cache.set_cache(cache_key, cached_data, **kwargs)
         except Exception as e:
             print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
             traceback.print_exc()
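
The two hunks above boil down to a small pattern: `add_cache` now stores a timestamp alongside the serialized response, and `get_cache` rejects entries older than the caller's `s-max-age`. A self-contained sketch of that pattern (plain dict store and simplified names, not litellm's actual API):

```python
import json
import time

_store: dict = {}  # stand-in for the underlying cache backend

def add_cache(key: str, response: dict) -> None:
    # store the write time next to the serialized response
    _store[key] = {"timestamp": time.time(), "response": json.dumps(response)}

def get_cache(key: str, max_age: float = float("inf")):
    entry = _store.get(key)
    if entry is None:
        return None
    if time.time() - entry["timestamp"] > max_age:
        return None  # cached response is older than the requested s-max-age
    return json.loads(entry["response"])
```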


@@ -468,6 +468,7 @@ def completion(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -2209,6 +2210,7 @@ def embedding(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -2904,6 +2906,7 @@ def image_generation(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
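
These three hunks register `cache` as a litellm-internal parameter. The context lines show the split the lists feed: `default_params = openai_params + litellm_params`, and anything outside it lands in `non_default_params`, so listing `cache` keeps the new kwarg on the litellm side of that split. A rough illustration with abridged, hypothetical lists:

```python
# Abridged illustration of the param split visible in the context lines above.
openai_params = ["model", "messages", "temperature"]  # hypothetical subset
litellm_params = ["caching", "ttl", "cache"]          # "cache" is the new entry

default_params = openai_params + litellm_params

def non_default_params(kwargs: dict) -> dict:
    # everything not recognized here is collected separately
    return {k: v for k, v in kwargs.items() if k not in default_params}

print(non_default_params({"cache": {"ttl": 600}, "temperature": 0.7}))  # -> {}
```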


@@ -1,4 +1,4 @@
-import sys, os
+import sys, os, uuid
 import time
 import traceback
 from dotenv import load_dotenv
@@ -81,6 +81,46 @@ def test_caching_with_ttl():
         pytest.fail(f"Error occurred: {e}")
+
+
+def test_caching_with_cache_controls():
+    try:
+        litellm.set_verbose = True
+        litellm.cache = Cache()
+        messages = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
+        ## TTL = 0
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            != response1["choices"][0]["message"]["content"]
+        )
+        messages = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
+        ## TTL = 5
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            == response1["choices"][0]["message"]["content"]
+        )
+    except Exception as e:
+        print(f"error occurred: {traceback.format_exc()}")
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_caching_with_cache_controls()
+
 def test_caching_with_models_v2():
     messages = [
         {"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}


@@ -1971,12 +1971,12 @@ def client(original_function):
             print_verbose(
                 f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
             )
-            # if caching is false, don't run this
+            # if caching is false or cache["no-cache"]==True, don't run this
             if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or kwargs.get("cache", {}).get("no-cache", False) != True
+            ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose(f"INSIDE CHECKING CACHE")
                 if (
@@ -2148,10 +2148,13 @@ def client(original_function):
             )
             # if caching is false, don't run this
             if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (
+                    kwargs.get("cache", None) is not None
+                    and kwargs.get("cache").get("no-cache", False) != True
+                )
+            ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose(f"INSIDE CHECKING CACHE")
                 if (
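
Both hunks adjust the same read-side gate: the cache is consulted when caching is enabled (explicitly via `caching=True`, or implicitly because `litellm.cache` is set) and the caller has not sent `cache={"no-cache": True}`. A simplified sketch of that intent, following the comment in the first hunk rather than the exact boolean layout of the diff:

```python
# Simplified reading of the gate, per the comment
# "if caching is false or cache['no-cache']==True, don't run this".
def should_check_cache(kwargs: dict, global_cache) -> bool:
    if (kwargs.get("cache") or {}).get("no-cache", False) is True:
        return False  # caller opted out of cached responses for this request
    caching_enabled = (
        (kwargs.get("caching") is None and global_cache is not None)
        or kwargs.get("caching", False) is True
    )
    return caching_enabled
```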