fix(caching.py): support ttl, s-max-age, and no-cache cache controls
https://github.com/BerriAI/litellm/issues/1306
parent: 8772d87947
commit: 8cee267a5b
5 changed files with 182 additions and 76 deletions
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Caching
 Cache LLM Responses
 
@@ -41,7 +44,13 @@ REDIS_<redis-kwarg-name> = ""
 $ litellm --config /path/to/config.yaml
 ```
 
 ## Using Caching - /chat/completions
 
+<Tabs>
+<TabItem value="chat_completions" label="/chat/completions">
+
 Send the same request twice:
 ```shell
 curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +69,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
     "temperature": 0.7
   }'
 ```
+</TabItem>
+<TabItem value="embeddings" label="/embeddings">
 
-## Using Caching - /embeddings
 Send the same request twice:
 ```shell
 curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +88,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
     "input": ["write a litellm poem"]
   }'
 ```
+</TabItem>
+</Tabs>
 
 ## Advanced
 ### Set Cache Params on config.yaml
@@ -103,78 +115,86 @@ litellm_settings:
     supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
 ```
 
-### Cache-Controls on requests
-
-Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
-
-Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
-
-```javascript
-const { OpenAI } = require('openai');
-
-const openai = new OpenAI({
-  apiKey: "sk-1234", // This is the default and can be omitted
-  baseURL: "http://0.0.0.0:8000"
-});
-
-async function main() {
-  const chatCompletion = await openai.chat.completions.create({
-    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'gpt-3.5-turbo',
-  }, {"headers": {
-    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
-  }});
-}
-
-main();
+### Turn on / off caching per request
+
+The proxy supports 3 cache-controls:
+
+- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
+- `s-max-age`: Will only accept cached responses that are within the user-defined range (in seconds).
+- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
+
+**Turn off caching**
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "no-cache": True # will not return a cached response
+    }
+)
 ```
 
-### Override caching per `chat/completions` request
-Caching can be switched on/off per `/chat/completions` request
-- Caching **on** for individual completion - pass `caching=True`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": true
-    }'
-  ```
-- Caching **off** for individual completion - pass `caching=False`:
-  ```shell
-  curl http://0.0.0.0:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "gpt-3.5-turbo",
-      "messages": [{"role": "user", "content": "write a poem about litellm!"}],
-      "temperature": 0.7,
-      "caching": false
-    }'
-  ```
-
-### Override caching per `/embeddings` request
-
-Caching can be switched on/off per `/embeddings` request
-- Caching **on** for embedding - pass `caching=True`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-    --header 'Content-Type: application/json' \
-    --data ' {
-      "model": "text-embedding-ada-002",
-      "input": ["write a litellm poem"],
-      "caching": true
-    }'
-  ```
-- Caching **off** for completion - pass `caching=False`:
-  ```shell
-  curl --location 'http://0.0.0.0:8000/embeddings' \
-    --header 'Content-Type: application/json' \
-    --data ' {
-      "model": "text-embedding-ada-002",
-      "input": ["write a litellm poem"],
-      "caching": false
-    }'
-  ```
+**Turn on caching**
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "ttl": 600 # caches response for 10 minutes
+    }
+)
+```
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    cache={
+        "s-max-age": 600 # only get responses cached within last 10 minutes
+    }
+)
+```
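The cache controls documented above can also be exercised directly through the litellm Python SDK, which is what the test added later in this commit does. A minimal sketch, assuming an API key is configured and an in-memory `Cache()`; the model name and prompt are illustrative:

```python
# Hedged sketch: direct SDK usage of the new cache controls. Mirrors the
# test_caching_with_cache_controls test added in this commit.
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # in-memory cache

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# Store the response and let the cache backend expire it after 10 minutes
r1 = completion(model="gpt-3.5-turbo", messages=messages, cache={"ttl": 600})

# Only accept a cached hit that is at most 10 minutes old
r2 = completion(model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 600})

# Bypass the cache for this call and hit the live endpoint
r3 = completion(model="gpt-3.5-turbo", messages=messages, cache={"no-cache": True})
```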
@@ -342,7 +342,38 @@ class Cache:
             else:
                 cache_key = self.get_cache_key(*args, **kwargs)
             if cache_key is not None:
+                max_age = kwargs.get("cache", {}).get("s-max-age", float("inf"))
                 cached_result = self.cache.get_cache(cache_key)
+                # Check if a timestamp was stored with the cached response
+                if (
+                    cached_result is not None
+                    and isinstance(cached_result, dict)
+                    and "timestamp" in cached_result
+                    and max_age is not None
+                ):
+                    timestamp = cached_result["timestamp"]
+                    current_time = time.time()
+
+                    # Calculate age of the cached response
+                    response_age = current_time - timestamp
+
+                    # Check if the cached response is older than the max-age
+                    if response_age > max_age:
+                        print_verbose(
+                            f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s"
+                        )
+                        return None  # Cached response is too old
+
+                    # If the response is fresh, or there's no max-age requirement, return the cached response
+                    # cached_response is in `b{}` convert it to ModelResponse
+                    cached_response = cached_result.get("response")
+                    try:
+                        cached_response = json.loads(
+                            cached_response
+                        )  # Convert string to dictionary
+                    except:
+                        cached_response = ast.literal_eval(cached_response)
+                    return cached_response
                 return cached_result
         except Exception as e:
             logging.debug(f"An exception occurred: {traceback.format_exc()}")
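The `s-max-age` check in the hunk above reduces to comparing the stored timestamp against the allowed window. A minimal standalone sketch of that rule, assuming a cache entry shaped like the `{"timestamp": ..., "response": ...}` dict that `add_cache` (next hunk) starts writing; this is not litellm's API:

```python
import time

# Hedged sketch of the freshness rule used for "s-max-age".
def is_fresh(cached_entry: dict, max_age: float) -> bool:
    response_age = time.time() - cached_entry["timestamp"]  # seconds since stored
    return response_age <= max_age

entry = {"timestamp": time.time() - 120, "response": '{"choices": []}'}
print(is_fresh(entry, max_age=600))  # True: stored 2 minutes ago, 10 minute window
print(is_fresh(entry, max_age=60))   # False: older than the 60 second window
```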
@@ -367,7 +398,16 @@ class Cache:
             if cache_key is not None:
                 if isinstance(result, litellm.ModelResponse):
                     result = result.model_dump_json()
-                self.cache.set_cache(cache_key, result, **kwargs)
+
+                ## Get Cache-Controls ##
+                if kwargs.get("cache", None) is not None and isinstance(
+                    kwargs.get("cache"), dict
+                ):
+                    for k, v in kwargs.get("cache").items():
+                        if k == "ttl":
+                            kwargs["ttl"] = v
+                cached_data = {"timestamp": time.time(), "response": result}
+                self.cache.set_cache(cache_key, cached_data, **kwargs)
         except Exception as e:
             print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
             traceback.print_exc()
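The write path above does two things: it copies a per-request `ttl` into the kwargs passed on to the cache backend, and it wraps the serialized response with a timestamp so the read path can enforce `s-max-age`. A hedged sketch of that shape; `build_cached_data` is an illustrative helper, not a litellm function:

```python
import time

# Hedged sketch of the storage shape written by the diff above.
def build_cached_data(result: str, cache_controls: dict, set_cache_kwargs: dict) -> dict:
    if isinstance(cache_controls, dict) and "ttl" in cache_controls:
        # forwarded so a backend such as Redis can expire the key itself
        set_cache_kwargs["ttl"] = cache_controls["ttl"]
    return {"timestamp": time.time(), "response": result}

set_cache_kwargs = {}
cached_data = build_cached_data('{"choices": []}', {"ttl": 600}, set_cache_kwargs)
print(set_cache_kwargs)            # {'ttl': 600}
print(sorted(cached_data.keys()))  # ['response', 'timestamp']
```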
@@ -468,6 +468,7 @@ def completion(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -2209,6 +2210,7 @@ def embedding(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -2904,6 +2906,7 @@ def image_generation(
         "preset_cache_key",
         "caching_groups",
         "ttl",
+        "cache",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
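The same one-line addition appears in `completion`, `embedding`, and `image_generation`: `"cache"` joins the list of litellm-internal params, which keeps it out of the kwargs forwarded to the model provider. A hedged sketch of that filtering idea; the list shown is a small illustrative subset:

```python
# Hedged sketch of why "cache" is added to litellm_params: anything in that
# list is treated as litellm-internal and excluded from non_default_params,
# so it is never sent to the upstream provider.
litellm_params = ["caching", "ttl", "cache"]  # illustrative subset

request_kwargs = {"temperature": 0.7, "cache": {"no-cache": True}}
non_default_params = {
    k: v for k, v in request_kwargs.items() if k not in litellm_params
}
print(non_default_params)  # {'temperature': 0.7}
```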
@@ -1,4 +1,4 @@
-import sys, os
+import sys, os, uuid
 import time
 import traceback
 from dotenv import load_dotenv
@@ -81,6 +81,46 @@ def test_caching_with_ttl():
         pytest.fail(f"Error occurred: {e}")
 
 
+def test_caching_with_cache_controls():
+    try:
+        litellm.set_verbose = True
+        litellm.cache = Cache()
+        messages = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
+        ## TTL = 0
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 10}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            != response1["choices"][0]["message"]["content"]
+        )
+        messages = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
+        ## TTL = 5
+        response1 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
+        )
+        response2 = completion(
+            model="gpt-3.5-turbo", messages=messages, cache={"s-max-age": 5}
+        )
+        print(f"response1: {response1}")
+        print(f"response2: {response2}")
+        assert (
+            response2["choices"][0]["message"]["content"]
+            == response1["choices"][0]["message"]["content"]
+        )
+    except Exception as e:
+        print(f"error occurred: {traceback.format_exc()}")
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_caching_with_cache_controls()
+
+
 def test_caching_with_models_v2():
     messages = [
         {"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}
@@ -1971,12 +1971,12 @@ def client(original_function):
             print_verbose(
                 f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
             )
-            # if caching is false, don't run this
+            # if caching is false or cache["no-cache"]==True, don't run this
             if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or kwargs.get("cache", {}).get("no-cache", False) != True
+            ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose(f"INSIDE CHECKING CACHE")
                 if (
@@ -2148,10 +2148,13 @@ def client(original_function):
             )
             # if caching is false, don't run this
             if (
-                kwargs.get("caching", None) is None and litellm.cache is not None
-            ) or kwargs.get(
-                "caching", False
-            ) == True:  # allow users to control returning cached responses from the completion function
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
+                or kwargs.get("caching", False) == True
+                or (
+                    kwargs.get("cache", None) is not None
+                    and kwargs.get("cache").get("no-cache", False) != True
+                )
+            ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose(f"INSIDE CHECKING CACHE")
                 if (
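Both hunks adjust the same gate: the cache lookup runs when caching is enabled globally or per request, and a request can opt out with `cache={"no-cache": True}`. A hedged sketch of the intended behavior described by the updated comment, condensed rather than copying either hunk's exact expression:

```python
# Hedged sketch of the gating behavior: "if caching is false or
# cache['no-cache']==True, don't run this".
def should_check_cache(kwargs: dict, cache_enabled_globally: bool) -> bool:
    cache_controls = kwargs.get("cache") or {}
    if cache_controls.get("no-cache", False) is True:
        return False  # explicit per-request opt-out
    if kwargs.get("caching", None) is None:
        return cache_enabled_globally
    return kwargs.get("caching") is True

print(should_check_cache({}, cache_enabled_globally=True))              # True
print(should_check_cache({"cache": {"no-cache": True}}, True))          # False
print(should_check_cache({"caching": False}, True))                     # False
```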