# LiteLLM - Caching

## Caching `completion()` and `embedding()` calls when switched on

LiteLLM implements exact-match caching and supports the following caching backends (see the configuration sketch below the list):
* In-Memory Caching [Default]
* Redis Caching (Hosted)
* GPTCache
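
The backend is chosen when you construct the cache object. A minimal sketch, assuming the `Cache` class used throughout this page (the in-memory cache is the default; a GPTCache example is not shown on this page):

```python
import litellm
from litellm.caching import Cache

# In-Memory cache (the default backend)
litellm.cache = Cache()

# Hosted Redis cache - supply your own connection details
# (the host/port/password values are placeholders)
# litellm.cache = Cache(type="redis", host="<host>", port="<port>", password="<password>")
```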

## Quick Start Usage - Completion

Enable caching by setting `litellm.cache` to a `Cache` object. Cache keys are based on the request, including the `model`; the identical calls in the following example will lead to a cache hit.
```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()

# Make completion calls
response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])

# response1 == response2, response 1 is cached
```
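
Since the caching is exact match, changing any part of the request, such as the prompt, results in a cache miss. A minimal sketch continuing the setup above (the prompt text is only illustrative):

```python
from litellm import completion

# Different prompt -> no exact match in the cache, so a fresh request is sent
response3 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a different joke."}],
)
```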

## Using Redis Cache with LiteLLM

### Pre-requisites

Install Redis:

```
pip install redis
```

For the hosted version, you can set up your own Redis DB here: https://app.redislabs.com/

### Usage
```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)

# Make completion calls
response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])

# response1 == response2, response 1 is cached
```
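
To avoid hard-coding credentials, you can read the Redis connection details from the environment. A minimal sketch, assuming environment variables named `REDIS_HOST`, `REDIS_PORT`, and `REDIS_PASSWORD` (these names are illustrative, not required by LiteLLM):

```python
import os

import litellm
from litellm.caching import Cache

# Variable names below are illustrative - use whatever your deployment provides
litellm.cache = Cache(
    type="redis",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
)
```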

## Caching with Streaming

LiteLLM can cache your streamed responses for you.

### Usage
```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()

# Make completion calls
response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}], stream=True)
for chunk in response1:
    print(chunk)

response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}], stream=True)
for chunk in response2:
    print(chunk)
```
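
As with the embedding example below, one way to see the cache working is to time the two streams rather than inspect the chunks. A minimal sketch, assuming the same in-memory cache configured above:

```python
import time
from litellm import completion

messages = [{"role": "user", "content": "Tell me a joke."}]

start = time.time()
for chunk in completion(model="gpt-3.5-turbo", messages=messages, stream=True):
    pass  # consume the first stream (hits the API)
print(f"First stream: {time.time() - start:.2f} seconds")

start = time.time()
for chunk in completion(model="gpt-3.5-turbo", messages=messages, stream=True):
    pass  # the identical request is replayed from the cache
print(f"Second stream: {time.time() - start:.2f} seconds")
```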

## Usage - Embedding()

Enable caching by setting `litellm.cache` to a `Cache` object. Cache keys are based on the request, including the `model`; the identical calls in the following example will lead to a cache hit.
```python
import time
import litellm
from litellm import embedding
from litellm.caching import Cache

litellm.cache = Cache()

start_time = time.time()
embedding1 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5])
end_time = time.time()
print(f"Embedding 1 response time: {end_time - start_time} seconds")

start_time = time.time()
embedding2 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5])
end_time = time.time()
print(f"Embedding 2 response time: {end_time - start_time} seconds")
```
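
To confirm that the second call was served from the cache, you can compare the returned vectors as well as the timings. A minimal sketch, assuming the embedding responses follow the OpenAI-style shape (`data[0]["embedding"]`):

```python
# Assumes OpenAI-style embedding responses; a cache hit returns the same vector
vector1 = embedding1["data"][0]["embedding"]
vector2 = embedding2["data"][0]["embedding"]
print(vector1 == vector2)  # True when embedding2 came from the cache
```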