(ci/cd) add all docs changes to this branch

This commit is contained in:
ishaan-jaff 2024-01-06 15:01:23 +05:30
parent cd98d256b5
commit 9bb2c13119
13 changed files with 427 additions and 122 deletions

View file

@@ -1,18 +1,29 @@
# Redis Cache
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
[**See Code**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/caching.py#L71)
# Caching - In-Memory, Redis, s3
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
## Initialize Cache - In Memory, Redis, s3 Bucket
<Tabs>
<TabItem value="redis" label="redis-cache">
### Pre-requisites
Install redis
```shell
pip install redis
```
For the hosted version, you can set up your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)
# Make completion calls
@@ -28,6 +39,78 @@ response2 = completion(
# response1 == response2, response 1 is cached
```
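Caching isn't limited to `completion()`; embedding calls are cached too, since `embedding`/`aembedding` are in the default `supported_call_types`. A minimal sketch, assuming the same Redis setup as above (host, port, and password below are placeholders):
```python
import litellm
from litellm.caching import Cache

# placeholder Redis credentials
litellm.cache = Cache(type="redis", host="localhost", port="6379", password="my-password")

# identical embedding calls hit the cache, same as completion calls
embedding1 = litellm.embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
embedding2 = litellm.embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
# embedding2 is served from the cache
```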
</TabItem>
<TabItem value="s3" label="s3-cache">
Install boto3
```shell
pip install boto3
```
Set AWS environment variables
```shell
export AWS_ACCESS_KEY_ID="AKI*******"
export AWS_SECRET_ACCESS_KEY="WOl*****"
```
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
# pass s3-bucket name
litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")
# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)
# response1 == response2, response 1 is cached
```
</TabItem>
<TabItem value="in-mem" label="in memory cache">
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache()
# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)
# response1 == response2, response 1 is cached
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache
Use the context manager to easily enable, disable & update the litellm cache
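A minimal sketch of how this usually looks, assuming the `litellm.enable_cache()`, `litellm.disable_cache()` and `litellm.update_cache()` helpers this section refers to (Redis credentials below are placeholders):
```python
import litellm
from litellm import completion

# enable an in-memory cache (pass type="redis" + credentials for Redis)
litellm.enable_cache()

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)

# point the existing cache at Redis instead (placeholder credentials)
litellm.update_cache(type="redis", host="localhost", port="6379", password="my-password")

# turn caching off again
litellm.disable_cache()
```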
@@ -103,35 +186,34 @@ litellm.cache = cache # set litellm.cache to your cache
## Cache Initialization Parameters

```python
def __init__(
    self,
    type: Optional[Literal["local", "redis", "s3"]] = "local",
    supported_call_types: Optional[
        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
    ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.
    # redis cache params
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
    # s3 Bucket, boto3 configuration
    s3_bucket_name: Optional[str] = None,
    s3_region_name: Optional[str] = None,
    s3_api_version: Optional[str] = None,
    s3_use_ssl: Optional[bool] = True,
    s3_verify: Optional[Union[bool, str]] = None,
    s3_endpoint_url: Optional[str] = None,
    s3_aws_access_key_id: Optional[str] = None,
    s3_aws_secret_access_key: Optional[str] = None,
    s3_aws_session_token: Optional[str] = None,
    s3_config: Optional[Any] = None,
    **kwargs,
):
```

#### `type` (str, optional)
The type of cache to initialize. One of "local", "redis", or "s3". Defaults to "local".
#### `host` (str, optional)
The host address for the Redis cache. Required if `type` is set to "redis".
#### `port` (int, optional)
The port number for the Redis cache. Required if `type` is set to "redis".
#### `password` (str, optional)
The password for the Redis cache. Required if `type` is set to "redis".
#### `supported_call_types` (list, optional)
A list of call types to cache for. Defaults to caching for all call types. The available call types are:
- "completion"
- "acompletion"
- "embedding"
- "aembedding"
#### `**kwargs` (additional keyword arguments)
Additional keyword arguments are passed to the `redis.Redis()` constructor when initializing the Redis cache, letting you fine-tune the Redis configuration for your specific needs.
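For example, a Redis-backed cache that only caches chat calls and forwards extra connection options through `**kwargs`; a sketch with placeholder credentials (`ssl` and `socket_timeout` are standard `redis.Redis()` kwargs):
```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis",
    host="localhost",        # placeholder
    port="6379",             # placeholder
    password="my-password",  # placeholder
    supported_call_types=["completion", "acompletion"],  # only cache chat calls
    ssl=True,                # forwarded to redis.Redis()
    socket_timeout=5,        # forwarded to redis.Redis()
)
```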
## Logging

View file

@@ -396,7 +396,48 @@ response = completion(
)
```
## OpenAI Proxy
Track spend across multiple projects/people
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
### Quick Start Proxy - CLI
```shell
pip install 'litellm[proxy]'
```
#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])
print(response)
```
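Since the proxy exposes an OpenAI-compatible endpoint, streaming should also work through the same client; a small sketch (prompt and model are placeholders):
```python
import openai  # openai v1.0.0+

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")  # set proxy to base_url

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a short poem"}],
    stream=True,  # stream tokens as they arrive
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```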
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)

View file

@@ -24,6 +24,7 @@ model_list:
general_settings:
  alerting: ["slack"]
  alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+

environment_variables:
  SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"

View file

@@ -1,8 +1,21 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching
Cache LLM Responses
## Quick Start
LiteLLM supports:
- In Memory Cache
- Redis Cache
- s3 Bucket Cache
## Quick Start - Redis, s3 Cache
<Tabs>
<TabItem value="redis" label="redis cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
@@ -40,8 +53,45 @@ REDIS_<redis-kwarg-name> = ""
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="s3" label="s3 cache">
### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
  - model_name: text-embedding-ada-002
    litellm_params:
      model: text-embedding-ada-002

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True
  cache_params:        # set cache params for s3
    type: s3
    s3_bucket_name: cache-bucket-litellm   # AWS Bucket Name for S3
    s3_region_name: us-west-2              # AWS Region Name for S3
    s3_aws_access_key_id: your_access_key  # AWS Access Key ID for S3
    s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
    s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, e.g. for Backblaze/Cloudflare S3-compatible buckets
```
### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>
## Using Caching - /chat/completions
<Tabs>
<TabItem value="chat_completions" label="/chat/completions">
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +110,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
```
</TabItem>
<TabItem value="embeddings" label="/embeddings">
## Using Caching - /embeddings
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +129,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
"input": ["write a litellm poem"]
}'
```
</TabItem>
</Tabs>
## Advanced
### Set Cache Params on config.yaml
@@ -103,78 +156,121 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```
### Cache-Controls on requests
Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
```javascript
const { OpenAI } = require('openai');

const openai = new OpenAI({
  apiKey: "sk-1234", // This is the default and can be omitted
  baseURL: "http://0.0.0.0:8000"
});

async function main() {
  const chatCompletion = await openai.chat.completions.create({
    messages: [{ role: 'user', content: 'Say this is a test' }],
    model: 'gpt-3.5-turbo',
  }, {"headers": {
    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
  }});
}

main();
```
### Turn on / off caching per request.
The proxy supports 3 cache-controls:
- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: Will not return a cached response, but instead call the actual endpoint.

[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)

**Turn off caching**
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "no-cache": True # will not return a cached response
    }
)
```
**Turn on caching**
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "ttl": 600 # caches response for 10 minutes
    }
)
```
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "s-maxage": 600 # only get responses cached within last 10 minutes
    }
)
```
### Override caching per `chat/completions` request
Caching can be switched on/off per `/chat/completions` request
- Caching **on** for individual completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": true
  }'
```
- Caching **off** for individual completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": false
  }'
```
### Override caching per `/embeddings` request
Caching can be switched on/off per `/embeddings` request
- Caching **on** for embedding - pass `caching=True`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": true
  }'
```
- Caching **off** for embedding - pass `caching=False`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": false
  }'
```
## Supported `cache_params`
```yaml
cache_params:
  # Type of cache (options: "local", "redis", "s3")
  type: s3

  # List of litellm call types to cache for
  # Options: "completion", "acompletion", "embedding", "aembedding"
  supported_call_types:
    - completion
    - acompletion
    - embedding
    - aembedding

  # Redis cache parameters
  host: localhost # Redis server hostname or IP address
  port: "6379" # Redis server port (as a string)
  password: secret_password # Redis server password

  # S3 cache parameters
  s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket
  s3_region_name: us-west-2 # AWS region of the S3 bucket
  s3_api_version: "2006-03-01" # AWS S3 API version
  s3_use_ssl: true # Use SSL for S3 connections (options: true, false)
  s3_verify: true # SSL certificate verification for S3 connections (options: true, false)
  s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL
  s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3
  s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
  s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials
```
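These `cache_params` mirror the `litellm.caching.Cache` constructor shown in the SDK caching docs; for reference, a sketch of the equivalent programmatic initialization (bucket name and credentials are placeholders):
```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="s3",
    s3_bucket_name="your_s3_bucket_name",        # placeholder
    s3_region_name="us-west-2",
    s3_endpoint_url="https://s3.amazonaws.com",
    s3_aws_access_key_id="your_access_key",      # placeholder
    s3_aws_secret_access_key="your_secret_key",  # placeholder
)
```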

View file

@@ -251,7 +251,7 @@ s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for h
1. Install Proxy dependencies
```bash
$ pip install litellm[proxy] litellm[extra_proxy]
$ pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment

View file

@@ -19,12 +19,6 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install litellm[proxy]
```
If this fails, try running
```shell
$ pip install 'litellm[proxy]'
```
@@ -190,6 +184,13 @@ $ export OPENAI_API_KEY=my-api-key
```shell
$ litellm --model gpt-3.5-turbo
```
</TabItem>
<TabItem value="ollama" label="Ollama">
```shell
$ litellm --model ollama/<ollama-model-name>
```
</TabItem>
<TabItem value="openai-proxy" label="OpenAI Compatible Endpoint">

View file

@@ -0,0 +1,43 @@
# Post-Call Rules
Use this to fail a request based on the output of an LLM API call.
## Quick Start
### Step 1: Create a file (e.g. post_call_rules.py)
```python
def my_custom_rule(input): # receives the model response
    if len(input) < 5: # trigger fallback if the model response is too short
        return False
    return True
```
### Step 2. Point it to your proxy
```yaml
litellm_settings:
  post_call_rules: post_call_rules.my_custom_rule
  num_retries: 3
```
### Step 3. Start + test your proxy
```bash
$ litellm --config /path/to/config.yaml
```
```bash
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "deepseek-coder",
"messages": [{"role":"user","content":"What llm are you?"}],
"temperature": 0.7,
"max_tokens": 10,
}'
```
---
This will now check that the response is at least 5 characters long; if the check fails, the call is retried 3 times (`num_retries: 3`) before failing.
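Rules are plain Python functions, so any check on the response text works. A second, hypothetical rule using the same string-in/boolean-out interface as Step 1:
```python
# post_call_rules.py (hypothetical additional rule)
def no_refusals(input): # receives the model response text
    refusal_markers = ["I'm sorry", "I cannot help"]
    if any(marker in input for marker in refusal_markers):
        return False # fail the request; the proxy retries (num_retries) before erroring
    return True
```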

View file

@@ -13,7 +13,7 @@ Docs outdated. New docs 👉 [here](./simple_proxy)
## Usage
```shell
pip install litellm[proxy]
pip install 'litellm[proxy]'
```
```shell
$ litellm --model ollama/codellama

View file

@@ -40,7 +40,7 @@ litellm.get_secret("your-test-key")
1. Install Proxy dependencies
```bash
pip install litellm[proxy] litellm[extra_proxy]
pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment

View file

@@ -16,7 +16,7 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install litellm[proxy]
$ pip install 'litellm[proxy]'
```
```shell

View file

@@ -112,6 +112,7 @@ const sidebars = {
"proxy/reliability",
"proxy/health",
"proxy/call_hooks",
"proxy/rules",
"proxy/caching",
"proxy/alerting",
"proxy/logging",
@@ -167,20 +168,7 @@ const sidebars = {
`observability/telemetry`,
],
},
{
type: "category",
label: "Caching",
link: {
type: 'generated-index',
title: 'Providers',
description: 'Learn how to deploy + call models from different providers on LiteLLM',
slug: '/caching',
},
items: [
"caching/local_caching",
"caching/redis_cache",
],
},
"caching/redis_cache",
{
type: "category",
label: "LangChain, LlamaIndex Integration",

View file

@@ -375,6 +375,45 @@ response = completion(
Need a dedicated key? Email us @ krrish@berri.ai
## OpenAI Proxy
Track spend across multiple projects/people
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
### Quick Start Proxy - CLI
```shell
pip install 'litellm[proxy]'
```
#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])
print(response)
```
## More details
* [exception mapping](./exception_mapping.md)

View file

@@ -111,6 +111,13 @@
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-ada-002-v2": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"256-x-256/dall-e-2": {
"mode": "image_generation",
"input_cost_per_pixel": 0.00000024414,
@@ -242,6 +249,13 @@
"litellm_provider": "azure",
"mode": "chat"
},
"azure/ada": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "azure",
"mode": "embedding"
},
"azure/text-embedding-ada-002": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
@@ -1630,4 +1644,4 @@
"mode": "embedding"
}
}
}
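For reference, a sketch of how entries like these are typically consumed, assuming litellm exposes this map as `litellm.model_cost` and prices tokens via `cost_per_token()`:
```python
import litellm

# look up the per-token prices for the entry added above
entry = litellm.model_cost.get("azure/ada", {})
print(entry.get("input_cost_per_token"))  # 1e-07

# cost_per_token() multiplies token counts by these per-token prices
prompt_cost, completion_cost = litellm.cost_per_token(
    model="azure/ada", prompt_tokens=1000, completion_tokens=0
)
print(prompt_cost)  # ~0.0001 at the rate above
```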