diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md
index fc799bcc39..3d70c5e3db 100644
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@@ -1,18 +1,29 @@
-# Redis Cache
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
-[**See Code**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/caching.py#L71)
+# Caching - In-Memory, Redis, s3
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
+
+## Initialize Cache - In Memory, Redis, s3 Bucket
+
+<Tabs>
+
+<TabItem value="redis" label="redis-cache">
+
-### Pre-requisites
Install redis
```shell
pip install redis
```
+
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
+
litellm.cache = Cache(type="redis", host=, port=, password=)
# Make completion calls
@@ -28,6 +39,78 @@ response2 = completion(
# response1 == response2, response 1 is cached
```
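+
+Async calls go through the same cache. A minimal sketch using `acompletion`, assuming the Redis cache configured above is set on `litellm.cache`:
+
+```python
+import asyncio
+from litellm import acompletion
+
+async def main():
+    # both calls share the Redis cache set on litellm.cache above;
+    # the second call is served from the cache
+    response1 = await acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Tell me a joke."}]
+    )
+    response2 = await acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Tell me a joke."}]
+    )
+    return response1, response2
+
+asyncio.run(main())
+```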
+
+</TabItem>
+
+<TabItem value="s3" label="s3-cache">
+
+Install boto3
+```shell
+pip install boto3
+```
+
+Set AWS environment variables
+
+```shell
+export AWS_ACCESS_KEY_ID="AKI*******"
+export AWS_SECRET_ACCESS_KEY="WOl*****"
+```
+### Quick Start
+```python
+import litellm
+from litellm import completion
+from litellm.caching import Cache
+
+# pass s3-bucket name
+litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")
+
+# Make completion calls
+response1 = completion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Tell me a joke."}]
+)
+response2 = completion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Tell me a joke."}]
+)
+
+# response1 == response2, response 1 is cached
+```
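+
+Embedding calls are cached as well (the default `supported_call_types` include `embedding`/`aembedding`). A small sketch, assuming the same s3 cache configured above:
+
+```python
+from litellm import embedding
+
+# both calls use the s3 cache configured above; the second is served from cache
+response1 = embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
+response2 = embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
+```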
+
+</TabItem>
+
+<TabItem value="in-mem" label="in memory cache">
+
+
+### Quick Start
+
+```python
+import litellm
+from litellm import completion
+from litellm.caching import Cache
+litellm.cache = Cache()
+
+# Make completion calls
+response1 = completion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Tell me a joke."}],
+ caching=True
+)
+response2 = completion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Tell me a joke."}],
+ caching=True
+)
+
+# response1 == response2, response 1 is cached
+
+```
+
+</TabItem>
+
+</Tabs>
+
+
## Cache Context Manager - Enable, Disable, Update Cache
Use the context manager for easily enabling, disabling & updating the litellm cache
@@ -103,35 +186,34 @@ litellm.cache = cache # set litellm.cache to your cache
## Cache Initialization Parameters
-#### `type` (str, optional)
+```python
+def __init__(
+ self,
+ type: Optional[Literal["local", "redis", "s3"]] = "local",
+ supported_call_types: Optional[
+ List[Literal["completion", "acompletion", "embedding", "aembedding"]]
+ ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.
+
+ # redis cache params
+ host: Optional[str] = None,
+ port: Optional[str] = None,
+ password: Optional[str] = None,
-The type of cache to initialize. It can be either "local" or "redis". Defaults to "local".
-
-#### `host` (str, optional)
-
-The host address for the Redis cache. This parameter is required if the `type` is set to "redis".
-
-#### `port` (int, optional)
-
-The port number for the Redis cache. This parameter is required if the `type` is set to "redis".
-
-#### `password` (str, optional)
-
-The password for the Redis cache. This parameter is required if the `type` is set to "redis".
-
-#### `supported_call_types` (list, optional)
-
-A list of call types to cache for. Defaults to caching for all call types. The available call types are:
-
-- "completion"
-- "acompletion"
-- "embedding"
-- "aembedding"
-
-#### `**kwargs` (additional keyword arguments)
-
-Additional keyword arguments are accepted for the initialization of the Redis cache using the `redis.Redis()` constructor. These arguments allow you to fine-tune the Redis cache configuration based on your specific needs.
+ # s3 Bucket, boto3 configuration
+ s3_bucket_name: Optional[str] = None,
+ s3_region_name: Optional[str] = None,
+ s3_api_version: Optional[str] = None,
+ s3_use_ssl: Optional[bool] = True,
+ s3_verify: Optional[Union[bool, str]] = None,
+ s3_endpoint_url: Optional[str] = None,
+ s3_aws_access_key_id: Optional[str] = None,
+ s3_aws_secret_access_key: Optional[str] = None,
+ s3_aws_session_token: Optional[str] = None,
+ s3_config: Optional[Any] = None,
+ **kwargs,
+):
+```
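+
+For example, a minimal sketch of a Redis cache that only caches `completion`/`acompletion` calls (host, port and password are placeholders):
+
+```python
+import litellm
+from litellm.caching import Cache
+
+litellm.cache = Cache(
+    type="redis",
+    host="localhost",        # placeholder - your Redis host
+    port="6379",             # placeholder - your Redis port, passed as a string
+    password="my-password",  # placeholder - your Redis password
+    supported_call_types=["completion", "acompletion"],  # skip caching for embedding calls
+)
+```
+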
## Logging
diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md
index f2329be1e6..f6bd7bc42b 100644
--- a/docs/my-website/docs/index.md
+++ b/docs/my-website/docs/index.md
@@ -396,7 +396,48 @@ response = completion(
)
```
+## OpenAI Proxy
+
+Track spend across multiple projects/people
+
+The proxy provides:
+1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
+2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
+3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
+4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
+
+### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
+
+### Quick Start Proxy - CLI
+
+```shell
+pip install 'litellm[proxy]'
+```
+
+#### Step 1: Start litellm proxy
+```shell
+$ litellm --model huggingface/bigcode/starcoder
+
+#INFO: Proxy running on http://0.0.0.0:8000
+```
+
+#### Step 2: Make ChatCompletions Request to Proxy
+```python
+import openai # openai v1.0.0+
+client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
+ {
+ "role": "user",
+ "content": "this is a test request, write a short poem"
+ }
+])
+
+print(response)
+```
+
+
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
-* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
\ No newline at end of file
+* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md
index a4a2ab6c8e..699f6d6ffa 100644
--- a/docs/my-website/docs/proxy/alerting.md
+++ b/docs/my-website/docs/proxy/alerting.md
@@ -24,6 +24,7 @@ model_list:
general_settings:
alerting: ["slack"]
+ alerting_threshold: 300 # sends alerts if requests hang or take longer than 300s (5 minutes) to complete
environment_variables:
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index bb8399f1e6..9132854e91 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -1,8 +1,21 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
# Caching
Cache LLM Responses
-## Quick Start
+LiteLLM supports:
+- In Memory Cache
+- Redis Cache
+- s3 Bucket Cache
+
+## Quick Start - Redis, s3 Cache
+
+<Tabs>
+<TabItem value="redis" label="redis cache">
+
Caching can be enabled by adding the `cache` key in the `config.yaml`
+
### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
@@ -40,8 +53,45 @@ REDIS_ = ""
```shell
$ litellm --config /path/to/config.yaml
```
+
+</TabItem>
+<TabItem value="s3" label="s3 cache">
+
+### Step 1: Add `cache` to the config.yaml
+```yaml
+model_list:
+ - model_name: gpt-3.5-turbo
+ litellm_params:
+ model: gpt-3.5-turbo
+ - model_name: text-embedding-ada-002
+ litellm_params:
+ model: text-embedding-ada-002
+
+litellm_settings:
+ set_verbose: True
+ cache: True # set cache responses to True
+ cache_params: # set cache params for s3
+ type: s3
+ s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3
+ s3_region_name: us-west-2 # AWS Region Name for S3
+ s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3
+ s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
+ s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/Cloudflare S3-compatible buckets
+```
+
+### Step 2: Run proxy with config
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+</TabItem>
+</Tabs>
## Using Caching - /chat/completions
+
+<Tabs>
+<TabItem value="chat_completions" label="/chat/completions">
+
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +110,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
```
+
+</TabItem>
+<TabItem value="embedding" label="/embeddings">
-## Using Caching - /embeddings
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +129,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
"input": ["write a litellm poem"]
}'
```
+
+</TabItem>
+</Tabs>
## Advanced
### Set Cache Params on config.yaml
@@ -103,78 +156,121 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```
-### Cache-Controls on requests
+### Turn on / off caching per request
-Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
+The proxy supports 3 cache controls:
-Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
+- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
+- `s-maxage`: Will only accept cached responses that are newer than the user-defined max age (in seconds).
+- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
-```javascript
-const { OpenAI } = require('openai');
+[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
-const openai = new OpenAI({
- apiKey: "sk-1234", // This is the default and can be omitted
- baseURL: "http://0.0.0.0:8000"
-});
+**Turn off caching**
-async function main() {
- const chatCompletion = await openai.chat.completions.create({
- messages: [{ role: 'user', content: 'Say this is a test' }],
- model: 'gpt-3.5-turbo',
- }, {"headers": {
- "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
- }});
-}
+```python
+import os
+from openai import OpenAI
-main();
+client = OpenAI(
+ # This is the default and can be omitted
+ api_key=os.environ.get("OPENAI_API_KEY"),
+ base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+ messages=[
+ {
+ "role": "user",
+ "content": "Say this is a test",
+ }
+ ],
+ model="gpt-3.5-turbo",
+ cache={
+ "no-cache": True # will not return a cached response
+ }
+)
```
-### Override caching per `chat/completions` request
-Caching can be switched on/off per `/chat/completions` request
-- Caching **on** for individual completion - pass `caching=True`:
- ```shell
- curl http://0.0.0.0:8000/v1/chat/completions \
- -H "Content-Type: application/json" \
- -d '{
- "model": "gpt-3.5-turbo",
- "messages": [{"role": "user", "content": "write a poem about litellm!"}],
- "temperature": 0.7,
- "caching": true
- }'
- ```
-- Caching **off** for individual completion - pass `caching=False`:
- ```shell
- curl http://0.0.0.0:8000/v1/chat/completions \
- -H "Content-Type: application/json" \
- -d '{
- "model": "gpt-3.5-turbo",
- "messages": [{"role": "user", "content": "write a poem about litellm!"}],
- "temperature": 0.7,
- "caching": false
- }'
- ```
+**Turn on caching**
+```python
+import os
+from openai import OpenAI
-### Override caching per `/embeddings` request
+client = OpenAI(
+ # This is the default and can be omitted
+ api_key=os.environ.get("OPENAI_API_KEY"),
+ base_url="http://0.0.0.0:8000"
+)
-Caching can be switched on/off per `/embeddings` request
-- Caching **on** for embedding - pass `caching=True`:
- ```shell
- curl --location 'http://0.0.0.0:8000/embeddings' \
- --header 'Content-Type: application/json' \
- --data ' {
- "model": "text-embedding-ada-002",
- "input": ["write a litellm poem"],
- "caching": true
- }'
- ```
-- Caching **off** for completion - pass `caching=False`:
- ```shell
- curl --location 'http://0.0.0.0:8000/embeddings' \
- --header 'Content-Type: application/json' \
- --data ' {
- "model": "text-embedding-ada-002",
- "input": ["write a litellm poem"],
- "caching": false
- }'
- ```
\ No newline at end of file
+chat_completion = client.chat.completions.create(
+ messages=[
+ {
+ "role": "user",
+ "content": "Say this is a test",
+ }
+ ],
+ model="gpt-3.5-turbo",
+ cache={
+ "ttl": 600 # caches response for 10 minutes
+ }
+)
+```
+
+**Accept cached responses only within a time window (`s-maxage`)**
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+ # This is the default and can be omitted
+ api_key=os.environ.get("OPENAI_API_KEY"),
+ base_url="http://0.0.0.0:8000"
+)
+
+chat_completion = client.chat.completions.create(
+ messages=[
+ {
+ "role": "user",
+ "content": "Say this is a test",
+ }
+ ],
+ model="gpt-3.5-turbo",
+ cache={
+ "s-maxage": 600 # only get responses cached within last 10 minutes
+ }
+)
+```
+
+## Supported `cache_params`
+
+```yaml
+cache_params:
+ # Type of cache (options: "local", "redis", "s3")
+ type: s3
+
+ # List of litellm call types to cache for
+ # Options: "completion", "acompletion", "embedding", "aembedding"
+ supported_call_types:
+ - completion
+ - acompletion
+ - embedding
+ - aembedding
+
+ # Redis cache parameters
+ host: localhost # Redis server hostname or IP address
+ port: "6379" # Redis server port (as a string)
+ password: secret_password # Redis server password
+
+ # S3 cache parameters
+ s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket
+ s3_region_name: us-west-2 # AWS region of the S3 bucket
+ s3_api_version: "2006-03-01" # AWS S3 API version
+ s3_use_ssl: true # Use SSL for S3 connections (options: true, false)
+ s3_verify: true # SSL certificate verification for S3 connections (options: true, false)
+ s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL
+ s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3
+ s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
+ s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials
+
+```
diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index b6db7308b6..9b18340278 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -251,7 +251,7 @@ s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for h
1. Install Proxy dependencies
```bash
-$ pip install litellm[proxy] litellm[extra_proxy]
+$ pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment
diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md
index 92a8b2ec76..f8ff95004c 100644
--- a/docs/my-website/docs/proxy/quick_start.md
+++ b/docs/my-website/docs/proxy/quick_start.md
@@ -19,12 +19,6 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
-```shell
-$ pip install litellm[proxy]
-```
-
-If this fails try running
-
```shell
$ pip install 'litellm[proxy]'
```
@@ -190,6 +184,13 @@ $ export OPENAI_API_KEY=my-api-key
```shell
$ litellm --model gpt-3.5-turbo
```
+
+</TabItem>
+<TabItem value="ollama" label="Ollama">
+
+```shell
+$ litellm --model ollama/
+```
+
diff --git a/docs/my-website/docs/proxy/rules.md b/docs/my-website/docs/proxy/rules.md
new file mode 100644
index 0000000000..1e963577fb
--- /dev/null
+++ b/docs/my-website/docs/proxy/rules.md
@@ -0,0 +1,43 @@
+# Post-Call Rules
+
+Use this to fail a request based on the output of an LLM API call.
+
+## Quick Start
+
+### Step 1: Create a file (e.g. post_call_rules.py)
+
+```python
+def my_custom_rule(input): # receives the model response
+ if len(input) < 5: # trigger fallback if the model response is too short
+ return False
+ return True
+```
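+
+Rules receive the raw model response text and return `True` (pass) or `False` (fail the request, triggering retries/fallbacks). For example, a hypothetical rule that rejects refusal-style answers:
+
+```python
+def block_refusals(input): # hypothetical example rule - receives the model response
+    # fail the request if the model appears to have refused to answer
+    if "I'm sorry" in input or "I cannot" in input:
+        return False
+    return True
+```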
+
+### Step 2: Point it to your proxy config.yaml
+
+```yaml
+litellm_settings:
+ post_call_rules: post_call_rules.my_custom_rule
+ num_retries: 3
+```
+
+### Step 3: Start + test your proxy
+
+```bash
+$ litellm --config /path/to/config.yaml
+```
+
+```bash
+curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-1234' \
+--data '{
+ "model": "deepseek-coder",
+ "messages": [{"role":"user","content":"What llm are you?"}],
+ "temperature": 0.7,
+ "max_tokens": 10,
+}'
+```
+---
+
+This will now check that the model response is at least 5 characters long. If it isn't, the call is retried 3 times before failing.
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy_server.md b/docs/my-website/docs/proxy_server.md
index 200a92b95c..9c335f2a24 100644
--- a/docs/my-website/docs/proxy_server.md
+++ b/docs/my-website/docs/proxy_server.md
@@ -13,7 +13,7 @@ Docs outdated. New docs 👉 [here](./simple_proxy)
## Usage
```shell
-pip install litellm[proxy]
+pip install 'litellm[proxy]'
```
```shell
$ litellm --model ollama/codellama
diff --git a/docs/my-website/docs/secret.md b/docs/my-website/docs/secret.md
index 7234a85440..2f147af555 100644
--- a/docs/my-website/docs/secret.md
+++ b/docs/my-website/docs/secret.md
@@ -40,7 +40,7 @@ litellm.get_secret("your-test-key")
1. Install Proxy dependencies
```bash
-pip install litellm[proxy] litellm[extra_proxy]
+pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment
diff --git a/docs/my-website/docs/simple_proxy_old_doc.md b/docs/my-website/docs/simple_proxy_old_doc.md
index 6b65037ffa..6bff69eb3e 100644
--- a/docs/my-website/docs/simple_proxy_old_doc.md
+++ b/docs/my-website/docs/simple_proxy_old_doc.md
@@ -16,7 +16,7 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
-$ pip install litellm[proxy]
+$ pip install 'litellm[proxy]'
```
```shell
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index b65015906e..12ea591447 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -112,6 +112,7 @@ const sidebars = {
"proxy/reliability",
"proxy/health",
"proxy/call_hooks",
+ "proxy/rules",
"proxy/caching",
"proxy/alerting",
"proxy/logging",
@@ -167,20 +168,7 @@ const sidebars = {
`observability/telemetry`,
],
},
- {
- type: "category",
- label: "Caching",
- link: {
- type: 'generated-index',
- title: 'Providers',
- description: 'Learn how to deploy + call models from different providers on LiteLLM',
- slug: '/caching',
- },
- items: [
- "caching/local_caching",
- "caching/redis_cache",
- ],
- },
+ "caching/redis_cache",
{
type: "category",
label: "LangChain, LlamaIndex Integration",
diff --git a/docs/my-website/src/pages/index.md b/docs/my-website/src/pages/index.md
index 425266219c..1b13b9b2ba 100644
--- a/docs/my-website/src/pages/index.md
+++ b/docs/my-website/src/pages/index.md
@@ -375,6 +375,45 @@ response = completion(
Need a dedicated key? Email us @ krrish@berri.ai
+## OpenAI Proxy
+
+Track spend across multiple projects/people
+
+The proxy provides:
+1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
+2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
+3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
+4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
+
+### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
+
+### Quick Start Proxy - CLI
+
+```shell
+pip install 'litellm[proxy]'
+```
+
+#### Step 1: Start litellm proxy
+```shell
+$ litellm --model huggingface/bigcode/starcoder
+
+#INFO: Proxy running on http://0.0.0.0:8000
+```
+
+#### Step 2: Make ChatCompletions Request to Proxy
+```python
+import openai # openai v1.0.0+
+client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
+ {
+ "role": "user",
+ "content": "this is a test request, write a short poem"
+ }
+])
+
+print(response)
+```
## More details
* [exception mapping](./exception_mapping.md)
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 6157834db5..09af665a6b 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -111,6 +111,13 @@
"litellm_provider": "openai",
"mode": "embedding"
},
+ "text-embedding-ada-002-v2": {
+ "max_tokens": 8191,
+ "input_cost_per_token": 0.0000001,
+ "output_cost_per_token": 0.000000,
+ "litellm_provider": "openai",
+ "mode": "embedding"
+ },
"256-x-256/dall-e-2": {
"mode": "image_generation",
"input_cost_per_pixel": 0.00000024414,
@@ -242,6 +249,13 @@
"litellm_provider": "azure",
"mode": "chat"
},
+ "azure/ada": {
+ "max_tokens": 8191,
+ "input_cost_per_token": 0.0000001,
+ "output_cost_per_token": 0.000000,
+ "litellm_provider": "azure",
+ "mode": "embedding"
+ },
"azure/text-embedding-ada-002": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
@@ -1630,4 +1644,4 @@
"mode": "embedding"
}
-}
+}
\ No newline at end of file