(ci/cd) add all docs changes to this branch

This commit is contained in:
ishaan-jaff 2024-01-06 15:01:23 +05:30
parent cd98d256b5
commit 9bb2c13119
13 changed files with 427 additions and 122 deletions

View file

@@ -1,18 +1,29 @@
# Redis Cache
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
[**See Code**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/caching.py#L71)
# Caching - In-Memory, Redis, s3
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
## Initialize Cache - In Memory, Redis, s3 Bucket
<Tabs>
<TabItem value="redis" label="redis-cache">
### Pre-requisites
Install redis
```shell
pip install redis
```
For the hosted version, you can set up your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)
# Make completion calls
@@ -28,6 +39,78 @@ response2 = completion(
# response1 == response2, response 1 is cached
```
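Caching isn't limited to `completion()`; embedding calls are cached too, since `embedding`/`aembedding` are in the default `supported_call_types`. A minimal sketch, assuming the same Redis setup as above (host, port, and password below are placeholders):
```python
import litellm
from litellm.caching import Cache

# placeholder Redis credentials
litellm.cache = Cache(type="redis", host="localhost", port="6379", password="my-password")

# identical embedding calls hit the cache, same as completion calls
embedding1 = litellm.embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
embedding2 = litellm.embedding(model="text-embedding-ada-002", input=["write a litellm poem"])
# embedding2 is served from the cache
```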
</TabItem>
<TabItem value="s3" label="s3-cache">
Install boto3
```shell
pip install boto3
```
Set AWS environment variables
```shell
export AWS_ACCESS_KEY_ID="AKI*******"
export AWS_SECRET_ACCESS_KEY="WOl*****"
```
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
# pass s3-bucket name
litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")
# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)
# response1 == response2, response 1 is cached
```
</TabItem>
<TabItem value="in-mem" label="in memory cache">
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache()
# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)
# response1 == response2, response 1 is cached
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache
Use the context manager to easily enable, disable & update the litellm cache
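A minimal sketch of how this usually looks, assuming the `litellm.enable_cache()`, `litellm.disable_cache()` and `litellm.update_cache()` helpers this section refers to (Redis credentials below are placeholders):
```python
import litellm
from litellm import completion

# enable an in-memory cache (pass type="redis" + credentials for Redis)
litellm.enable_cache()

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)

# point the existing cache at Redis instead (placeholder credentials)
litellm.update_cache(type="redis", host="localhost", port="6379", password="my-password")

# turn caching off again
litellm.disable_cache()
```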
@@ -103,35 +186,34 @@ litellm.cache = cache # set litellm.cache to your cache
## Cache Initialization Parameters

```python
def __init__(
    self,
    type: Optional[Literal["local", "redis", "s3"]] = "local",
    supported_call_types: Optional[
        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
    ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.
    # redis cache params
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
    # s3 Bucket, boto3 configuration
    s3_bucket_name: Optional[str] = None,
    s3_region_name: Optional[str] = None,
    s3_api_version: Optional[str] = None,
    s3_use_ssl: Optional[bool] = True,
    s3_verify: Optional[Union[bool, str]] = None,
    s3_endpoint_url: Optional[str] = None,
    s3_aws_access_key_id: Optional[str] = None,
    s3_aws_secret_access_key: Optional[str] = None,
    s3_aws_session_token: Optional[str] = None,
    s3_config: Optional[Any] = None,
    **kwargs,
):
```

#### `type` (str, optional)
The type of cache to initialize. One of "local", "redis", or "s3". Defaults to "local".
#### `host` (str, optional)
The host address for the Redis cache. Required if `type` is set to "redis".
#### `port` (int, optional)
The port number for the Redis cache. Required if `type` is set to "redis".
#### `password` (str, optional)
The password for the Redis cache. Required if `type` is set to "redis".
#### `supported_call_types` (list, optional)
A list of call types to cache for. Defaults to caching for all call types. The available call types are:
- "completion"
- "acompletion"
- "embedding"
- "aembedding"
#### `**kwargs` (additional keyword arguments)
Additional keyword arguments are passed to the `redis.Redis()` constructor when initializing the Redis cache, letting you fine-tune the Redis configuration for your specific needs.
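For example, a Redis-backed cache that only caches chat calls and forwards extra connection options through `**kwargs`; a sketch with placeholder credentials (`ssl` and `socket_timeout` are standard `redis.Redis()` kwargs):
```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis",
    host="localhost",        # placeholder
    port="6379",             # placeholder
    password="my-password",  # placeholder
    supported_call_types=["completion", "acompletion"],  # only cache chat calls
    ssl=True,                # forwarded to redis.Redis()
    socket_timeout=5,        # forwarded to redis.Redis()
)
```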
## Logging

View file

@@ -396,7 +396,48 @@ response = completion(
)
```
## OpenAI Proxy
Track spend across multiple projects/people
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
### Quick Start Proxy - CLI
```shell
pip install 'litellm[proxy]'
```
#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])
print(response)
```
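Since the proxy exposes an OpenAI-compatible endpoint, streaming should also work through the same client; a small sketch (prompt and model are placeholders):
```python
import openai  # openai v1.0.0+

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")  # set proxy to base_url

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a short poem"}],
    stream=True,  # stream tokens as they arrive
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```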
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)

View file

@@ -24,6 +24,7 @@ model_list:
general_settings:
  alerting: ["slack"]
  alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+

environment_variables:
  SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"

View file

@@ -1,8 +1,21 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching
Cache LLM Responses
## Quick Start
LiteLLM supports:
- In Memory Cache
- Redis Cache
- s3 Bucket Cache
## Quick Start - Redis, s3 Cache
<Tabs>
<TabItem value="redis" label="redis cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
@@ -40,8 +53,45 @@ REDIS_<redis-kwarg-name> = ""
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="s3" label="s3 cache">
### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
  - model_name: text-embedding-ada-002
    litellm_params:
      model: text-embedding-ada-002

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True
  cache_params:        # set cache params for s3
    type: s3
    s3_bucket_name: cache-bucket-litellm   # AWS Bucket Name for S3
    s3_region_name: us-west-2              # AWS Region Name for S3
    s3_aws_access_key_id: your_access_key  # AWS Access Key ID for S3
    s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
    s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, e.g. for Backblaze/Cloudflare S3-compatible buckets
```
### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>
## Using Caching - /chat/completions
<Tabs>
<TabItem value="chat_completions" label="/chat/completions">
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +110,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
```
</TabItem>
<TabItem value="embeddings" label="/embeddings">
## Using Caching - /embeddings
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +129,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
"input": ["write a litellm poem"]
}'
```
</TabItem>
</Tabs>
## Advanced
### Set Cache Params on config.yaml
@@ -103,78 +156,121 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```
### Cache-Controls on requests
Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
```javascript
const { OpenAI } = require('openai');

const openai = new OpenAI({
  apiKey: "sk-1234", // This is the default and can be omitted
  baseURL: "http://0.0.0.0:8000"
});

async function main() {
  const chatCompletion = await openai.chat.completions.create({
    messages: [{ role: 'user', content: 'Say this is a test' }],
    model: 'gpt-3.5-turbo',
  }, {"headers": {
    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
  }});
}

main();
```
### Turn on / off caching per request.
The proxy supports 3 cache-controls:
- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: Will not return a cached response, but instead call the actual endpoint.

[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)

**Turn off caching**
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "no-cache": True # will not return a cached response
    }
)
```
**Turn on caching**
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "ttl": 600 # caches response for 10 minutes
    }
)
```
```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "s-maxage": 600 # only get responses cached within last 10 minutes
    }
)
```
### Override caching per `chat/completions` request
Caching can be switched on/off per `/chat/completions` request
- Caching **on** for individual completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": true
  }'
```
- Caching **off** for individual completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": false
  }'
```
### Override caching per `/embeddings` request
Caching can be switched on/off per `/embeddings` request
- Caching **on** for embedding - pass `caching=True`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": true
  }'
```
- Caching **off** for embedding - pass `caching=False`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": false
  }'
```
## Supported `cache_params`
```yaml
cache_params:
  # Type of cache (options: "local", "redis", "s3")
  type: s3

  # List of litellm call types to cache for
  # Options: "completion", "acompletion", "embedding", "aembedding"
  supported_call_types:
    - completion
    - acompletion
    - embedding
    - aembedding

  # Redis cache parameters
  host: localhost # Redis server hostname or IP address
  port: "6379" # Redis server port (as a string)
  password: secret_password # Redis server password

  # S3 cache parameters
  s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket
  s3_region_name: us-west-2 # AWS region of the S3 bucket
  s3_api_version: "2006-03-01" # AWS S3 API version
  s3_use_ssl: true # Use SSL for S3 connections (options: true, false)
  s3_verify: true # SSL certificate verification for S3 connections (options: true, false)
  s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL
  s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3
  s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
  s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials
```
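These `cache_params` mirror the `litellm.caching.Cache` constructor shown in the SDK caching docs; for reference, a sketch of the equivalent programmatic initialization (bucket name and credentials are placeholders):
```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="s3",
    s3_bucket_name="your_s3_bucket_name",        # placeholder
    s3_region_name="us-west-2",
    s3_endpoint_url="https://s3.amazonaws.com",
    s3_aws_access_key_id="your_access_key",      # placeholder
    s3_aws_secret_access_key="your_secret_key",  # placeholder
)
```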

View file

@@ -251,7 +251,7 @@ s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for h
1. Install Proxy dependencies
```bash
$ pip install litellm[proxy] litellm[extra_proxy]
$ pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment

View file

@@ -19,12 +19,6 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install litellm[proxy]
```
If this fails, try running
```shell
$ pip install 'litellm[proxy]'
```
@@ -190,6 +184,13 @@ $ export OPENAI_API_KEY=my-api-key
```shell
$ litellm --model gpt-3.5-turbo
```
</TabItem>
<TabItem value="ollama" label="Ollama">
```shell
$ litellm --model ollama/<ollama-model-name>
```
</TabItem>
<TabItem value="openai-proxy" label="OpenAI Compatible Endpoint">

View file

@@ -0,0 +1,43 @@
# Post-Call Rules
Use this to fail a request based on the output of an LLM API call.
## Quick Start
### Step 1: Create a file (e.g. post_call_rules.py)
```python
def my_custom_rule(input): # receives the model response
    if len(input) < 5: # trigger fallback if the model response is too short
        return False
    return True
```
### Step 2. Point it to your proxy
```yaml
litellm_settings:
  post_call_rules: post_call_rules.my_custom_rule
  num_retries: 3
```
### Step 3. Start + test your proxy
```bash
$ litellm --config /path/to/config.yaml
```
```bash
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "deepseek-coder",
"messages": [{"role":"user","content":"What llm are you?"}],
"temperature": 0.7,
"max_tokens": 10,
}'
```
---
This will now check that the response is at least 5 characters long; if the check fails, the call is retried 3 times (`num_retries: 3`) before failing.
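Rules are plain Python functions, so any check on the response text works. A second, hypothetical rule using the same string-in/boolean-out interface as Step 1:
```python
# post_call_rules.py (hypothetical additional rule)
def no_refusals(input): # receives the model response text
    refusal_markers = ["I'm sorry", "I cannot help"]
    if any(marker in input for marker in refusal_markers):
        return False # fail the request; the proxy retries (num_retries) before erroring
    return True
```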

View file

@@ -13,7 +13,7 @@ Docs outdated. New docs 👉 [here](./simple_proxy)
## Usage
```shell
pip install litellm[proxy]
pip install 'litellm[proxy]'
```
```shell
$ litellm --model ollama/codellama

View file

@@ -40,7 +40,7 @@ litellm.get_secret("your-test-key")
1. Install Proxy dependencies
```bash
pip install litellm[proxy] litellm[extra_proxy]
pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```
2. Save Azure details in your environment

View file

@@ -16,7 +16,7 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install litellm[proxy]
$ pip install 'litellm[proxy]'
```
```shell

View file

@@ -112,6 +112,7 @@ const sidebars = {
"proxy/reliability",
"proxy/health",
"proxy/call_hooks",
"proxy/rules",
"proxy/caching",
"proxy/alerting",
"proxy/logging",
@@ -167,20 +168,7 @@ const sidebars = {
`observability/telemetry`,
],
},
{
type: "category",
label: "Caching",
link: {
type: 'generated-index',
title: 'Providers',
description: 'Learn how to deploy + call models from different providers on LiteLLM',
slug: '/caching',
},
items: [
"caching/local_caching",
"caching/redis_cache",
],
},
"caching/redis_cache",
{
type: "category",
label: "LangChain, LlamaIndex Integration",

View file

@@ -375,6 +375,45 @@ response = completion(
Need a dedicated key? Email us @ krrish@berri.ai
## OpenAI Proxy
Track spend across multiple projects/people
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
### Quick Start Proxy - CLI
```shell
pip install 'litellm[proxy]'
```
#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])
print(response)
```
## More details
* [exception mapping](./exception_mapping.md)

View file

@@ -111,6 +111,13 @@
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-ada-002-v2": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"256-x-256/dall-e-2": {
"mode": "image_generation",
"input_cost_per_pixel": 0.00000024414,
@@ -242,6 +249,13 @@
"litellm_provider": "azure",
"mode": "chat"
},
"azure/ada": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "azure",
"mode": "embedding"
},
"azure/text-embedding-ada-002": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
@@ -1630,4 +1644,4 @@
"mode": "embedding"
}
}
}
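For reference, a sketch of how entries like these are typically consumed, assuming litellm exposes this map as `litellm.model_cost` and prices tokens via `cost_per_token()`:
```python
import litellm

# look up the per-token prices for the entry added above
entry = litellm.model_cost.get("azure/ada", {})
print(entry.get("input_cost_per_token"))  # 1e-07

# cost_per_token() multiplies token counts by these per-token prices
prompt_cost, completion_cost = litellm.cost_per_token(
    model="azure/ada", prompt_tokens=1000, completion_tokens=0
)
print(prompt_cost)  # ~0.0001 at the rate above
```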