From 9bb2c131197bb87e8408a7cc857278931bbbf910 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Sat, 6 Jan 2024 15:01:23 +0530 Subject: [PATCH] (ci/cd) add all docs changes to this branch --- docs/my-website/docs/caching/redis_cache.md | 142 +++++++++--- docs/my-website/docs/index.md | 43 +++- docs/my-website/docs/proxy/alerting.md | 1 + docs/my-website/docs/proxy/caching.md | 228 +++++++++++++------ docs/my-website/docs/proxy/configs.md | 2 +- docs/my-website/docs/proxy/quick_start.md | 13 +- docs/my-website/docs/proxy/rules.md | 43 ++++ docs/my-website/docs/proxy_server.md | 2 +- docs/my-website/docs/secret.md | 2 +- docs/my-website/docs/simple_proxy_old_doc.md | 2 +- docs/my-website/sidebars.js | 16 +- docs/my-website/src/pages/index.md | 39 ++++ model_prices_and_context_window.json | 16 +- 13 files changed, 427 insertions(+), 122 deletions(-) create mode 100644 docs/my-website/docs/proxy/rules.md diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index fc799bcc39..3d70c5e3db 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,18 +1,29 @@ -# Redis Cache +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -[**See Code**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/caching.py#L71) +# Caching - In-Memory, Redis, s3 + +[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) + +## Initialize Cache - In Memory, Redis, s3 Bucket + + + + + -### Pre-requisites Install redis ```shell pip install redis ``` + For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ ### Quick Start ```python import litellm from litellm import completion from litellm.caching import Cache + litellm.cache = Cache(type="redis", host=, port=, password=) # Make completion calls @@ -28,6 +39,78 @@ response2 = completion( # response1 == response2, response 1 is cached ``` + + + + + +Install boto3 +```shell +pip install boto3 +``` + +Set AWS environment variables + +```shell +AWS_ACCESS_KEY_ID = "AKI*******" +AWS_SECRET_ACCESS_KEY = "WOl*****" +``` +### Quick Start +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +# pass s3-bucket name +litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2") + +# Make completion calls +response1 = completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Tell me a joke."}] +) +response2 = completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Tell me a joke."}] +) + +# response1 == response2, response 1 is cached +``` + + + + + + +### Quick Start + +```python +import litellm +from litellm import completion +from litellm.caching import Cache +litellm.cache = Cache() + +# Make completion calls +response1 = completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Tell me a joke."}] + caching=True +) +response2 = completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Tell me a joke."}], + caching=True +) + +# response1 == response2, response 1 is cached + +``` + + + + + + ## Cache Context Manager - Enable, Disable, Update Cache Use the context manager for easily enabling, disabling & updating the litellm cache @@ -103,35 +186,34 @@ litellm.cache = cache # set litellm.cache to your cache ## Cache Initialization Parameters -#### `type` (str, optional) +```python +def __init__( + self, + type: 
Optional[Literal["local", "redis", "s3"]] = "local", + supported_call_types: Optional[ + List[Literal["completion", "acompletion", "embedding", "aembedding"]] + ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types. + + # redis cache params + host: Optional[str] = None, + port: Optional[str] = None, + password: Optional[str] = None, -The type of cache to initialize. It can be either "local" or "redis". Defaults to "local". - -#### `host` (str, optional) - -The host address for the Redis cache. This parameter is required if the `type` is set to "redis". - -#### `port` (int, optional) - -The port number for the Redis cache. This parameter is required if the `type` is set to "redis". - -#### `password` (str, optional) - -The password for the Redis cache. This parameter is required if the `type` is set to "redis". - -#### `supported_call_types` (list, optional) - -A list of call types to cache for. Defaults to caching for all call types. The available call types are: - -- "completion" -- "acompletion" -- "embedding" -- "aembedding" - -#### `**kwargs` (additional keyword arguments) - -Additional keyword arguments are accepted for the initialization of the Redis cache using the `redis.Redis()` constructor. These arguments allow you to fine-tune the Redis cache configuration based on your specific needs. + # s3 Bucket, boto3 configuration + s3_bucket_name: Optional[str] = None, + s3_region_name: Optional[str] = None, + s3_api_version: Optional[str] = None, + s3_use_ssl: Optional[bool] = True, + s3_verify: Optional[Union[bool, str]] = None, + s3_endpoint_url: Optional[str] = None, + s3_aws_access_key_id: Optional[str] = None, + s3_aws_secret_access_key: Optional[str] = None, + s3_aws_session_token: Optional[str] = None, + s3_config: Optional[Any] = None, + **kwargs, +): +``` ## Logging diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index f2329be1e6..f6bd7bc42b 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -396,7 +396,48 @@ response = completion( ) ``` +## OpenAI Proxy + +Track spend across multiple projects/people + +The proxy provides: +1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth) +2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class) +3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend) +4. 
[Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits) + +### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/) + +### Quick Start Proxy - CLI + +```shell +pip install 'litellm[proxy]' +``` + +#### Step 1: Start litellm proxy +```shell +$ litellm --model huggingface/bigcode/starcoder + +#INFO: Proxy running on http://0.0.0.0:8000 +``` + +#### Step 2: Make ChatCompletions Request to Proxy +```python +import openai # openai v1.0.0+ +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +]) + +print(response) +``` + + ## More details * [exception mapping](./exception_mapping.md) * [retries + model fallbacks for completion()](./completion/reliable_completions.md) -* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md) \ No newline at end of file +* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md) diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md index a4a2ab6c8e..699f6d6ffa 100644 --- a/docs/my-website/docs/proxy/alerting.md +++ b/docs/my-website/docs/proxy/alerting.md @@ -24,6 +24,7 @@ model_list: general_settings: alerting: ["slack"] + alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+ environment_variables: SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>" diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index bb8399f1e6..9132854e91 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -1,8 +1,21 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Caching Cache LLM Responses -## Quick Start +LiteLLM supports: +- In Memory Cache +- Redis Cache +- s3 Bucket Cache + +## Quick Start - Redis, s3 Cache + + + + Caching can be enabled by adding the `cache` key in the `config.yaml` + ### Step 1: Add `cache` to the config.yaml ```yaml model_list: @@ -40,8 +53,45 @@ REDIS_ = "" ```shell $ litellm --config /path/to/config.yaml ``` + + + + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: text-embedding-ada-002 + litellm_params: + model: text-embedding-ada-002 + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True + cache_params: # set cache params for s3 + type: s3 + s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3 + s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3 + s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets +``` + +### Step 2: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + + + ## Using Caching - /chat/completions + + + + Send the same request twice: ```shell curl http://0.0.0.0:8000/v1/chat/completions \ @@ -60,8 +110,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \ "temperature": 0.7 }' ``` + + -## Using Caching - /embeddings Send the same request twice: ```shell curl --location 'http://0.0.0.0:8000/embeddings' \ @@ -78,6 +129,8 @@ curl 
--location 'http://0.0.0.0:8000/embeddings' \ "input": ["write a litellm poem"] }' ``` + + ## Advanced ### Set Cache Params on config.yaml @@ -103,78 +156,121 @@ litellm_settings: supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types ``` -### Cache-Controls on requests +### Turn on / off caching per request. -Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`. +The proxy support 3 cache-controls: -Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218 +- `ttl`: Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: Will not return a cached response, but instead call the actual endpoint. -```javascript -const { OpenAI } = require('openai'); +[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) -const openai = new OpenAI({ - apiKey: "sk-1234", // This is the default and can be omitted - baseURL: "http://0.0.0.0:8000" -}); +**Turn off caching** -async function main() { - const chatCompletion = await openai.chat.completions.create({ - messages: [{ role: 'user', content: 'Say this is a test' }], - model: 'gpt-3.5-turbo', - }, {"headers": { - "Cache-Control": "s-maxage=0" // 👈 sets ttl=0 - }}); -} +```python +import os +from openai import OpenAI -main(); +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url="http://0.0.0.0:8000" +) + +chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": "Say this is a test", + } + ], + model="gpt-3.5-turbo", + cache={ + "no-cache": True # will not return a cached response + } +) ``` -### Override caching per `chat/completions` request -Caching can be switched on/off per `/chat/completions` request -- Caching **on** for individual completion - pass `caching=True`: - ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7, - "caching": true - }' - ``` -- Caching **off** for individual completion - pass `caching=False`: - ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7, - "caching": false - }' - ``` +**Turn on caching** +```python +import os +from openai import OpenAI -### Override caching per `/embeddings` request +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url="http://0.0.0.0:8000" +) -Caching can be switched on/off per `/embeddings` request -- Caching **on** for embedding - pass `caching=True`: - ```shell - curl --location 'http://0.0.0.0:8000/embeddings' \ - --header 'Content-Type: application/json' \ - --data ' { - "model": "text-embedding-ada-002", - "input": ["write a litellm poem"], - "caching": true - }' - ``` -- Caching **off** for completion - pass `caching=False`: - ```shell - curl --location 'http://0.0.0.0:8000/embeddings' \ - --header 'Content-Type: application/json' \ - --data ' { - "model": "text-embedding-ada-002", - "input": ["write a litellm poem"], - "caching": false - }' - ``` \ No newline at end of file 
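+# NOTE: `cache` below is a LiteLLM proxy-specific parameter, not part of the standard
+# OpenAI API. If your OpenAI client version rejects unknown keyword arguments, pass it
+# through `extra_body={"cache": {"ttl": 600}}` instead.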
+chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": "Say this is a test", + } + ], + model="gpt-3.5-turbo", + cache={ + "ttl": 600 # caches response for 10 minutes + } +) +``` + +```python +import os +from openai import OpenAI + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url="http://0.0.0.0:8000" +) + +chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": "Say this is a test", + } + ], + model="gpt-3.5-turbo", + cache={ + "s-maxage": 600 # only get responses cached within last 10 minutes + } +) +``` + +## Supported `cache_params` + +```yaml +cache_params: + # Type of cache (options: "local", "redis", "s3") + type: s3 + + # List of litellm call types to cache for + # Options: "completion", "acompletion", "embedding", "aembedding" + supported_call_types: + - completion + - acompletion + - embedding + - aembedding + + # Redis cache parameters + host: localhost # Redis server hostname or IP address + port: "6379" # Redis server port (as a string) + password: secret_password # Redis server password + + # S3 cache parameters + s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket + s3_region_name: us-west-2 # AWS region of the S3 bucket + s3_api_version: 2006-03-01 # AWS S3 API version + s3_use_ssl: true # Use SSL for S3 connections (options: true, false) + s3_verify: true # SSL certificate verification for S3 connections (options: true, false) + s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL + s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3 + s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3 + s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials + +``` diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index b6db7308b6..9b18340278 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -251,7 +251,7 @@ s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for h 1. Install Proxy dependencies ```bash -$ pip install litellm[proxy] litellm[extra_proxy] +$ pip install 'litellm[proxy]' 'litellm[extra_proxy]' ``` 2. Save Azure details in your environment diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 92a8b2ec76..f8ff95004c 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -19,12 +19,6 @@ LiteLLM Server manages: View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments) -```shell -$ pip install litellm[proxy] -``` - -If this fails try running - ```shell $ pip install 'litellm[proxy]' ``` @@ -190,6 +184,13 @@ $ export OPENAI_API_KEY=my-api-key ```shell $ litellm --model gpt-3.5-turbo ``` + + + +``` +$ litellm --model ollama/ +``` + diff --git a/docs/my-website/docs/proxy/rules.md b/docs/my-website/docs/proxy/rules.md new file mode 100644 index 0000000000..1e963577fb --- /dev/null +++ b/docs/my-website/docs/proxy/rules.md @@ -0,0 +1,43 @@ +# Post-Call Rules + +Use this to fail a request based on the output of an llm api call. + +## Quick Start + +### Step 1: Create a file (e.g. 
post_call_rules.py)
+
+```python
+def my_custom_rule(input): # receives the model response
+    if len(input) < 5: # trigger fallback if the model response is too short
+        return False
+    return True
+```
+
+### Step 2: Point it to your proxy
+
+```yaml
+litellm_settings:
+  post_call_rules: post_call_rules.my_custom_rule
+  num_retries: 3
+```
+
+### Step 3: Start + test your proxy
+
+```bash
+$ litellm --config /path/to/config.yaml
+```
+
+```bash
+curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-1234' \
+--data '{
+  "model": "deepseek-coder",
+  "messages": [{"role":"user","content":"What llm are you?"}],
+  "temperature": 0.7,
+  "max_tokens": 10
+}'
+```
+---
+
+This now checks that the model response is at least 5 characters long. If the check fails, the proxy retries the call up to 3 times (`num_retries: 3`) before returning an error.
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy_server.md b/docs/my-website/docs/proxy_server.md
index 200a92b95c..9c335f2a24 100644
--- a/docs/my-website/docs/proxy_server.md
+++ b/docs/my-website/docs/proxy_server.md
@@ -13,7 +13,7 @@ Docs outdated. New docs 👉 [here](./simple_proxy)
 
 ## Usage 
 ```shell 
-pip install litellm[proxy]
+pip install 'litellm[proxy]'
 ```
 ```shell 
 $ litellm --model ollama/codellama 
diff --git a/docs/my-website/docs/secret.md b/docs/my-website/docs/secret.md
index 7234a85440..2f147af555 100644
--- a/docs/my-website/docs/secret.md
+++ b/docs/my-website/docs/secret.md
@@ -40,7 +40,7 @@ litellm.get_secret("your-test-key")
 
 1. Install Proxy dependencies 
 ```bash 
-pip install litellm[proxy] litellm[extra_proxy]
+pip install 'litellm[proxy]' 'litellm[extra_proxy]'
 ```
 
 2. Save Azure details in your environment 
diff --git a/docs/my-website/docs/simple_proxy_old_doc.md b/docs/my-website/docs/simple_proxy_old_doc.md
index 6b65037ffa..6bff69eb3e 100644
--- a/docs/my-website/docs/simple_proxy_old_doc.md
+++ b/docs/my-website/docs/simple_proxy_old_doc.md
@@ -16,7 +16,7 @@ LiteLLM Server manages:
 View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
 
 ```shell
-$ pip install litellm[proxy]
+$ pip install 'litellm[proxy]'
 ```
 
 ```shell
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index b65015906e..12ea591447 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -112,6 +112,7 @@ const sidebars = {
       "proxy/reliability",
       "proxy/health",
       "proxy/call_hooks",
+      "proxy/rules",
       "proxy/caching",
       "proxy/alerting",
       "proxy/logging",
@@ -167,20 +168,7 @@ const sidebars = {
         `observability/telemetry`,
       ],
     },
-    {
-      type: "category",
-      label: "Caching",
-      link: {
-        type: 'generated-index',
-        title: 'Providers',
-        description: 'Learn how to deploy + call models from different providers on LiteLLM',
-        slug: '/caching',
-      },
-      items: [
-        "caching/local_caching",
-        "caching/redis_cache",
-      ],
-    },
+    "caching/redis_cache",
     {
       type: "category",
       label: "LangChain, LlamaIndex Integration",
diff --git a/docs/my-website/src/pages/index.md b/docs/my-website/src/pages/index.md
index 425266219c..1b13b9b2ba 100644
--- a/docs/my-website/src/pages/index.md
+++ b/docs/my-website/src/pages/index.md
@@ -375,6 +375,45 @@ response = completion(
 
 Need a dedicated key? Email us @ krrish@berri.ai 
 
+## OpenAI Proxy
+
+Track spend across multiple projects/people
+
+The proxy provides:
+1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
+2. 
[Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class) +3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend) +4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits) + +### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/) + +### Quick Start Proxy - CLI + +```shell +pip install 'litellm[proxy]' +``` + +#### Step 1: Start litellm proxy +```shell +$ litellm --model huggingface/bigcode/starcoder + +#INFO: Proxy running on http://0.0.0.0:8000 +``` + +#### Step 2: Make ChatCompletions Request to Proxy +```python +import openai # openai v1.0.0+ +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +]) + +print(response) +``` ## More details * [exception mapping](./exception_mapping.md) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 6157834db5..09af665a6b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -111,6 +111,13 @@ "litellm_provider": "openai", "mode": "embedding" }, + "text-embedding-ada-002-v2": { + "max_tokens": 8191, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.000000, + "litellm_provider": "openai", + "mode": "embedding" + }, "256-x-256/dall-e-2": { "mode": "image_generation", "input_cost_per_pixel": 0.00000024414, @@ -242,6 +249,13 @@ "litellm_provider": "azure", "mode": "chat" }, + "azure/ada": { + "max_tokens": 8191, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.000000, + "litellm_provider": "azure", + "mode": "embedding" + }, "azure/text-embedding-ada-002": { "max_tokens": 8191, "input_cost_per_token": 0.0000001, @@ -1630,4 +1644,4 @@ "mode": "embedding" } -} +} \ No newline at end of file