(ci/cd) add all docs changes to this branch
This commit is contained in: parent cd98d256b5, commit 9bb2c13119
13 changed files with 427 additions and 122 deletions
@@ -1,18 +1,29 @@
# Redis Cache
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

[**See Code**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/caching.py#L71)
# Caching - In-Memory, Redis, s3

[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)

## Initialize Cache - In Memory, Redis, s3 Bucket

<Tabs>

<TabItem value="redis" label="redis-cache">

### Pre-requisites
Install redis
```shell
pip install redis
```

For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)

# Make completion calls
@@ -28,6 +39,78 @@ response2 = completion(
# response1 == response2, response 1 is cached
```

</TabItem>


<TabItem value="s3" label="s3-cache">

Install boto3
```shell
pip install boto3
```

Set AWS environment variables

```shell
AWS_ACCESS_KEY_ID = "AKI*******"
AWS_SECRET_ACCESS_KEY = "WOl*****"
```
### Quick Start
```python
import litellm
from litellm import completion
from litellm.caching import Cache

# pass s3-bucket name
litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")

# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}]
)

# response1 == response2, response 1 is cached
```

</TabItem>


<TabItem value="in-mem" label="in memory cache">

### Quick Start

```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache()

# Make completion calls
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)
response2 = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)

# response1 == response2, response 1 is cached
```

</TabItem>


</Tabs>

## Cache Context Manager - Enable, Disable, Update Cache
Use the context manager to easily enable, disable, and update the litellm cache.

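The helper calls themselves fall outside this hunk; the next hunk's context line shows the tail of that example (`litellm.cache = cache`). A minimal sketch of how the enable/disable/update helpers are typically used, assuming `litellm.enable_cache`, `litellm.disable_cache`, and `litellm.update_cache` accept the same parameters as `Cache()`:

```python
import litellm
from litellm import completion

# turn caching on for subsequent calls (assumed helper; same params as Cache())
litellm.enable_cache(type="local")

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
)

# point the existing cache at a different backend (assumed helper)
litellm.update_cache(type="redis", host="<host>", port="<port>", password="<password>")

# turn caching back off (assumed helper)
litellm.disable_cache()
```
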
@@ -103,35 +186,34 @@ litellm.cache = cache # set litellm.cache to your cache

## Cache Initialization Parameters

#### `type` (str, optional)
```python
def __init__(
    self,
    type: Optional[Literal["local", "redis", "s3"]] = "local",
    supported_call_types: Optional[
        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
    ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.

    # redis cache params
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,

The type of cache to initialize. It can be either "local" or "redis". Defaults to "local".

#### `host` (str, optional)

The host address for the Redis cache. This parameter is required if the `type` is set to "redis".

#### `port` (int, optional)

The port number for the Redis cache. This parameter is required if the `type` is set to "redis".

#### `password` (str, optional)

The password for the Redis cache. This parameter is required if the `type` is set to "redis".

#### `supported_call_types` (list, optional)

A list of call types to cache for. Defaults to caching for all call types. The available call types are:

- "completion"
- "acompletion"
- "embedding"
- "aembedding"

#### `**kwargs` (additional keyword arguments)

Additional keyword arguments are accepted for the initialization of the Redis cache using the `redis.Redis()` constructor. These arguments allow you to fine-tune the Redis cache configuration based on your specific needs.

    # s3 Bucket, boto3 configuration
    s3_bucket_name: Optional[str] = None,
    s3_region_name: Optional[str] = None,
    s3_api_version: Optional[str] = None,
    s3_use_ssl: Optional[bool] = True,
    s3_verify: Optional[Union[bool, str]] = None,
    s3_endpoint_url: Optional[str] = None,
    s3_aws_access_key_id: Optional[str] = None,
    s3_aws_secret_access_key: Optional[str] = None,
    s3_aws_session_token: Optional[str] = None,
    s3_config: Optional[Any] = None,
    **kwargs,
):
```

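To make the `**kwargs` passthrough concrete, here is a hedged sketch of initializing a Redis-backed cache with extra `redis.Redis()` options; the `ssl` and `socket_timeout` values are illustrative, and any keyword accepted by `redis.Redis()` should pass through the same way:

```python
import litellm
from litellm.caching import Cache

# named params are handled by Cache itself; extra kwargs are forwarded to redis.Redis()
litellm.cache = Cache(
    type="redis",
    host="my-redis-host",      # illustrative hostname
    port="6379",
    password="my-redis-password",
    supported_call_types=["completion", "embedding"],  # only cache sync calls
    ssl=True,                  # forwarded to redis.Redis()
    socket_timeout=5,          # forwarded to redis.Redis()
)
```
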
## Logging
@@ -396,7 +396,48 @@ response = completion(
)
```

## OpenAI Proxy

Track spend across multiple projects/people

The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)

### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)

### Quick Start Proxy - CLI

```shell
pip install 'litellm[proxy]'
```

#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder

#INFO: Proxy running on http://0.0.0.0:8000
```

#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])

print(response)
```


## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)

@@ -24,6 +24,7 @@ model_list:

general_settings:
  alerting: ["slack"]
  alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+

environment_variables:
  SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"

@@ -1,8 +1,21 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Caching
Cache LLM Responses

## Quick Start
LiteLLM supports:
- In Memory Cache
- Redis Cache
- s3 Bucket Cache

## Quick Start - Redis, s3 Cache
<Tabs>

<TabItem value="redis" label="redis cache">

Caching can be enabled by adding the `cache` key in the `config.yaml`

### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
@@ -40,8 +53,45 @@ REDIS_<redis-kwarg-name> = ""
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>

<TabItem value="s3" label="s3 cache">

### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
  - model_name: text-embedding-ada-002
    litellm_params:
      model: text-embedding-ada-002

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True
  cache_params:        # set cache params for s3
    type: s3
    s3_bucket_name: cache-bucket-litellm       # AWS Bucket Name for S3
    s3_region_name: us-west-2                  # AWS Region Name for S3
    s3_aws_access_key_id: your_access_key      # AWS Access Key ID for S3
    s3_aws_secret_access_key: your_secret_key  # AWS Secret Access Key for S3
    s3_endpoint_url: https://s3.amazonaws.com  # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```

### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>


## Using Caching - /chat/completions

<Tabs>
<TabItem value="chat_completions" label="/chat/completions">

Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
@@ -60,8 +110,9 @@ curl http://0.0.0.0:8000/v1/chat/completions \
    "temperature": 0.7
  }'
```
</TabItem>
<TabItem value="embeddings" label="/embeddings">

## Using Caching - /embeddings
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
@@ -78,6 +129,8 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
    "input": ["write a litellm poem"]
  }'
```
</TabItem>
</Tabs>

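The same double-request check can also be run from Python against the proxy; a minimal sketch using the OpenAI SDK, assuming the proxy is running on port 8000 with a `gpt-3.5-turbo` entry in `model_list`:

```python
from openai import OpenAI

client = OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")  # point the SDK at the proxy

def ask():
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "write a short poem about litellm"}],
        temperature=0.7,
    )

response1 = ask()
response2 = ask()  # identical request - served from the proxy cache

# identical content indicates the second call was a cache hit
print(response1.choices[0].message.content == response2.choices[0].message.content)
```
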
## Advanced

### Set Cache Params on config.yaml
@@ -103,78 +156,121 @@ litellm_settings:
  supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```

### Cache-Controls on requests
### Turn on / off caching per request.

Set ttl per request by passing Cache-Controls. The proxy currently supports just `s-maxage`.
The proxy supports 3 cache-controls:

Comment on this issue if you need additional cache controls - https://github.com/BerriAI/litellm/issues/1218
- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: Will only accept cached responses that are within the user-defined range (in seconds).
- `no-cache`: Will not return a cached response, but instead call the actual endpoint.

```javascript
const { OpenAI } = require('openai');
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)

const openai = new OpenAI({
  apiKey: "sk-1234", // This is the default and can be omitted
  baseURL: "http://0.0.0.0:8000"
});
**Turn off caching**

async function main() {
  const chatCompletion = await openai.chat.completions.create({
    messages: [{ role: 'user', content: 'Say this is a test' }],
    model: 'gpt-3.5-turbo',
  }, {"headers": {
    "Cache-Control": "s-maxage=0" // 👈 sets ttl=0
  }});
}
```python
import os
from openai import OpenAI

main();
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "no-cache": True # will not return a cached response
    }
)
```

### Override caching per `chat/completions` request
Caching can be switched on/off per `/chat/completions` request
- Caching **on** for individual completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": true
  }'
```
- Caching **off** for individual completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": false
  }'
```
**Turn on caching**

```python
import os
from openai import OpenAI

### Override caching per `/embeddings` request
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

Caching can be switched on/off per `/embeddings` request
- Caching **on** for embedding - pass `caching=True`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data ' {
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": true
  }'
```
- Caching **off** for completion - pass `caching=False`:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
  --header 'Content-Type: application/json' \
  --data ' {
    "model": "text-embedding-ada-002",
    "input": ["write a litellm poem"],
    "caching": false
  }'
```
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "ttl": 600 # caches response for 10 minutes
    }
)
```

```python
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="http://0.0.0.0:8000"
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    cache={
        "s-maxage": 600 # only get responses cached within last 10 minutes
    }
)
```

## Supported `cache_params`

```yaml
cache_params:
  # Type of cache (options: "local", "redis", "s3")
  type: s3

  # List of litellm call types to cache for
  # Options: "completion", "acompletion", "embedding", "aembedding"
  supported_call_types:
    - completion
    - acompletion
    - embedding
    - aembedding

  # Redis cache parameters
  host: localhost # Redis server hostname or IP address
  port: "6379" # Redis server port (as a string)
  password: secret_password # Redis server password

  # S3 cache parameters
  s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket
  s3_region_name: us-west-2 # AWS region of the S3 bucket
  s3_api_version: 2006-03-01 # AWS S3 API version
  s3_use_ssl: true # Use SSL for S3 connections (options: true, false)
  s3_verify: true # SSL certificate verification for S3 connections (options: true, false)
  s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL
  s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3
  s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3
  s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials

```

@@ -251,7 +251,7 @@ s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for h

1. Install Proxy dependencies
```bash
$ pip install litellm[proxy] litellm[extra_proxy]
$ pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```

2. Save Azure details in your environment

@@ -19,12 +19,6 @@ LiteLLM Server manages:

View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)

```shell
$ pip install litellm[proxy]
```

If this fails try running

```shell
$ pip install 'litellm[proxy]'
```
@@ -190,6 +184,13 @@ $ export OPENAI_API_KEY=my-api-key
```shell
$ litellm --model gpt-3.5-turbo
```
</TabItem>
<TabItem value="ollama" label="Ollama">

```shell
$ litellm --model ollama/<ollama-model-name>
```

</TabItem>
<TabItem value="openai-proxy" label="OpenAI Compatible Endpoint">

docs/my-website/docs/proxy/rules.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# Post-Call Rules

Use this to fail a request based on the output of an LLM API call.

## Quick Start

### Step 1: Create a file (e.g. post_call_rules.py)

```python
def my_custom_rule(input): # receives the model response
    if len(input) < 5: # trigger fallback if the model response is too short
        return False
    return True
```

### Step 2. Point it to your proxy

```yaml
litellm_settings:
  post_call_rules: post_call_rules.my_custom_rule
  num_retries: 3
```

### Step 3. Start + test your proxy

```bash
$ litellm --config /path/to/config.yaml
```

```bash
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
  --header 'Content-Type: application/json' \
  --header 'Authorization: Bearer sk-1234' \
  --data '{
    "model": "deepseek-coder",
    "messages": [{"role":"user","content":"What llm are you?"}],
    "temperature": 0.7,
    "max_tokens": 10
  }'
```
---

This will now check whether the response is longer than 5 characters; if it isn't, the proxy retries the call 3 times before failing.
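For intuition, here is the Step 1 rule applied directly to two made-up response strings (a sketch; it assumes `post_call_rules.py` is importable from the current directory, and the proxy passes the model's text output as `input`):

```python
from post_call_rules import my_custom_rule  # the rule file from Step 1

print(my_custom_rule("Hi."))   # False -> fewer than 5 characters, so the proxy would retry
print(my_custom_rule("I am an LLM served via the proxy."))  # True -> response is returned as-is
```
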
@@ -13,7 +13,7 @@ Docs outdated. New docs 👉 [here](./simple_proxy)

## Usage
```shell
pip install litellm[proxy]
pip install 'litellm[proxy]'
```
```shell
$ litellm --model ollama/codellama
@@ -40,7 +40,7 @@ litellm.get_secret("your-test-key")

1. Install Proxy dependencies
```bash
pip install litellm[proxy] litellm[extra_proxy]
pip install 'litellm[proxy]' 'litellm[extra_proxy]'
```

2. Save Azure details in your environment

@@ -16,7 +16,7 @@ LiteLLM Server manages:
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)

```shell
$ pip install litellm[proxy]
$ pip install 'litellm[proxy]'
```

```shell
@@ -112,6 +112,7 @@ const sidebars = {
        "proxy/reliability",
        "proxy/health",
        "proxy/call_hooks",
        "proxy/rules",
        "proxy/caching",
        "proxy/alerting",
        "proxy/logging",
@@ -167,20 +168,7 @@ const sidebars = {
        `observability/telemetry`,
      ],
    },
    {
      type: "category",
      label: "Caching",
      link: {
        type: 'generated-index',
        title: 'Providers',
        description: 'Learn how to deploy + call models from different providers on LiteLLM',
        slug: '/caching',
      },
      items: [
        "caching/local_caching",
        "caching/redis_cache",
      ],
    },
    "caching/redis_cache",
    {
      type: "category",
      label: "LangChain, LlamaIndex Integration",

@@ -375,6 +375,45 @@ response = completion(

Need a dedicated key? Email us @ krrish@berri.ai

## OpenAI Proxy

Track spend across multiple projects/people

The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)

### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)

### Quick Start Proxy - CLI

```shell
pip install 'litellm[proxy]'
```

#### Step 1: Start litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder

#INFO: Proxy running on http://0.0.0.0:8000
```

#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])

print(response)
```

## More details
* [exception mapping](./exception_mapping.md)

@@ -111,6 +111,13 @@
        "litellm_provider": "openai",
        "mode": "embedding"
    },
    "text-embedding-ada-002-v2": {
        "max_tokens": 8191,
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "openai",
        "mode": "embedding"
    },
    "256-x-256/dall-e-2": {
        "mode": "image_generation",
        "input_cost_per_pixel": 0.00000024414,
@@ -242,6 +249,13 @@
        "litellm_provider": "azure",
        "mode": "chat"
    },
    "azure/ada": {
        "max_tokens": 8191,
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "azure",
        "mode": "embedding"
    },
    "azure/text-embedding-ada-002": {
        "max_tokens": 8191,
        "input_cost_per_token": 0.0000001,
@@ -1630,4 +1644,4 @@
        "mode": "embedding"
    }

}
}
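As a rough illustration of how the per-token prices above are applied, here is a sketch that assumes cost is simply tokens multiplied by the cost-per-token fields, using the numbers from the `azure/ada` entry added in this diff:

```python
# price fields from the "azure/ada" entry above
input_cost_per_token = 0.0000001
output_cost_per_token = 0.000000

prompt_tokens = 8191        # the model's max_tokens, used here as an example input size
completion_tokens = 0       # embedding models return no completion tokens

cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.7f}")  # $0.0008191
```
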