Control fallback prompts client-side (#7334)
* feat(router.py): support passing model-specific messages in fallbacks
* docs(routing.md): separate router timeouts into a separate doc; allows for 1 fallbacks doc (across proxy/router)
* docs(routing.md): cleanup router docs
* docs(reliability.md): cleanup docs
* docs(reliability.md): cleaned up fallback doc - just have 1 doc across sdk/proxy, simplifies docs
* docs(reliability.md): add setting model-specific fallback prompts
* fix: fix linting errors
* test: skip test causing openai rate limit errors
* test: fix test
* test: run vertex test first to catch error
This commit is contained in: parent 495b009a22, commit e6bdec4eed
12 changed files with 861 additions and 553 deletions
|
@ -1,4 +1,11 @@
|
|||
# Region-based Routing
|
||||
# [DEPRECATED] Region-based Routing
|
||||
|
||||
:::info
|
||||
|
||||
This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead
|
||||
|
||||
:::
|
||||
|
||||
|
||||
Route specific customers to eu-only models.
|
||||
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Proxy - Load Balancing
|
||||
Load balance multiple instances of the same model
|
||||
|
||||
|
@ -10,6 +13,159 @@ For more details on routing strategies / params, see [Routing](../routing.md)
|
|||
|
||||
:::
|
||||
|
||||
## Quick Start - Load Balancing
|
||||
#### Step 1 - Set deployments on config
|
||||
|
||||
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/<your-deployment-name>
|
||||
api_base: <your-azure-endpoint>
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-ca
|
||||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 6
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-large
|
||||
api_base: https://openai-france-1234.openai.azure.com/
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 1440
|
||||
|
||||
router_settings:
|
||||
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
|
||||
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
|
||||
num_retries: 2
|
||||
timeout: 30 # 30 seconds
|
||||
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
|
||||
redis_password: <your redis password>
|
||||
redis_port: 1992
|
||||
```
|
||||
|
||||
:::info
|
||||
Detailed information about [routing strategies can be found here](../routing)
|
||||
:::
|
||||
|
||||
#### Step 2: Start Proxy with config
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Test - Simple Call
|
||||
|
||||
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
|
||||
|
||||
👉 Key Change: `model="gpt-3.5-turbo"`
|
||||
|
||||
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
|
||||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="openai" label="OpenAI Python v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
import os
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "anything"
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:4000",
|
||||
model="gpt-3.5-turbo",
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
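
To confirm which deployment served a request, the `model_id` response header mentioned above can be read from Python via the OpenAI client's `with_raw_response` wrapper. A minimal sketch - the `x-litellm-model-id` header name is an assumption and may differ by proxy version:

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

# with_raw_response exposes the raw HTTP response, including headers set by the proxy
raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "which deployment served this?"}],
)

print(raw.headers.get("x-litellm-model-id"))  # deployment id (assumed header name)
print(raw.parse())  # the regular ChatCompletion object
```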
|
||||
|
||||
|
||||
### Test - Load Balancing
|
||||
|
||||
In this request, the following will occur:
|
||||
1. A rate limit exception will be raised
|
||||
2. LiteLLM proxy will retry the request on the model group (default is 3).
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hi there!"}
|
||||
],
|
||||
"mock_testing_rate_limit_error": true
|
||||
}'
|
||||
```
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
|
||||
|
||||
|
||||
## Load Balancing using multiple litellm instances (Kubernetes, Auto Scaling)
|
||||
|
||||
LiteLLM Proxy supports sharing rpm/tpm limits across multiple litellm instances. Pass `redis_host`, `redis_password` and `redis_port` to enable this (LiteLLM uses Redis to track rpm/tpm usage).
|
||||
|
|
|
@ -2,15 +2,61 @@ import Image from '@theme/IdealImage';
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Proxy - Fallbacks, Retries
|
||||
# Fallbacks
|
||||
|
||||
- Quick Start [load balancing](#test---load-balancing)
|
||||
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
|
||||
If a call fails after num_retries, fall back to another model group.
|
||||
|
||||
- Quick Start [load balancing](./load_balancing.md)
|
||||
- Quick Start [client side fallbacks](#client-side-fallbacks)
|
||||
|
||||
|
||||
Fallbacks are typically done from one `model_name` to another `model_name`.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Setup fallbacks
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/<your-deployment-name>",
|
||||
"api_base": "<your-azure-endpoint>",
|
||||
"api_key": "<your-azure-api-key>",
|
||||
"rpm": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-4",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-4-ca",
|
||||
"api_base": "https://my-endpoint-canada-berri992.openai.azure.com/",
|
||||
"api_key": "<your-azure-api-key>",
|
||||
"rpm": 6
|
||||
}
|
||||
}
|
||||
],
|
||||
fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}] # 👈 KEY CHANGE
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
## Quick Start - Load Balancing
|
||||
#### Step 1 - Set deployments on config
|
||||
|
||||
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
|
@ -19,147 +65,93 @@ model_list:
|
|||
api_base: <your-azure-endpoint>
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
||||
- model_name: gpt-3.5-turbo
|
||||
- model_name: gpt-4
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-ca
|
||||
model: azure/gpt-4-ca
|
||||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 6
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-large
|
||||
api_base: https://openai-france-1234.openai.azure.com/
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 1440
|
||||
|
||||
router_settings:
|
||||
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
|
||||
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
|
||||
num_retries: 2
|
||||
timeout: 30 # 30 seconds
|
||||
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
|
||||
redis_password: <your redis password>
|
||||
redis_port: 1992
|
||||
fallbacks: [{"gpt-3.5-turbo": ["gpt-4"]}]
|
||||
```
|
||||
|
||||
:::info
|
||||
Detailed information about [routing strategies can be found here](../routing)
|
||||
:::
|
||||
|
||||
#### Step 2: Start Proxy with config
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Test - Simple Call
|
||||
|
||||
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
|
||||
|
||||
👉 Key Change: `model="gpt-3.5-turbo"`
|
||||
|
||||
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
|
||||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="openai" label="OpenAI Python v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
import os
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "anything"
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:4000",
|
||||
model="gpt-3.5-turbo",
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Test - Load Balancing
|
||||
### 2. Start Proxy
|
||||
|
||||
In this request, the following will occur:
|
||||
1. A rate limit exception will be raised
|
||||
2. LiteLLM proxy will retry the request on the model group (default is 3).
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### 3. Test Fallbacks
|
||||
|
||||
Pass `mock_testing_fallbacks=true` in the request body to trigger fallbacks.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
|
||||
```python
|
||||
|
||||
from litellm import Router
|
||||
|
||||
model_list = [{..}, {..}] # defined in Step 1.
|
||||
|
||||
router = Router(model_list=model_list, fallbacks=[{"bad-model": ["my-good-model"]}])
|
||||
|
||||
response = router.completion(
|
||||
model="bad-model",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
mock_testing_fallbacks=True,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
-D '{
|
||||
"model": "my-bad-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hi there!"}
|
||||
{
|
||||
"role": "user",
|
||||
"content": "ping"
|
||||
}
|
||||
],
|
||||
"mock_testing_rate_limit_error": true
|
||||
}'
|
||||
"mock_testing_fallbacks": true # 👈 KEY CHANGE
|
||||
}
|
||||
'
|
||||
```
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
### Explanation
|
||||
|
||||
Fallbacks are done in-order - ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"] will try 'gpt-3.5-turbo' first, then 'gpt-4', etc.
|
||||
|
||||
You can also set [`default_fallbacks`](#default-fallbacks), in case a specific model group is misconfigured / bad.
|
||||
|
||||
There are 3 types of fallbacks:
|
||||
- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
|
||||
- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
|
||||
- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
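
A minimal sketch combining these on one router (deployments as defined in Step 1; the model group names are placeholders):

```python
from litellm import Router

router = Router(
    model_list=[...],  # defined in Step 1.
    fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}],                 # all remaining errors, e.g. litellm.RateLimitError
    context_window_fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}],  # litellm.ContextWindowExceededError
    content_policy_fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}],  # litellm.ContentPolicyViolationError
    default_fallbacks=["gpt-4"],                              # used in case a specific model group is misconfigured / bad
)
```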
|
||||
|
||||
|
||||
## Client Side Fallbacks
|
||||
|
||||
For the SDK, set fallbacks in the `.completion()` call; for the proxy, pass them client-side in the request body.
|
||||
|
||||
### Test - Client Side Fallbacks
|
||||
In this request the following will occur:
|
||||
1. The request to `model="zephyr-beta"` will fail
|
||||
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
|
||||
|
@ -168,7 +160,32 @@ In this request the following will occur:
|
|||
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(model_list=[..]) # defined in Step 1.
|
||||
|
||||
resp = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
mock_testing_fallbacks=True, # 👈 trigger fallbacks
|
||||
fallbacks=[
|
||||
{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "What is LiteLLM?"}],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
print(resp)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI Python v1.0.0+">
|
||||
|
||||
```python
|
||||
|
@ -197,8 +214,6 @@ print(response)
|
|||
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
Pass `metadata` as part of the request body
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
|
@ -252,24 +267,282 @@ print(response)
|
|||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
### Control Fallback Prompts
|
||||
|
||||
<!--
|
||||
### Test it!
|
||||
Pass in messages/temperature/etc. per model in fallback (works for embedding/image generation/etc. as well).
|
||||
|
||||
Key Change:
|
||||
|
||||
```
|
||||
fallbacks = [
|
||||
{
|
||||
"model": <model_name>,
|
||||
"messages": <model-specific-messages>
|
||||
... # any other model-specific parameters
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(model_list=[..]) # defined in Step 1.
|
||||
|
||||
resp = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
mock_testing_fallbacks=True, # 👈 trigger fallbacks
|
||||
fallbacks=[
|
||||
{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "What is LiteLLM?"}],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
print(resp)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI Python v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="zephyr-beta",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
],
|
||||
extra_body={
|
||||
"fallbacks": [{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "What is LiteLLM?"}]
|
||||
}]
|
||||
}
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "user", "content": "what color is red"}
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Hi, how are you ?"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"fallbacks": [{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "What is LiteLLM?"}]
|
||||
}],
|
||||
"mock_testing_fallbacks": true
|
||||
}'
|
||||
``` -->
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
import os
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "anything"
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:4000",
|
||||
model="zephyr-beta",
|
||||
extra_body={
|
||||
"fallbacks": [{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "What is LiteLLM?"}]
|
||||
}]
|
||||
}
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Content Policy Violation Fallback
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("content filtering policy"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# context_window_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Context Window Exceeded Fallback
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("prompt is too long"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# content_policy_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
context_window_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced
|
||||
### Fallbacks + Retries + Timeouts + Cooldowns
|
||||
|
@ -684,81 +957,6 @@ print(response)
|
|||
print(f"response.headers.get('x-litellm-model-api-base')")
|
||||
```
|
||||
|
||||
### Custom Timeouts, Stream Timeouts - Per Model
|
||||
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-eu
|
||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||
api_key: <your-key>
|
||||
timeout: 0.1 # timeout in (seconds)
|
||||
stream_timeout: 0.01 # timeout for stream requests (seconds)
|
||||
max_retries: 5
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-ca
|
||||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||
api_key:
|
||||
timeout: 0.1 # timeout in (seconds)
|
||||
stream_timeout: 0.01 # timeout for stream requests (seconds)
|
||||
max_retries: 5
|
||||
|
||||
```
|
||||
|
||||
#### Start Proxy
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
|
||||
### Setting Dynamic Timeouts - Per Request
|
||||
|
||||
LiteLLM Proxy supports setting a `timeout` per request
|
||||
|
||||
**Example Usage**
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "user", "content": "what color is red"}
|
||||
],
|
||||
"logit_bias": {12481: 100},
|
||||
"timeout": 1
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
|
||||
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "user", "content": "what color is red"}
|
||||
],
|
||||
logit_bias={12481: 100},
|
||||
timeout=1
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Setting Fallbacks for Wildcard Models
|
||||
|
||||
You can set fallbacks for wildcard models (e.g. `azure/*`) in your config file.
|
||||
|
|
|
@ -1,4 +1,11 @@
|
|||
# Team-based Routing
|
||||
# [DEPRECATED] Team-based Routing
|
||||
|
||||
:::info
|
||||
|
||||
This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead
|
||||
|
||||
:::
|
||||
|
||||
|
||||
## Routing
|
||||
Route calls to different model groups based on the team-id
|
||||
|
|
docs/my-website/docs/proxy/timeout.md (new file, 178 lines)
|
@ -0,0 +1,178 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Timeouts
|
||||
|
||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||
|
||||
### Global Timeouts
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
model_list = [{...}]
|
||||
|
||||
router = Router(model_list=model_list, timeout=30)  # raise timeout error if call takes > 30s

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
timeout: 30 # sets a 30s timeout for the entire call
|
||||
```
|
||||
|
||||
**Start Proxy**
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Custom Timeouts, Stream Timeouts - Per Model
|
||||
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import asyncio
import os
|
||||
|
||||
model_list = [{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
"timeout": 300 # sets a 5 minute timeout
|
||||
"stream_timeout": 30 # sets a 30s timeout for streaming calls
|
||||
}
|
||||
}]
|
||||
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="least-busy")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-eu
|
||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||
api_key: <your-key>
|
||||
timeout: 0.1 # timeout in (seconds)
|
||||
stream_timeout: 0.01 # timeout for stream requests (seconds)
|
||||
max_retries: 5
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-turbo-small-ca
|
||||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||
api_key:
|
||||
timeout: 0.1 # timeout in (seconds)
|
||||
stream_timeout: 0.01 # timeout for stream requests (seconds)
|
||||
max_retries: 5
|
||||
|
||||
```
|
||||
|
||||
|
||||
**Start Proxy**
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Setting Dynamic Timeouts - Per Request
|
||||
|
||||
LiteLLM supports setting a `timeout` per request
|
||||
|
||||
**Example Usage**
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
model_list = [{...}]
|
||||
router = Router(model_list=model_list)
|
||||
|
||||
response = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "what color is red"}],
|
||||
timeout=1
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "user", "content": "what color is red"}
|
||||
],
|
||||
"logit_bias": {12481: 100},
|
||||
"timeout": 1
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
|
||||
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "user", "content": "what color is red"}
|
||||
],
|
||||
logit_bias={12481: 100},
|
||||
timeout=1
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
|
@ -3,7 +3,7 @@ import Tabs from '@theme/Tabs';
|
|||
import TabItem from '@theme/TabItem';
|
||||
|
||||
|
||||
# Router - Load Balancing, Fallbacks
|
||||
# Router - Load Balancing
|
||||
|
||||
LiteLLM manages:
|
||||
- Load-balance across multiple deployments (e.g. Azure/OpenAI)
|
||||
|
@ -855,52 +855,6 @@ router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈
|
|||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
|
||||
|
||||
### Timeouts
|
||||
|
||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||
|
||||
**Global Timeouts**
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
model_list = [{...}]
|
||||
|
||||
router = Router(model_list=model_list,
|
||||
timeout=30) # raise timeout error if call takes > 30s
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
**Timeouts per model**
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import asyncio
|
||||
|
||||
model_list = [{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
"timeout": 300 # sets a 5 minute timeout
|
||||
"stream_timeout": 30 # sets a 30s timeout for streaming calls
|
||||
}
|
||||
}]
|
||||
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="least-busy")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
```
|
||||
### Cooldowns
|
||||
|
||||
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
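
A minimal sketch of the relevant settings (values are illustrative):

```python
from litellm import Router

model_list = [{...}]

router = Router(
    model_list=model_list,
    allowed_fails=3,   # a deployment may fail 3 times in a minute...
    cooldown_time=60,  # ...before being cooled down, here for 60 seconds
)
```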
|
||||
|
@ -1125,248 +1079,6 @@ router_settings:
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Fallbacks
|
||||
|
||||
If a call fails after num_retries, fall back to another model group.
|
||||
|
||||
#### Quick Start
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
router = Router(
|
||||
model_list=[
|
||||
{ # bad model
|
||||
"model_name": "bad-model",
|
||||
"litellm_params": {
|
||||
"model": "openai/my-bad-model",
|
||||
"api_key": "my-bad-api-key",
|
||||
"mock_response": "Bad call"
|
||||
},
|
||||
},
|
||||
{ # good model
|
||||
"model_name": "my-good-model",
|
||||
"litellm_params": {
|
||||
"model": "gpt-4o",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
"mock_response": "Good call"
|
||||
},
|
||||
},
|
||||
],
|
||||
fallbacks=[{"bad-model": ["my-good-model"]}] # 👈 KEY CHANGE
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="bad-model",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
mock_testing_fallbacks=True,
|
||||
)
|
||||
```
|
||||
|
||||
If the error is a context window exceeded error, fall back to a larger model group (if given).
|
||||
|
||||
Fallbacks are done in-order - ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"] will try 'gpt-3.5-turbo' first, then 'gpt-4', etc.
|
||||
|
||||
You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad.
|
||||
|
||||
There are 3 types of fallbacks:
|
||||
- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
|
||||
- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
|
||||
- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
|
||||
|
||||
**Content Policy Violation Fallback**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("content filtering policy"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# context_window_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Context Window Exceeded Fallback**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("prompt is too long"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# content_policy_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
context_window_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Regular Fallbacks**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("this is a rate limit error"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# context_window_fallbacks=[..], # [OPTIONAL]
|
||||
# content_policy_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Caching
|
||||
|
||||
In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
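
A minimal Redis-backed sketch (host and credentials are placeholders); dropping the `redis_*` args and keeping `cache_responses=True` uses the in-memory cache instead:

```python
from litellm import Router

model_list = [{...}]

router = Router(
    model_list=model_list,
    redis_host="my-redis-host",
    redis_port=6379,
    redis_password="my-redis-password",
    cache_responses=True,  # cache LLM responses (Redis here, in-memory if no redis_* args are given)
)
```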
|
||||
|
@ -1808,48 +1520,6 @@ response = router.completion(
|
|||
If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
|
||||
|
||||
|
||||
## Init Params for the litellm.Router
|
||||
|
||||
```python
|
||||
def __init__(
|
||||
model_list: Optional[list] = None,
|
||||
|
||||
## CACHING ##
|
||||
redis_url: Optional[str] = None,
|
||||
redis_host: Optional[str] = None,
|
||||
redis_port: Optional[int] = None,
|
||||
redis_password: Optional[str] = None,
|
||||
cache_responses: Optional[bool] = False,
|
||||
cache_kwargs: dict = {}, # additional kwargs to pass to RedisCache (see caching.py)
|
||||
caching_groups: Optional[
|
||||
List[tuple]
|
||||
] = None, # if you want to cache across model groups
|
||||
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
|
||||
|
||||
## RELIABILITY ##
|
||||
num_retries: int = 0,
|
||||
timeout: Optional[float] = None,
|
||||
default_litellm_params={}, # default params for Router.chat.completion.create
|
||||
fallbacks: Optional[List] = None,
|
||||
default_fallbacks: Optional[List] = None
|
||||
allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
|
||||
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
|
||||
context_window_fallbacks: Optional[List] = None,
|
||||
model_group_alias: Optional[dict] = {},
|
||||
retry_after: int = 0, # (min) time to wait before retrying a failed request
|
||||
routing_strategy: Literal[
|
||||
"simple-shuffle",
|
||||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
"cost-based-routing",
|
||||
] = "simple-shuffle",
|
||||
|
||||
## DEBUGGING ##
|
||||
set_verbose: bool = False, # set this to True for seeing logs
|
||||
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
|
||||
):
|
||||
```
|
||||
|
||||
## Debugging Router
|
||||
### Basic Debugging
|
||||
|
|
|
@ -290,7 +290,7 @@ const sidebars = {
|
|||
description: "Learn how to load balance, route, and set fallbacks for your LLM requests",
|
||||
slug: "/routing-load-balancing",
|
||||
},
|
||||
items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"],
|
||||
items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/timeout", "proxy/tag_routing", "proxy/provider_budget_routing", "wildcard_routing"],
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
|
@ -395,6 +395,8 @@ const sidebars = {
|
|||
"proxy/pii_masking",
|
||||
"extras/code_quality",
|
||||
"rules",
|
||||
"proxy/team_based_routing",
|
||||
"proxy/customer_routing",
|
||||
"proxy_server",
|
||||
{
|
||||
type: "category",
|
||||
|
|
|
@ -69,6 +69,7 @@ from litellm.router_utils.cooldown_handlers import (
|
|||
_set_cooldown_deployments,
|
||||
)
|
||||
from litellm.router_utils.fallback_event_handlers import (
|
||||
_check_non_standard_fallback_format,
|
||||
get_fallback_model_group,
|
||||
run_async_fallback,
|
||||
)
|
||||
|
@ -2647,6 +2648,27 @@ class Router:
|
|||
|
||||
try:
|
||||
verbose_router_logger.info("Trying to fallback b/w models")
|
||||
|
||||
# check if client-side fallbacks are used (e.g. fallbacks=["gpt-3.5-turbo", "claude-3-haiku"] or fallbacks=[{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}])
|
||||
is_non_standard_fallback_format = _check_non_standard_fallback_format(
|
||||
fallbacks=fallbacks
|
||||
)
|
||||
|
||||
if is_non_standard_fallback_format:
|
||||
input_kwargs.update(
|
||||
{
|
||||
"fallback_model_group": fallbacks,
|
||||
"original_model_group": original_model_group,
|
||||
}
|
||||
)
|
||||
|
||||
response = await run_async_fallback(
|
||||
*args,
|
||||
**input_kwargs,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
if isinstance(e, litellm.ContextWindowExceededError):
|
||||
if context_window_fallbacks is not None:
|
||||
fallback_model_group: Optional[List[str]] = (
|
||||
|
@ -2722,7 +2744,7 @@ class Router:
|
|||
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
|
||||
fallback_model_group, generic_fallback_idx = (
|
||||
get_fallback_model_group(
|
||||
fallbacks=fallbacks,
|
||||
fallbacks=fallbacks, # if fallbacks = [{"gpt-3.5-turbo": ["claude-3-haiku"]}]
|
||||
model_group=cast(str, model_group),
|
||||
)
|
||||
)
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_router_logger
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm.types.router import LiteLLMParamsTypedDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.router import Router as _Router
|
||||
|
@ -67,7 +68,7 @@ def get_fallback_model_group(
|
|||
elif list(item.keys())[0] == "*": # check generic fallback
|
||||
generic_fallback_idx = idx
|
||||
elif isinstance(item, str):
|
||||
fallback_model_group = [fallbacks.pop(idx)]
|
||||
fallback_model_group = [fallbacks.pop(idx)] # returns single-item list
|
||||
## if none, check for generic fallback
|
||||
if fallback_model_group is None:
|
||||
if stripped_model_fallback is not None:
|
||||
|
@ -122,9 +123,12 @@ async def run_async_fallback(
|
|||
# LOGGING
|
||||
kwargs = litellm_router.log_retry(kwargs=kwargs, e=original_exception)
|
||||
verbose_router_logger.info(f"Falling back to model_group = {mg}")
|
||||
if isinstance(mg, str):
|
||||
kwargs["model"] = mg
|
||||
elif isinstance(mg, dict):
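# a dict entry is a client-side fallback carrying model-specific params (model, messages, etc.) - merge them into the call kwargs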
|
||||
kwargs.update(mg)
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{"model_group": mg}
|
||||
{"model_group": kwargs.get("model", None)}
|
||||
) # update model_group used, if fallbacks are done
|
||||
kwargs["fallback_depth"] = fallback_depth + 1
|
||||
kwargs["max_fallbacks"] = max_fallbacks
|
||||
|
@ -310,3 +314,31 @@ async def log_failure_fallback_event(
|
|||
verbose_router_logger.error(
|
||||
f"Error in log_failure_fallback_event: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
def _check_non_standard_fallback_format(fallbacks: Optional[List[Any]]) -> bool:
|
||||
"""
|
||||
Checks if the fallbacks list is a list of strings or a list of dictionaries.
|
||||
|
||||
Returns True if fallbacks is non-standard, i.e. either:
|
||||
- List[str]: e.g. ["claude-3-haiku", "openai/o-1"]
|
||||
- List[Dict[<LiteLLMParamsTypedDict>, Any]]: e.g. [{"model": "claude-3-haiku", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}]
|
||||
|
||||
If [{"gpt-3.5-turbo": ["claude-3-haiku"]}] then standard format.
|
||||
"""
|
||||
if fallbacks is None or not isinstance(fallbacks, list) or len(fallbacks) == 0:
|
||||
return False
|
||||
if all(isinstance(item, str) for item in fallbacks):
|
||||
return True
|
||||
elif all(isinstance(item, dict) for item in fallbacks):
|
||||
for key in LiteLLMParamsTypedDict.__annotations__.keys():
|
||||
if key in fallbacks[0].keys():
|
||||
return True
|
||||
|
||||
return False
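
For illustration only (not part of this diff), the helper distinguishes the formats described in its docstring like so:

```python
# assumed import path, matching the one used in router.py above
from litellm.router_utils.fallback_event_handlers import _check_non_standard_fallback_format

assert _check_non_standard_fallback_format(["claude-3-haiku", "openai/o-1"]) is True               # list of model names
assert _check_non_standard_fallback_format([{"model": "claude-3-haiku", "messages": []}]) is True  # list of litellm param dicts
assert _check_non_standard_fallback_format([{"gpt-3.5-turbo": ["claude-3-haiku"]}]) is False       # standard mapping format
```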
|
||||
|
||||
|
||||
def run_non_standard_fallback_format(
|
||||
fallbacks: Union[List[str], List[Dict[str, Any]]], model_group: str
|
||||
):
|
||||
pass
|
||||
|
|
|
@ -281,6 +281,7 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
|
|||
}
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="causes openai rate limit errors")
|
||||
def test_audio_speech_cost_calc():
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
|
|
@ -1567,3 +1567,38 @@ def test_get_fallback_model_group():
|
|||
}
|
||||
fallback_model_group, _ = get_fallback_model_group(**args)
|
||||
assert fallback_model_group == ["claude-3-haiku"]
|
||||
|
||||
|
||||
def test_fallbacks_with_different_messages():
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "claude-3-haiku",
|
||||
"litellm_params": {
|
||||
"model": "claude-3-haiku-20240307",
|
||||
"api_key": os.getenv("ANTHROPIC_API_KEY"),
|
||||
},
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
resp = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
mock_testing_fallbacks=True,
|
||||
fallbacks=[
|
||||
{
|
||||
"model": "claude-3-haiku",
|
||||
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
print(resp)
|
||||
|
|
|
@ -3922,7 +3922,7 @@ def test_unit_test_perplexity_citations_chunk():
|
|||
],
|
||||
)
|
||||
@pytest.mark.flaky(retries=3, delay=1)
|
||||
def test_streaming_tool_calls_valid_json_str(model):
|
||||
def test_aastreaming_tool_calls_valid_json_str(model):
|
||||
if "vertex_ai" in model:
|
||||
from test_amazing_vertex_completion import (
|
||||
load_vertex_ai_credentials,
|
||||
|
|