Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 11:14:04 +00:00)
Control fallback prompts client-side (#7334)
* feat(router.py): support passing model-specific messages in fallbacks
* docs(routing.md): separate router timeouts into a separate doc; allows for one fallbacks doc (across proxy/router)
* docs(routing.md): clean up router docs
* docs(reliability.md): clean up docs
* docs(reliability.md): cleaned up fallback doc; just one doc across SDK/proxy, simplifies docs
* docs(reliability.md): add setting model-specific fallback prompts
* fix: fix linting errors
* test: skip test causing OpenAI rate limit errors
* test: fix test
* test: run Vertex test first to catch error
This commit is contained in: parent 495b009a22, commit e6bdec4eed
12 changed files with 861 additions and 553 deletions
@ -1,4 +1,11 @@
-# Region-based Routing
+# [DEPRECATED] Region-based Routing
+
+:::info
+
+This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead
+
+:::
 
 Route specific customers to eu-only models.
@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Proxy - Load Balancing
 
 Load balance multiple instances of the same model
@ -10,6 +13,159 @@ For more details on routing strategies / params, see [Routing](../routing.md)
 
 :::
 
+## Quick Start - Load Balancing
+#### Step 1 - Set deployments on config
+
+**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/<your-deployment-name>
+      api_base: <your-azure-endpoint>
+      api_key: <your-azure-api-key>
+      rpm: 6      # Rate limit for this deployment: in requests per minute (rpm)
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-ca
+      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+      api_key: <your-azure-api-key>
+      rpm: 6
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-large
+      api_base: https://openai-france-1234.openai.azure.com/
+      api_key: <your-azure-api-key>
+      rpm: 1440
+
+router_settings:
+  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
+  num_retries: 2
+  timeout: 30                      # 30 seconds
+  redis_host: <your redis host>    # set this when using multiple litellm proxy deployments, load balancing state stored in redis
+  redis_password: <your redis password>
+  redis_port: 1992
+```
+
+:::info
+Detailed information about [routing strategies can be found here](../routing)
+:::
+
+#### Step 2: Start Proxy with config
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+### Test - Simple Call
+
+Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
+
+👉 Key Change: `model="gpt-3.5-turbo"`
+
+**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ]
+)
+
+print(response)
+```
+</TabItem>
+
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+            "role": "user",
+            "content": "what llm are you"
+        }
+    ]
+}'
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+import os
+
+os.environ["OPENAI_API_KEY"] = "anything"
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000",
+    model="gpt-3.5-turbo",
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+
+</TabItem>
+
+</Tabs>
+
+### Test - Loadbalancing
+
+In this request, the following will occur:
+1. A rate limit exception will be raised
+2. LiteLLM proxy will retry the request on the model group (default is 3).
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {"role": "user", "content": "Hi there!"}
+    ],
+    "mock_testing_rate_limit_error": true
+}'
+```
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
+
 ## Load Balancing using multiple litellm instances (Kubernetes, Auto Scaling)
 
 LiteLLM Proxy supports sharing rpm/tpm across multiple litellm instances, pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage)
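The context lines above describe sharing rpm/tpm state across multiple litellm instances via Redis. As a side note (not part of this commit's diff), here is a minimal SDK-side sketch of that setup, assuming the `redis_host` / `redis_password` / `redis_port` Router params documented elsewhere in these docs; the host, password and port values are placeholders:

```python
from litellm import Router

# Sketch only (not part of this commit): share rpm/tpm usage state across
# multiple litellm instances by pointing the Router at a shared Redis.
# The redis_* params mirror the router_settings keys shown in the config above.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/<your-deployment-name>",
            "api_base": "<your-azure-endpoint>",
            "api_key": "<your-azure-api-key>",
            "rpm": 6,
        },
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="usage-based-routing",  # an rpm/tpm-aware strategy
    redis_host="<your redis host>",
    redis_password="<your redis password>",
    redis_port=1992,
)
```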
@ -2,15 +2,61 @@ import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-# Proxy - Fallbacks, Retries
+# Fallbacks
 
-- Quick Start [load balancing](#test---load-balancing)
-- Quick Start [client side fallbacks](#test---client-side-fallbacks)
+If a call fails after num_retries, fallback to another model group.
+
+- Quick Start [load balancing](./load_balancing.md)
+- Quick Start [client side fallbacks](#client-side-fallbacks)
+
+Fallbacks are typically done from one `model_name` to another `model_name`.
+
+## Quick Start
+
+### 1. Setup fallbacks
+
+Key change:
+
+```python
+fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}]
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+router = Router(
+    model_list=[
+        {
+            "model_name": "gpt-3.5-turbo",
+            "litellm_params": {
+                "model": "azure/<your-deployment-name>",
+                "api_base": "<your-azure-endpoint>",
+                "api_key": "<your-azure-api-key>",
+                "rpm": 6
+            }
+        },
+        {
+            "model_name": "gpt-4",
+            "litellm_params": {
+                "model": "azure/gpt-4-ca",
+                "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/",
+                "api_key": "<your-azure-api-key>",
+                "rpm": 6
+            }
+        }
+    ],
+    fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}] # 👈 KEY CHANGE
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
-## Quick Start - Load Balancing
-#### Step 1 - Set deployments on config
-
-**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
-
 ```yaml
 model_list:
   - model_name: gpt-3.5-turbo
@ -19,147 +65,93 @@ model_list:
       api_base: <your-azure-endpoint>
       api_key: <your-azure-api-key>
       rpm: 6      # Rate limit for this deployment: in requests per minute (rpm)
-  - model_name: gpt-3.5-turbo
+  - model_name: gpt-4
     litellm_params:
-      model: azure/gpt-turbo-small-ca
+      model: azure/gpt-4-ca
       api_base: https://my-endpoint-canada-berri992.openai.azure.com/
       api_key: <your-azure-api-key>
       rpm: 6
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: azure/gpt-turbo-large
-      api_base: https://openai-france-1234.openai.azure.com/
-      api_key: <your-azure-api-key>
-      rpm: 1440
 
 router_settings:
-  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
-  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
-  num_retries: 2
-  timeout: 30                      # 30 seconds
-  redis_host: <your redis host>    # set this when using multiple litellm proxy deployments, load balancing state stored in redis
-  redis_password: <your redis password>
-  redis_port: 1992
+  fallbacks: [{"gpt-3.5-turbo": ["gpt-4"]}]
 ```
-[removed here: the old ":::info" routing-strategies note, "Step 2: Start Proxy with config", and the "Test - Simple Call" (OpenAI / Curl / Langchain tabs) walkthrough, verbatim duplicates of the content now in the Load Balancing doc diff above]
 
 </TabItem>
 
 </Tabs>
 
-### Test - Loadbalancing
-
-In this request, the following will occur:
-1. A rate limit exception will be raised
-2. LiteLLM proxy will retry the request on the model group (default is 3).
+### 2. Start Proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+### 3. Test Fallbacks
+
+Pass `mock_testing_fallbacks=true` in request body, to trigger fallbacks.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+model_list = [{..}, {..}] # defined in Step 1.
+
+router = Router(model_list=model_list, fallbacks=[{"bad-model": ["my-good-model"]}])
+
+response = router.completion(
+    model="bad-model",
+    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    mock_testing_fallbacks=True,
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
 
 ```bash
 curl -X POST 'http://0.0.0.0:4000/chat/completions' \
 -H 'Content-Type: application/json' \
 -H 'Authorization: Bearer sk-1234' \
 -d '{
-    "model": "gpt-3.5-turbo",
+    "model": "my-bad-model",
     "messages": [
-        {"role": "user", "content": "Hi there!"}
+        {
+            "role": "user",
+            "content": "ping"
+        }
     ],
-    "mock_testing_rate_limit_error": true
-}'
+    "mock_testing_fallbacks": true # 👈 KEY CHANGE
+}
+'
 ```
 
-[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
+</TabItem>
+</Tabs>
+
+### Explanation
+
+Fallbacks are done in-order - ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
+
+You can also set [`default_fallbacks`](#default-fallbacks), in case a specific model group is misconfigured / bad.
+
+There are 3 types of fallbacks:
+- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
+- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
+- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
+
+## Client Side Fallbacks
+
+Set fallbacks in the `.completion()` call for SDK and client-side for proxy.
+
-### Test - Client Side Fallbacks
 In this request the following will occur:
 1. The request to `model="zephyr-beta"` will fail
 2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
@ -168,7 +160,32 @@ In this request the following will occur:
 👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
 
 <Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+router = Router(model_list=[..]) # defined in Step 1.
+
+resp = router.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    mock_testing_fallbacks=True, # 👈 trigger fallbacks
+    fallbacks=[
+        {
+            "model": "claude-3-haiku",
+            "messages": [{"role": "user", "content": "What is LiteLLM?"}],
+        }
+    ],
+)
+
+print(resp)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+<Tabs>
 <TabItem value="openai" label="OpenAI Python v1.0.0+">
 
 ```python
@ -197,8 +214,6 @@ print(response)
 
 <TabItem value="Curl" label="Curl Request">
 
-Pass `metadata` as part of the request body
-
 ```shell
 curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
@ -252,24 +267,282 @@ print(response)
 </TabItem>
 
 </Tabs>
+</TabItem>
+
+</Tabs>
+
+### Control Fallback Prompts
+
+Pass in messages/temperature/etc. per model in fallback (works for embedding/image generation/etc. as well).
-<!--
-### Test it!
-
-```bash
-curl --location 'http://0.0.0.0:4000/chat/completions' \
---header 'Content-Type: application/json' \
---data-raw '{
-    "model": "zephyr-beta", # 👈 MODEL NAME to fallback from
-    "messages": [
-        {"role": "user", "content": "what color is red"}
-    ],
-    "mock_testing_fallbacks": true
-}'
-``` -->
+
+Key Change:
+
+```
+fallbacks = [
+    {
+        "model": <model_name>,
+        "messages": <model-specific-messages>
+        ... # any other model-specific parameters
+    }
+]
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+router = Router(model_list=[..]) # defined in Step 1.
+
+resp = router.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    mock_testing_fallbacks=True, # 👈 trigger fallbacks
+    fallbacks=[
+        {
+            "model": "claude-3-haiku",
+            "messages": [{"role": "user", "content": "What is LiteLLM?"}],
+        }
+    ],
+)
+
+print(resp)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+response = client.chat.completions.create(
+    model="zephyr-beta",
+    messages = [
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ],
+    extra_body={
+        "fallbacks": [{
+            "model": "claude-3-haiku",
+            "messages": [{"role": "user", "content": "What is LiteLLM?"}]
+        }]
+    }
+)
+
+print(response)
+```
+</TabItem>
+
+<TabItem value="Curl" label="Curl Request">
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Hi, how are you ?"
+                }
+            ]
+        }
+    ],
+    "fallbacks": [{
+        "model": "claude-3-haiku",
+        "messages": [{"role": "user", "content": "What is LiteLLM?"}]
+    }],
+    "mock_testing_fallbacks": true
+}'
+```
+
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+import os
+
+os.environ["OPENAI_API_KEY"] = "anything"
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000",
+    model="zephyr-beta",
+    extra_body={
+        "fallbacks": [{
+            "model": "claude-3-haiku",
+            "messages": [{"role": "user", "content": "What is LiteLLM?"}]
+        }]
+    }
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+
+</TabItem>
+
+</Tabs>
+
+</TabItem>
+</Tabs>
+
+## Content Policy Violation Fallback
+
+Key change:
+
+```python
+content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+router = Router(
+    model_list=[
+        {
+            "model_name": "claude-2",
+            "litellm_params": {
+                "model": "claude-2",
+                "api_key": "",
+                "mock_response": Exception("content filtering policy"),
+            },
+        },
+        {
+            "model_name": "my-fallback-model",
+            "litellm_params": {
+                "model": "claude-2",
+                "api_key": "",
+                "mock_response": "This works!",
+            },
+        },
+    ],
+    content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
+    # fallbacks=[..], # [OPTIONAL]
+    # context_window_fallbacks=[..], # [OPTIONAL]
+)
+
+response = router.completion(
+    model="claude-2",
+    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+In your proxy config.yaml just add this line 👇
+
+```yaml
+router_settings:
+  content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
+```
+
+Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+</TabItem>
+</Tabs>
+
+## Context Window Exceeded Fallback
+
+Key change:
+
+```python
+context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+router = Router(
+    model_list=[
+        {
+            "model_name": "claude-2",
+            "litellm_params": {
+                "model": "claude-2",
+                "api_key": "",
+                "mock_response": Exception("prompt is too long"),
+            },
+        },
+        {
+            "model_name": "my-fallback-model",
+            "litellm_params": {
+                "model": "claude-2",
+                "api_key": "",
+                "mock_response": "This works!",
+            },
+        },
+    ],
+    context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
+    # fallbacks=[..], # [OPTIONAL]
+    # content_policy_fallbacks=[..], # [OPTIONAL]
+)
+
+response = router.completion(
+    model="claude-2",
+    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+In your proxy config.yaml just add this line 👇
+
+```yaml
+router_settings:
+  context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
+```
+
+Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+</TabItem>
+</Tabs>
+
 ## Advanced
 ### Fallbacks + Retries + Timeouts + Cooldowns
@ -684,81 +957,6 @@ print(response)
 print(f"response.headers.get('x-litellm-model-api-base')")
 ```
 
-[removed here: the "### Custom Timeouts, Stream Timeouts - Per Model" and "### Setting Dynamic Timeouts - Per Request" sections, verbatim duplicates of the new docs/my-website/docs/proxy/timeout.md file added below]
 
 ### Setting Fallbacks for Wildcard Models
 
 You can set fallbacks for wildcard models (e.g. `azure/*`) in your config file.
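For reference (not part of this commit's diff), the three fallback types documented above can be combined on a single Router. A minimal sketch, reusing the `claude-2` / `my-fallback-model` deployment names and the `mock_response` pattern from the doc examples:

```python
from litellm import Router

# Sketch only (not part of this commit): configuring all three fallback types
# from the docs above on one Router. Names and mock responses reuse the doc examples.
router = Router(
    model_list=[
        {"model_name": "claude-2", "litellm_params": {"model": "claude-2", "api_key": "", "mock_response": "This works!"}},
        {"model_name": "my-fallback-model", "litellm_params": {"model": "claude-2", "api_key": "", "mock_response": "This works!"}},
    ],
    fallbacks=[{"claude-2": ["my-fallback-model"]}],                 # all remaining errors, e.g. litellm.RateLimitError
    context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}],  # litellm.ContextWindowExceededError
    content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}],  # litellm.ContentPolicyViolationError
)

response = router.completion(
    model="claude-2",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
```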
@ -1,4 +1,11 @@
-# Team-based Routing
+# [DEPRECATED] Team-based Routing
+
+:::info
+
+This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead
+
+:::
 
 ## Routing
 Route calls to different model groups based on the team-id
docs/my-website/docs/proxy/timeout.md (new file, 178 lines)
@ -0,0 +1,178 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Timeouts
+
+The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
+
+### Global Timeouts
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+model_list = [{...}]
+
+router = Router(model_list=model_list,
+                timeout=30) # raise timeout error if call takes > 30s
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```yaml
+router_settings:
+  timeout: 30 # sets a 30s timeout for the entire call
+```
+
+**Start Proxy**
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+</TabItem>
+</Tabs>
+
+### Custom Timeouts, Stream Timeouts - Per Model
+For each model you can set `timeout` & `stream_timeout` under `litellm_params`
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+import asyncio
+
+model_list = [{
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": {
+        "model": "azure/chatgpt-v-2",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+        "timeout": 300, # sets a 5 minute timeout
+        "stream_timeout": 30 # sets a 30s timeout for streaming calls
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="least-busy")
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-eu
+      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
+      api_key: <your-key>
+      timeout: 0.1                      # timeout in (seconds)
+      stream_timeout: 0.01              # timeout for stream requests (seconds)
+      max_retries: 5
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-ca
+      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+      api_key:
+      timeout: 0.1                      # timeout in (seconds)
+      stream_timeout: 0.01              # timeout for stream requests (seconds)
+      max_retries: 5
+
+```
+
+**Start Proxy**
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+</TabItem>
+</Tabs>
+
+### Setting Dynamic Timeouts - Per Request
+
+LiteLLM supports setting a `timeout` per request
+
+**Example Usage**
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router
+
+model_list = [{...}]
+router = Router(model_list=model_list)
+
+response = router.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "what color is red"}],
+    timeout=1
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {"role": "user", "content": "what color is red"}
+    ],
+    "logit_bias": {12481: 100},
+    "timeout": 1
+}'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "user", "content": "what color is red"}
+    ],
+    logit_bias={12481: 100},
+    timeout=1
+)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+</TabItem>
+</Tabs>
@ -3,7 +3,7 @@ import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-# Router - Load Balancing, Fallbacks
+# Router - Load Balancing
 
 LiteLLM manages:
 - Load-balance across multiple deployments (e.g. Azure/OpenAI)
@ -855,52 +855,6 @@ router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈
 [**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
 
-[removed here: the "### Timeouts" section ("Global Timeouts" and "Timeouts per model" SDK examples), verbatim duplicates of the new docs/my-website/docs/proxy/timeout.md file added above]
 
 ### Cooldowns
 
 Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
@ -1125,248 +1079,6 @@ router_settings:
 </TabItem>
 </Tabs>
 
-### Fallbacks
-
-If a call fails after num_retries, fall back to another model group.
-
-#### Quick Start
-
-```python
-from litellm import Router
-router = Router(
-    model_list=[
-        { # bad model
-            "model_name": "bad-model",
-            "litellm_params": {
-                "model": "openai/my-bad-model",
-                "api_key": "my-bad-api-key",
-                "mock_response": "Bad call"
-            },
-        },
-        { # good model
-            "model_name": "my-good-model",
-            "litellm_params": {
-                "model": "gpt-4o",
-                "api_key": os.getenv("OPENAI_API_KEY"),
-                "mock_response": "Good call"
-            },
-        },
-    ],
-    fallbacks=[{"bad-model": ["my-good-model"]}] # 👈 KEY CHANGE
-)
-
-response = router.completion(
-    model="bad-model",
-    messages=[{"role": "user", "content": "Hey, how's it going?"}],
-    mock_testing_fallbacks=True,
-)
-```
-
-If the error is a context window exceeded error, fall back to a larger model group (if given).
-
-Fallbacks are done in-order - ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
-
-You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad.
-
-There are 3 types of fallbacks:
-- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
-- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
-- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
-
-[removed here: the "**Content Policy Violation Fallback**" and "**Context Window Exceeded Fallback**" examples, verbatim duplicates of the new reliability.md sections reconstructed earlier in this commit]
-
-**Regular Fallbacks**
-
-Key change:
-
-```python
-fallbacks=[{"claude-2": ["my-fallback-model"]}]
-```
-
-<Tabs>
-<TabItem value="sdk" label="SDK">
-
-```python
-from litellm import Router
-
-router = Router(
-    model_list=[
-        {
-            "model_name": "claude-2",
-            "litellm_params": {
-                "model": "claude-2",
-                "api_key": "",
-                "mock_response": Exception("this is a rate limit error"),
-            },
-        },
-        {
-            "model_name": "my-fallback-model",
-            "litellm_params": {
-                "model": "claude-2",
-                "api_key": "",
-                "mock_response": "This works!",
-            },
-        },
-    ],
-    fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
-    # context_window_fallbacks=[..], # [OPTIONAL]
-    # content_policy_fallbacks=[..], # [OPTIONAL]
-)
-
-response = router.completion(
-    model="claude-2",
-    messages=[{"role": "user", "content": "Hey, how's it going?"}],
-)
-```
-</TabItem>
-<TabItem value="proxy" label="PROXY">
-
-In your proxy config.yaml just add this line 👇
-
-```yaml
-router_settings:
-  fallbacks=[{"claude-2": ["my-fallback-model"]}]
-```
-
-Start proxy
-
-```bash
-litellm --config /path/to/config.yaml
-
-# RUNNING on http://0.0.0.0:4000
-```
-
-</TabItem>
-</Tabs>
-
 ### Caching
 
 In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
@ -1808,48 +1520,6 @@ response = router.completion(
 If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
 
-## Init Params for the litellm.Router
-
-```python
-def __init__(
-    model_list: Optional[list] = None,
-
-    ## CACHING ##
-    redis_url: Optional[str] = None,
-    redis_host: Optional[str] = None,
-    redis_port: Optional[int] = None,
-    redis_password: Optional[str] = None,
-    cache_responses: Optional[bool] = False,
-    cache_kwargs: dict = {},  # additional kwargs to pass to RedisCache (see caching.py)
-    caching_groups: Optional[
-        List[tuple]
-    ] = None,  # if you want to cache across model groups
-    client_ttl: int = 3600,  # ttl for cached clients - will re-initialize after this time in seconds
-
-    ## RELIABILITY ##
-    num_retries: int = 0,
-    timeout: Optional[float] = None,
-    default_litellm_params={},  # default params for Router.chat.completion.create
-    fallbacks: Optional[List] = None,
-    default_fallbacks: Optional[List] = None,
-    allowed_fails: Optional[int] = None,  # Number of times a deployment can fail before being added to cooldown
-    cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
-    context_window_fallbacks: Optional[List] = None,
-    model_group_alias: Optional[dict] = {},
-    retry_after: int = 0,  # (min) time to wait before retrying a failed request
-    routing_strategy: Literal[
-        "simple-shuffle",
-        "least-busy",
-        "usage-based-routing",
-        "latency-based-routing",
-        "cost-based-routing",
-    ] = "simple-shuffle",
-
-    ## DEBUGGING ##
-    set_verbose: bool = False,  # set this to True for seeing logs
-    debug_level: Literal["DEBUG", "INFO"] = "INFO",  # set this to "DEBUG" for detailed debugging
-):
-```
-
 ## Debugging Router
 ### Basic Debugging
@ -290,7 +290,7 @@ const sidebars = {
       description: "Learn how to load balance, route, and set fallbacks for your LLM requests",
       slug: "/routing-load-balancing",
     },
-    items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"],
+    items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/timeout", "proxy/tag_routing", "proxy/provider_budget_routing", "wildcard_routing"],
   },
   {
     type: "category",
@ -395,6 +395,8 @@ const sidebars = {
     "proxy/pii_masking",
     "extras/code_quality",
     "rules",
+    "proxy/team_based_routing",
+    "proxy/customer_routing",
     "proxy_server",
     {
       type: "category",
@ -69,6 +69,7 @@ from litellm.router_utils.cooldown_handlers import (
     _set_cooldown_deployments,
 )
 from litellm.router_utils.fallback_event_handlers import (
+    _check_non_standard_fallback_format,
     get_fallback_model_group,
     run_async_fallback,
 )
@ -2647,6 +2648,27 @@ class Router:
 
         try:
             verbose_router_logger.info("Trying to fallback b/w models")
+
+            # check if client-side fallbacks are used (e.g. fallbacks = ["gpt-3.5-turbo", "claude-3-haiku"] or fallbacks=[{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}])
+            is_non_standard_fallback_format = _check_non_standard_fallback_format(
+                fallbacks=fallbacks
+            )
+
+            if is_non_standard_fallback_format:
+                input_kwargs.update(
+                    {
+                        "fallback_model_group": fallbacks,
+                        "original_model_group": original_model_group,
+                    }
+                )
+
+                response = await run_async_fallback(
+                    *args,
+                    **input_kwargs,
+                )
+
+                return response
+
             if isinstance(e, litellm.ContextWindowExceededError):
                 if context_window_fallbacks is not None:
                     fallback_model_group: Optional[List[str]] = (
@ -2722,7 +2744,7 @@ class Router:
             verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
             fallback_model_group, generic_fallback_idx = (
                 get_fallback_model_group(
-                    fallbacks=fallbacks,
+                    fallbacks=fallbacks,  # if fallbacks = [{"gpt-3.5-turbo": ["claude-3-haiku"]}]
                     model_group=cast(str, model_group),
                 )
             )
@ -1,9 +1,10 @@
 from enum import Enum
-from typing import TYPE_CHECKING, Any, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import litellm
 from litellm._logging import verbose_router_logger
 from litellm.integrations.custom_logger import CustomLogger
+from litellm.types.router import LiteLLMParamsTypedDict
 
 if TYPE_CHECKING:
     from litellm.router import Router as _Router
@ -67,7 +68,7 @@ def get_fallback_model_group(
         elif list(item.keys())[0] == "*":  # check generic fallback
             generic_fallback_idx = idx
         elif isinstance(item, str):
-            fallback_model_group = [fallbacks.pop(idx)]
+            fallback_model_group = [fallbacks.pop(idx)]  # returns single-item list
     ## if none, check for generic fallback
     if fallback_model_group is None:
         if stripped_model_fallback is not None:
@ -122,9 +123,12 @@ async def run_async_fallback(
         # LOGGING
         kwargs = litellm_router.log_retry(kwargs=kwargs, e=original_exception)
         verbose_router_logger.info(f"Falling back to model_group = {mg}")
-        kwargs["model"] = mg
+        if isinstance(mg, str):
+            kwargs["model"] = mg
+        elif isinstance(mg, dict):
+            kwargs.update(mg)
         kwargs.setdefault("metadata", {}).update(
-            {"model_group": mg}
+            {"model_group": kwargs.get("model", None)}
         )  # update model_group used, if fallbacks are done
         kwargs["fallback_depth"] = fallback_depth + 1
         kwargs["max_fallbacks"] = max_fallbacks
@ -310,3 +314,31 @@ async def log_failure_fallback_event(
         verbose_router_logger.error(
             f"Error in log_failure_fallback_event: {str(e)}"
         )
+
+
+def _check_non_standard_fallback_format(fallbacks: Optional[List[Any]]) -> bool:
+    """
+    Checks if the fallbacks list is a list of strings or a list of dictionaries.
+
+    If
+    - List[str]: e.g. ["claude-3-haiku", "openai/o-1"]
+    - List[Dict[<LiteLLMParamsTypedDict>, Any]]: e.g. [{"model": "claude-3-haiku", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}]
+
+    If [{"gpt-3.5-turbo": ["claude-3-haiku"]}] then standard format.
+    """
+    if fallbacks is None or not isinstance(fallbacks, list) or len(fallbacks) == 0:
+        return False
+    if all(isinstance(item, str) for item in fallbacks):
+        return True
+    elif all(isinstance(item, dict) for item in fallbacks):
+        for key in LiteLLMParamsTypedDict.__annotations__.keys():
+            if key in fallbacks[0].keys():
+                return True
+
+    return False
+
+
+def run_non_standard_fallback_format(
+    fallbacks: Union[List[str], List[Dict[str, Any]]], model_group: str
+):
+    pass
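To make the intent of the new helper concrete, here is a small sketch (not part of this commit) of how `_check_non_standard_fallback_format` is expected to classify the fallback shapes described in its docstring, assuming this commit is installed so the helper is importable:

```python
# Sketch only (not part of this commit): expected classification of fallback
# formats by the new helper, based on its docstring above.
from litellm.router_utils.fallback_event_handlers import (
    _check_non_standard_fallback_format,
)

standard = [{"gpt-3.5-turbo": ["gpt-4"]}]        # model-group mapping -> standard format
plain_names = ["claude-3-haiku", "openai/o-1"]   # list of model names -> client-side format
per_model_params = [                             # per-model litellm params -> client-side format
    {"model": "claude-3-haiku", "messages": [{"role": "user", "content": "What is LiteLLM?"}]}
]

print(_check_non_standard_fallback_format(standard))           # expected: False
print(_check_non_standard_fallback_format(plain_names))        # expected: True
print(_check_non_standard_fallback_format(per_model_params))   # expected: True
```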
@ -281,6 +281,7 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
     }
 
 
+@pytest.mark.skip(reason="causes openai rate limit errors")
 def test_audio_speech_cost_calc():
     from litellm.integrations.custom_logger import CustomLogger
@ -1567,3 +1567,38 @@ def test_get_fallback_model_group():
     }
     fallback_model_group, _ = get_fallback_model_group(**args)
     assert fallback_model_group == ["claude-3-haiku"]
+
+
+def test_fallbacks_with_different_messages():
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+            {
+                "model_name": "claude-3-haiku",
+                "litellm_params": {
+                    "model": "claude-3-haiku-20240307",
+                    "api_key": os.getenv("ANTHROPIC_API_KEY"),
+                },
+            },
+        ],
+    )
+
+    resp = router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        mock_testing_fallbacks=True,
+        fallbacks=[
+            {
+                "model": "claude-3-haiku",
+                "messages": [{"role": "user", "content": "Hey, how's it going?"}],
+            }
+        ],
+    )
+
+    print(resp)
@ -3922,7 +3922,7 @@ def test_unit_test_perplexity_citations_chunk():
     ],
 )
 @pytest.mark.flaky(retries=3, delay=1)
-def test_streaming_tool_calls_valid_json_str(model):
+def test_aastreaming_tool_calls_valid_json_str(model):
     if "vertex_ai" in model:
         from test_amazing_vertex_completion import (
             load_vertex_ai_credentials,