Mirror of https://github.com/BerriAI/litellm.git

docs(routing.md): updating docs for managing multiple deployments

commit 9e88014ff4 (parent d8079907b9)

2 changed files with 203 additions and 16 deletions

routing.md

@@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Manage Multiple Deployments

@@ -17,15 +20,15 @@ In production, [Router connects to a Redis Cache](#redis-queue) to track usage a
```python
from litellm import Router

model_list = [{ # list of model deployments
    "model_name": "gpt-3.5-turbo", # model alias
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-v-2", # actual model name
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    }
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-functioncalling",
        "api_key": os.getenv("AZURE_API_KEY"),
```

@@ -33,7 +36,7 @@ model_list = [{ # list of model deployments
```python
        "api_base": os.getenv("AZURE_API_BASE")
    }
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
```

@@ -43,28 +46,212 @@ model_list = [{ # list of model deployments
```python
router = Router(model_list=model_list)

# openai.ChatCompletion.create replacement
response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}])

print(response)
```

### Available Endpoints
- `router.completion()` - chat completions endpoint to call 100+ LLMs
- `router.acompletion()` - async chat completion calls
- `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `router.aembeddings()` - async embeddings endpoint
- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
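
The async endpoints need to be awaited inside a running event loop; the bare `await` in the snippets above assumes you're already in one (e.g. a notebook). Here's a minimal sketch of calling `router.acompletion()` from a plain script (the single-deployment `model_list` below is just for illustration):

```python
import asyncio
import os

from litellm import Router

# minimal single-deployment list so the sketch is self-contained
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

router = Router(model_list=model_list)

async def main():
    # async chat completion against the "gpt-3.5-turbo" alias
    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(response)

asyncio.run(main())
```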

### Routing Strategies
Router provides 2 strategies for routing your calls across multiple deployments:

<Tabs>
<TabItem value="simple-shuffle" label="Simple Shuffle">

**Default** Randomly picks a deployment to route a call to.

```python
from litellm import Router

model_list = [{ # list of model deployments
    "model_name": "gpt-3.5-turbo", # model alias
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-v-2", # actual model name
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    }
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-functioncalling",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    }
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    }
}]

router = Router(model_list=model_list, routing_strategy="simple-shuffle")

response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}])

print(response)
```

</TabItem>
<TabItem value="usage-based" label="Rate-Limit Aware">

This will route to the deployment with the lowest TPM usage for that minute.

In production, we use Redis to track usage (TPM/RPM) across multiple deployments.

If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any deployments whose limits would be exceeded.

For Azure, your RPM = TPM/6 (e.g. a deployment with a 60,000 TPM limit allows roughly 10,000 RPM).

```python
from litellm import Router

model_list = [{ # list of model deployments
    "model_name": "gpt-3.5-turbo", # model alias
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-v-2", # actual model name
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 100000,
    "rpm": 10000,
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-functioncalling",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 100000,
    "rpm": 1000,
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
    "tpm": 100000,
    "rpm": 1000,
}]

router = Router(model_list=model_list,
                redis_host=os.environ["REDIS_HOST"],
                redis_password=os.environ["REDIS_PASSWORD"],
                redis_port=os.environ["REDIS_PORT"],
                routing_strategy="usage-based-routing")

response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}])

print(response)
```

</TabItem>
</Tabs>

### Caching + Request Timeouts

In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.

**In-memory Cache + Timeouts**
```python
router = Router(model_list=model_list,
                cache_responses=True,
                timeout=30) # timeout set to 30s

print(response)
```

**Redis Cache + Timeouts**
```python
router = Router(model_list=model_list,
                redis_host=os.getenv("REDIS_HOST"),
                redis_password=os.getenv("REDIS_PASSWORD"),
                redis_port=os.getenv("REDIS_PORT"),
                cache_responses=True,
                timeout=30)

print(response)
```
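
As a usage sketch (reusing the `model_list` from the sections above), a repeated identical request can then be answered from the cache instead of hitting a deployment again:

```python
# illustrative only: with cache_responses=True, an identical repeat request
# can be served from the cache rather than re-calling the deployment
router = Router(model_list=model_list, cache_responses=True, timeout=30)

messages = [{"role": "user", "content": "Hey, how's it going?"}]

first = router.completion(model="gpt-3.5-turbo", messages=messages)   # hits a deployment
second = router.completion(model="gpt-3.5-turbo", messages=messages)  # may be a cache hit

print(first)
print(second)
```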

### Retry failed requests

For both async + sync functions, we support retrying failed requests.

If it's a RateLimitError, we implement exponential backoff.

If it's a generic OpenAI API Error, we retry immediately.

For any other exception type, we don't retry.

Here's a quick look at how we can set `num_retries = 3`:

```python
from litellm import Router

router = Router(model_list=model_list,
                cache_responses=True,
                timeout=30,
                num_retries=3)

user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]

# normal call
response = router.completion(model="gpt-3.5-turbo", messages=messages)

print(f"response: {response}")
```
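
For reference, here's a minimal sketch of the retry policy described above. This is illustrative only, not litellm's internal implementation; it assumes the pre-v1 `openai` SDK's exception classes and a caller-supplied `call_deployment` function:

```python
import time

import openai

def call_with_retries(call_deployment, num_retries: int = 3):
    """Retry loop mirroring the policy above: exponential backoff on rate
    limits, immediate retry on generic OpenAI API errors, no retry otherwise."""
    for attempt in range(num_retries + 1):
        try:
            return call_deployment()
        except openai.error.RateLimitError:
            if attempt == num_retries:
                raise
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
        except openai.error.APIError:
            if attempt == num_retries:
                raise
            # retry immediately, no backoff
    # any other exception type propagates without a retry
```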

### Default litellm.completion/embedding params

You can also set default params for litellm completion/embedding calls. Here's how to do that:

```python
from litellm import Router

fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}

router = Router(model_list=model_list,
                default_litellm_params={"context_window_fallback_dict": fallback_dict})

user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]

# normal call
response = router.completion(model="gpt-3.5-turbo", messages=messages)

print(f"response: {response}")
```

### Deploy Router

If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)

## litellm.completion()

If you're calling litellm.completion(), here are the different reliability options you can enable.

## Retry failed requests

@@ -103,7 +290,7 @@ from litellm import completion
```python
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]

completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```

### Fallbacks - Switch Models/API Keys/API Bases

Router class (second changed file)

@@ -11,7 +11,7 @@ class Router:
```python
    Example usage:
    from litellm import Router
    model_list = [{
        "model_name": "gpt-3.5-turbo", # model alias
        "litellm_params": { # params for litellm completion/embedding call
            "model": "azure/<your-deployment-name>",
            "api_key": <your-api-key>,
```

@@ -47,9 +47,9 @@ class Router:
```python
        self.chat = litellm.Chat(params=default_litellm_params)

        self.default_litellm_params = default_litellm_params
        self.default_litellm_params["timeout"] = timeout

        self.routing_strategy = routing_strategy
        ### HEALTH CHECK THREAD ###
        if self.routing_strategy == "least-busy":
```
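
The `__init__` change above keeps the caller-supplied `default_litellm_params` and merges the `timeout` into it (previously only the timeout was stored). A rough sketch of what that looks like from the caller's side (values are illustrative):

```python
import os

from litellm import Router

model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

# defaults passed here are stored on the router and used for its calls
router = Router(
    model_list=model_list,
    default_litellm_params={"context_window_fallback_dict": {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}},
    timeout=30,
)

# per the change above, this now holds the user-supplied defaults with "timeout" merged in
print(router.default_litellm_params)
```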