Merge pull request #2388 from BerriAI/litellm_docs_show_how_to_set_lb_config

(docs) setting load balancing config

Commit 7bb515a75b: 1 changed file with 27 additions and 4 deletions

@@ -248,31 +248,46 @@ $ litellm --config /path/to/config.yaml
Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
```yaml
router_settings:
  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
```

For optimal performance:
- Set `tpm/rpm` per model deployment. Weighted picks are then based on the established tpm/rpm.
- Select your optimal routing strategy in `router_settings:routing_strategy`.

LiteLLM supports the following routing strategies:

```python
["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
```
When `tpm/rpm` is set and `routing_strategy==simple-shuffle`, litellm uses a weighted pick based on the set tpm/rpm. **In our load tests, setting tpm/rpm for all deployments with `routing_strategy==simple-shuffle` maximized throughput** (a minimal sketch of this weighting is shown below).

- When using multiple LiteLLM servers / Kubernetes, set the redis settings (`router_settings:redis_host` etc.) so load balancing state is shared across instances.

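To make the weighted pick concrete, here is a minimal sketch of rpm-proportional selection, using the `rpm` values from the example config below. This is illustrative only, not litellm's actual implementation:

```python
import random

# rpm values from the example config below
deployments = [
    {"api_base": "http://0.0.0.0:8001", "rpm": 60},
    {"api_base": "http://0.0.0.0:8002", "rpm": 600},
    {"api_base": "http://0.0.0.0:8003", "rpm": 60000},
]

# Weighted pick: selection probability is proportional to each deployment's rpm,
# so the :8003 deployment receives roughly 60000/60660, i.e. ~99% of requests.
choice = random.choices(deployments, weights=[d["rpm"] for d in deployments])[0]
print(choice["api_base"])
```
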
```yaml
model_list:
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8001
      rpm: 60    # Optional[int]: when rpm/tpm are set, litellm uses a weighted pick for load balancing. rpm = rate limit for this deployment, in requests per minute.
      tpm: 1000  # Optional[int]: tpm = tokens per minute
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8002
      rpm: 600
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
      rpm: 60000
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      api_key: <my-openai-key>
      rpm: 200
  - model_name: gpt-3.5-turbo-16k
    litellm_params:
      model: gpt-3.5-turbo-16k
      api_key: <my-openai-key>
      rpm: 100

litellm_settings:
  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
@@ -280,8 +295,16 @@ litellm_settings:
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if the call still fails after num_retries
  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on a context window error
  allowed_fails: 3 # cooldown the deployment if it fails more than 3 calls in a minute
```

```yaml
router_settings: # router_settings are optional
  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
  num_retries: 2
  timeout: 30 # 30 seconds
  redis_host: <your redis host> # set this when using multiple litellm proxy deployments; load balancing state is stored in redis
  redis_password: <your redis password>
  redis_port: 1992
```
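With the deployments above configured, any client that speaks the OpenAI API can send requests to the proxy and have them load balanced. A minimal sketch using the `openai` Python SDK, assuming the proxy is running on a local default address (port 4000 here; adjust to your deployment):

```python
import openai

client = openai.OpenAI(
    api_key="anything",             # the proxy manages provider keys; any string works unless proxy auth is configured
    base_url="http://0.0.0.0:4000"  # assumption: local litellm proxy address
)

# Requests for "zephyr-beta" are load balanced across the three deployments
# defined in the config, weighted by their rpm values.
response = client.chat.completions.create(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```
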
## Set Azure `base_model` for cost tracking