Merge pull request #2388 from BerriAI/litellm_docs_show_how_to_set_lb_config
(docs) setting load balancing config
Commit 7bb515a75b · 1 changed file, 27 additions and 4 deletions

## Load Balancing

Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).

For optimal performance:
- Set `tpm/rpm` per model deployment. Weighted picks are then based on the established tpm/rpm.
- Select your optimal routing strategy in `router_settings:routing_strategy`, e.g.:

```yaml
router_settings:
  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
```

LiteLLM supports the following routing strategies:

```python
["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
```
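
As a hedged illustration (not from this docs page): these strategies are also exposed on the Python `Router`, which is what the proxy uses under the hood. The `model_list` entries mirror the proxy config format; the strategy below is just one example:

```python
# Sketch: load balancing with litellm.Router directly (assumes `litellm` is installed).
from litellm import Router

model_list = [
    {
        "model_name": "zephyr-beta",
        "litellm_params": {
            "model": "huggingface/HuggingFaceH4/zephyr-7b-beta",
            "api_base": "http://0.0.0.0:8001",
            "rpm": 60,
        },
    },
    {
        "model_name": "zephyr-beta",
        "litellm_params": {
            "model": "huggingface/HuggingFaceH4/zephyr-7b-beta",
            "api_base": "http://0.0.0.0:8002",
            "rpm": 600,
        },
    },
]

router = Router(model_list=model_list, routing_strategy="latency-based-routing")

response = router.completion(
    model="zephyr-beta",  # the model group name; the router picks a deployment
    messages=[{"role": "user", "content": "Hello!"}],
)
```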

When `tpm/rpm` is set and `routing_strategy==simple-shuffle`, LiteLLM uses a weighted pick based on the configured tpm/rpm. **In our load tests, setting tpm/rpm for all deployments together with `routing_strategy==simple-shuffle` maximized throughput.** A rough sketch of the weighting follows the config below.

- When running multiple LiteLLM servers (e.g. on Kubernetes), set the Redis settings (`router_settings:redis_host`, etc.) so load-balancing state is shared across instances.

```yaml
model_list:
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8001
      rpm: 60     # Optional[int]: When rpm/tpm set - litellm uses weighted pick for load balancing. rpm = Rate limit for this deployment, in requests per minute (rpm).
      tpm: 1000   # Optional[int]: tpm = Tokens Per Minute
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8002
      rpm: 600
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
      rpm: 60000
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      api_key: <my-openai-key>
      rpm: 200
  - model_name: gpt-3.5-turbo-16k
    litellm_params:
      model: gpt-3.5-turbo-16k
      api_key: <my-openai-key>
      rpm: 100

litellm_settings:
  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if the call still fails after num_retries
  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on context window errors
  allowed_fails: 3 # cooldown the deployment if it fails more than 3 calls in a minute

router_settings: # router_settings are optional
  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
  num_retries: 2
  timeout: 30                           # 30 seconds
  redis_host: <your redis host>         # set this when using multiple litellm proxy deployments; load balancing state is stored in redis
  redis_password: <your redis password>
  redis_port: 1992
```
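
To make the weighted pick concrete: with the three `zephyr-beta` deployments above set to `rpm: 60`, `600`, and `60000`, simple-shuffle sends roughly 0.1% / 1% / 98.9% of requests to them respectively (each weight divided by the total of 60660). A minimal sketch of that weighting, as an illustration rather than LiteLLM's actual routing code:

```python
# Illustration only - not LiteLLM's implementation.
# With routing_strategy == "simple-shuffle" and rpm set, the chance a
# deployment is picked is proportional to its rpm.
import random

deployments = [
    {"api_base": "http://0.0.0.0:8001", "rpm": 60},
    {"api_base": "http://0.0.0.0:8002", "rpm": 600},
    {"api_base": "http://0.0.0.0:8003", "rpm": 60000},
]

def weighted_pick(deployments: list[dict]) -> dict:
    weights = [d["rpm"] for d in deployments]
    return random.choices(deployments, weights=weights, k=1)[0]

# ~98.9% of picks land on the rpm=60000 deployment (60000 / 60660).
print(weighted_pick(deployments)["api_base"])
```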
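
Once the proxy is started with this config (`litellm --config /path/to/config.yaml`), any request to a `model_name` group is load balanced across its deployments. A quick sanity check, assuming the proxy is listening locally on its default port (4000 in recent versions; adjust `base_url` to your setup):

```python
# Assumes a locally running proxy; base_url and api_key are placeholders.
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="zephyr-beta",  # the load-balanced model group from the config
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```
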
## Set Azure `base_model` for cost tracking