From 9430301ae47a72f0602a030cb31015febc6108a9 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 7 Mar 2024 12:22:39 -0800
Subject: [PATCH] (docs) setting load balancing config

---
 docs/my-website/docs/proxy/configs.md | 31 +++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 2b3edfadb9..5745685de8 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -248,31 +248,46 @@ $ litellm --config /path/to/config.yaml
 Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
 
-```yaml
-router_settings:
-  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+For optimal performance:
+- Set `tpm/rpm` per model deployment. Weighted picks are then made based on the tpm/rpm you set.
+- Select your optimal routing strategy in `router_settings:routing_strategy`. LiteLLM supports:
+  ```python
+  ["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
+  ```
+- When running multiple LiteLLM servers / Kubernetes replicas, set the redis settings (`router_settings:redis_host` etc.) so load-balancing state is shared across instances.
+
+When `tpm/rpm` is set + `routing_strategy==simple-shuffle`, litellm uses a weighted pick based on the set tpm/rpm. **In our load tests, setting tpm/rpm for all deployments + `routing_strategy==simple-shuffle` maximized throughput.**
+
+```yaml
 model_list:
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
       api_base: http://0.0.0.0:8001
+      rpm: 60 # Optional[int]: When rpm/tpm are set, litellm uses a weighted pick for load balancing. rpm = rate limit for this deployment, in requests per minute.
+      tpm: 1000 # Optional[int]: tpm = tokens per minute
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
       api_base: http://0.0.0.0:8002
+      rpm: 600
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
+      rpm: 60000
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: gpt-3.5-turbo
       api_key: <my-openai-key>
+      rpm: 200
   - model_name: gpt-3.5-turbo-16k
     litellm_params:
       model: gpt-3.5-turbo-16k
       api_key: <my-openai-key>
+      rpm: 100
 
 litellm_settings:
   num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
   request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
   fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call still fails after num_retries
   context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on context window errors
   allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
-```
 
+router_settings: # router_settings are optional
+  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
+  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models in the `gpt-3.5-turbo` group
+  num_retries: 2
+  timeout: 30 # 30 seconds
+  redis_host: <your redis host> # set this when using multiple litellm proxy deployments; load-balancing state is stored in redis
+  redis_password: <your redis password>
+  redis_port: 1992
+```
+
 ## Set Azure `base_model` for cost tracking
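
To build intuition for the weighted pick the new docs text describes, here is a small illustrative sketch. This is not LiteLLM's actual router code (an assumption for illustration only); it mirrors the three `zephyr-beta` deployments from the config above and shows why a deployment with a higher `rpm` limit is proportionally more likely to be chosen under `simple-shuffle`:

```python
# Illustrative sketch of an rpm-weighted pick -- NOT LiteLLM's actual implementation.
# Deployment list mirrors the zephyr-beta entries in the config above.
import random

deployments = [
    {"api_base": "http://0.0.0.0:8001", "rpm": 60},
    {"api_base": "http://0.0.0.0:8002", "rpm": 600},
    {"api_base": "http://0.0.0.0:8003", "rpm": 60000},
]

# random.choices draws proportionally to the weights, so the 8003 deployment
# (rpm=60000) is picked ~99% of the time (60000 / 60660 total rpm).
chosen = random.choices(deployments, weights=[d["rpm"] for d in deployments], k=1)[0]
print(chosen["api_base"])
```

This is why setting realistic `tpm/rpm` values per deployment matters: they directly determine how traffic is split across the group.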
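
For completeness, a minimal usage sketch against a proxy started with this config. It assumes the proxy is reachable on its default local port 4000 (adjust `base_url` for your deployment) and the `openai` Python SDK v1+ is installed; you send the model *group* name, and the router fans requests out across that group's deployments:

```python
# Call the load-balanced `zephyr-beta` group through the LiteLLM proxy.
# Assumes the proxy was started with `litellm --config /path/to/config.yaml`
# and listens on http://0.0.0.0:4000 -- adjust base_url for your setup.
import openai

client = openai.OpenAI(
    base_url="http://0.0.0.0:4000",
    api_key="anything",  # provider keys live in the config; any string works unless a master key is set
)

response = client.chat.completions.create(
    model="zephyr-beta",  # the group name -- the router picks one of the three deployments
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)
```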