From 9430301ae47a72f0602a030cb31015febc6108a9 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 7 Mar 2024 12:22:39 -0800
Subject: [PATCH] (docs) setting load balancing config

---
 docs/my-website/docs/proxy/configs.md | 31 +++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 2b3edfadb9..5745685de8 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -248,31 +248,46 @@ $ litellm --config /path/to/config.yaml
 Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
 
-```yaml
-router_settings:
-  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+For optimal performance:
+- Set `tpm/rpm` per model deployment. Weighted picks are then made based on the tpm/rpm you set.
+- Select your optimal routing strategy in `router_settings:routing_strategy`. LiteLLM supports:
+  ```python
+  ["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
+  ```
+- When running multiple LiteLLM servers / Kubernetes replicas, set the redis settings (`router_settings:redis_host` etc.) so load-balancing state is shared across instances.
+
+When `tpm/rpm` is set + `routing_strategy==simple-shuffle`, litellm uses a weighted pick based on the set tpm/rpm. **In our load tests, setting tpm/rpm for all deployments + `routing_strategy==simple-shuffle` maximized throughput.**
+
+```yaml
 model_list:
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
       api_base: http://0.0.0.0:8001
+      rpm: 60 # Optional[int]: When rpm/tpm are set, litellm uses a weighted pick for load balancing. rpm = rate limit for this deployment, in requests per minute.
+      tpm: 1000 # Optional[int]: tpm = tokens per minute
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
       api_base: http://0.0.0.0:8002
+      rpm: 600
   - model_name: zephyr-beta
     litellm_params:
       model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
+      rpm: 60000
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: gpt-3.5-turbo
       api_key: <my-openai-key>
+      rpm: 200
   - model_name: gpt-3.5-turbo-16k
     litellm_params:
       model: gpt-3.5-turbo-16k
       api_key: <my-openai-key>
+      rpm: 100
 
 litellm_settings:
   num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
   request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
   fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call still fails after num_retries
   context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on context window errors
   allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
-```
 
+router_settings: # router_settings are optional
+  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
+  model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models in the `gpt-3.5-turbo` group
+  num_retries: 2
+  timeout: 30 # 30 seconds
+  redis_host: <your redis host> # set this when using multiple litellm proxy deployments; load-balancing state is stored in redis
+  redis_password: <your redis password>
+  redis_port: 1992
+```
+
 ## Set Azure `base_model` for cost tracking
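
To build intuition for the weighted pick the new docs text describes, here is a small illustrative sketch. This is not LiteLLM's actual router code (an assumption for illustration only); it mirrors the three `zephyr-beta` deployments from the config above and shows why a deployment with a higher `rpm` limit is proportionally more likely to be chosen under `simple-shuffle`:

```python
# Illustrative sketch of an rpm-weighted pick -- NOT LiteLLM's actual implementation.
# Deployment list mirrors the zephyr-beta entries in the config above.
import random

deployments = [
    {"api_base": "http://0.0.0.0:8001", "rpm": 60},
    {"api_base": "http://0.0.0.0:8002", "rpm": 600},
    {"api_base": "http://0.0.0.0:8003", "rpm": 60000},
]

# random.choices draws proportionally to the weights, so the 8003 deployment
# (rpm=60000) is picked ~99% of the time (60000 / 60660 total rpm).
chosen = random.choices(deployments, weights=[d["rpm"] for d in deployments], k=1)[0]
print(chosen["api_base"])
```

This is why setting realistic `tpm/rpm` values per deployment matters: they directly determine how traffic is split across the group.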
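
For completeness, a minimal usage sketch against a proxy started with this config. It assumes the proxy is reachable on its default local port 4000 (adjust `base_url` for your deployment) and the `openai` Python SDK v1+ is installed; you send the model *group* name, and the router fans requests out across that group's deployments:

```python
# Call the load-balanced `zephyr-beta` group through the LiteLLM proxy.
# Assumes the proxy was started with `litellm --config /path/to/config.yaml`
# and listens on http://0.0.0.0:4000 -- adjust base_url for your setup.
import openai

client = openai.OpenAI(
    base_url="http://0.0.0.0:4000",
    api_key="anything",  # provider keys live in the config; any string works unless a master key is set
)

response = client.chat.completions.create(
    model="zephyr-beta",  # the group name -- the router picks one of the three deployments
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)
```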