From fbfb86f1e64cdc3f802c0a732316f3a136261a3b Mon Sep 17 00:00:00 2001
From: Daniel Hnyk
Date: Mon, 10 Mar 2025 10:24:29 +0100
Subject: [PATCH] better explain behavior of usage-based-routing-v2

---
 docs/my-website/docs/routing.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 0ad28b24f4..a5ef24a3f2 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -163,9 +163,9 @@ Router provides 4 strategies for routing your calls across multiple deployments:
 
 **Filters out deployment if tpm/rpm limit exceeded** - If you pass in the deployment's tpm/rpm limits.
 
-Routes to **deployment with lowest TPM usage** for that minute.
+Routes to **deployment with lowest TPM usage** for that minute. If two deployments have the same usage, one is chosen at random. A higher-limit deployment is not automatically favored up front, but if usage spikes, the smaller-limit deployment may hit its cap and be filtered out, so the higher-limit deployment stays in the pool longer.
 
-In production, we use Redis to track usage (TPM/RPM) across multiple deployments. This implementation uses **async redis calls** (redis.incr and redis.mget).
+In production, we use Redis to track usage (TPM/RPM) across multiple deployments. This implementation uses **async redis calls** (`redis.incr` and `redis.mget`).
 
 For Azure, [you get 6 RPM per 1000 TPM](https://stackoverflow.com/questions/77368844/what-is-the-request-per-minute-rate-limit-for-azure-openai-models-for-gpt-3-5-tu)
 
@@ -1639,4 +1639,4 @@ class RouterGeneralSettings(BaseModel):
     pass_through_all_models: bool = Field(
         default=False
     )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
-```
\ No newline at end of file
+```
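
As context for the paragraph changed above, here is a minimal sketch of a router configured with this strategy, following the `usage-based-routing-v2` examples elsewhere in routing.md; the deployment names, TPM/RPM limits, and Redis settings are illustrative assumptions, not part of this patch:

```python
import asyncio
import os

from litellm import Router

# Two deployments in one model group, with different rate limits (illustrative values).
model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # model group alias
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "tpm": 100000,  # higher-limit deployment
            "rpm": 600,
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
            "tpm": 1000,  # smaller-limit deployment; filtered out once its cap is hit
            "rpm": 6,
        },
    },
]

router = Router(
    model_list=model_list,
    routing_strategy="usage-based-routing-v2",
    # Redis shares the per-minute TPM/RPM counters across router instances.
    redis_host=os.getenv("REDIS_HOST", "localhost"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
    redis_password=os.getenv("REDIS_PASSWORD"),
    enable_pre_call_checks=True,  # filter out deployments that exceeded their limits
)


async def main():
    # Each call goes to the deployment with the lowest TPM usage this minute;
    # ties between equally-used deployments are broken randomly.
    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(response)


asyncio.run(main())
```

Under light traffic both deployments are picked roughly equally; once the 1000-TPM deployment hits its cap within a minute it is excluded, which is the behavior the rewritten sentence spells out.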