From 7ae6432f94dc7d3cf7ce5da6bf6e9c89f51b010a Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Sat, 6 Apr 2024 18:19:02 -0700
Subject: [PATCH] fix(router.py): check usage based routing cache in pre-call
 check

allows pre-call rpm check to work across instances
---
 litellm/router.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/litellm/router.py b/litellm/router.py
index b0e6a2635..e19afc3c0 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -2409,8 +2409,22 @@ class Router:

             ## RPM CHECK ##
             _litellm_params = deployment.get("litellm_params", {})
-            _model_id = deployment.get("model_info", {}).get("id", "")
-            _deployment_rpm = self.cache.get_cache(key=_model_id, local_only=True)
+            model_id = deployment.get("model_info", {}).get("id", "")
+            ### get local router cache ###
+            current_request_cache_local = (
+                self.cache.get_cache(key=model_id, local_only=True) or 0
+            )
+            ### get usage-based routing cache ###
+            current_minute = datetime.now().strftime("%H-%M")
+            rpm_key = f"{deployment['model_name']}:rpm:{current_minute}"
+            model_group_cache = (
+                self.cache.get_cache(key=rpm_key, local_only=True) or {}
+            )  # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
+            model_group_cache[model_id] = model_group_cache.get(model_id, 0)
+
+            current_request = max(
+                current_request_cache_local, model_group_cache[model_id]
+            )

             if (
                 isinstance(_litellm_params, dict)
@@ -2418,8 +2432,7 @@
             ):
                 if (
                     isinstance(_litellm_params["rpm"], int)
-                    and _deployment_rpm is not None
-                    and _litellm_params["rpm"] <= _deployment_rpm
+                    and _litellm_params["rpm"] <= current_request
                 ):
                     invalid_model_indices.append(idx)
                     _rate_limit_error = True
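
Note on the change (editor's illustration, not part of the patch): the pre-call RPM
check now compares a deployment's configured rpm limit against the larger of two
counters: the router instance's local per-deployment counter, and the per-minute
model-group counter (keyed "<model_name>:rpm:<HH-MM>") that the usage-based and
lowest-latency routing strategies keep in the shared Redis + in-memory cache. Because
that second counter is populated by all instances, the limit can trip even when the
local counter alone is below it. The sketch below mirrors that logic in isolation;
exceeds_rpm_limit, local_cache, and shared_rpm_cache are hypothetical stand-ins for
the Router's self.cache.get_cache(..., local_only=True) calls, not LiteLLM APIs.

    # Illustrative sketch of the patched pre-call RPM check, under the
    # assumptions stated above. Plain dicts stand in for the router cache.
    from datetime import datetime


    def exceeds_rpm_limit(
        deployment: dict,
        local_cache: dict,
        shared_rpm_cache: dict,
    ) -> bool:
        """Return True if the deployment's configured rpm limit is already hit."""
        litellm_params = deployment.get("litellm_params", {})
        model_id = deployment.get("model_info", {}).get("id", "")

        # Counter kept by this router instance for the deployment id.
        local_count = local_cache.get(model_id) or 0

        # Per-minute counter keyed by "<model_name>:rpm:<HH-MM>", written by the
        # usage-based / lowest-latency strategies and shared across instances.
        current_minute = datetime.now().strftime("%H-%M")
        rpm_key = f"{deployment['model_name']}:rpm:{current_minute}"
        group_counts = shared_rpm_cache.get(rpm_key) or {}
        group_count = group_counts.get(model_id, 0)

        # Take whichever view of current traffic is higher.
        current_request = max(local_count, group_count)

        rpm_limit = (
            litellm_params.get("rpm") if isinstance(litellm_params, dict) else None
        )
        return isinstance(rpm_limit, int) and rpm_limit <= current_request


    # Example: the local counter is low, but the shared per-minute counter shows
    # the deployment already served 100 requests this minute, so it is filtered out.
    deployment = {
        "model_name": "gpt-3.5-turbo",
        "model_info": {"id": "deployment-1"},
        "litellm_params": {"rpm": 100},
    }
    minute = datetime.now().strftime("%H-%M")
    print(
        exceeds_rpm_limit(
            deployment,
            local_cache={"deployment-1": 3},
            shared_rpm_cache={f"gpt-3.5-turbo:rpm:{minute}": {"deployment-1": 100}},
        )
    )  # True

Taking the max of the two counters (rather than only the local one, as before the
patch) is what lets the check hold across router instances that share the cache.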