diff --git a/litellm/router.py b/litellm/router.py index e19afc3c0..e1f7efb43 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -2380,6 +2380,13 @@ class Router: _context_window_error = False _rate_limit_error = False + + ## get model group RPM ## + current_minute = datetime.now().strftime("%H-%M") + rpm_key = f"{model}:rpm:{current_minute}" + model_group_cache = ( + self.cache.get_cache(key=rpm_key, local_only=True) or {} + ) # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache. for idx, deployment in enumerate(_returned_deployments): # see if we have the info for this model try: @@ -2414,12 +2421,7 @@ class Router: current_request_cache_local = ( self.cache.get_cache(key=model_id, local_only=True) or 0 ) - ### get usage-based routing cache ### - current_minute = datetime.now().strftime("%H-%M") - rpm_key = f"{deployment['model_name']}:rpm:{current_minute}" - model_group_cache = ( - self.cache.get_cache(key=rpm_key, local_only=True) or {} - ) # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache. + ### get usage based cache ### model_group_cache[model_id] = model_group_cache.get(model_id, 0) current_request = max(