fix(router.py): improve pre-call check -> get model group cache one-time

Krrish Dholakia 2024-04-06 18:24:23 -07:00
parent 4512510d37
commit 6f94f3d127
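The pre-call check previously rebuilt the per-minute RPM key and re-read the model group cache inside the loop, once per deployment; this change reads it once before the loop and reuses the result for every deployment. Below is a minimal, self-contained sketch of that pattern, assuming only a cache object exposing `get_cache(key, local_only=...)` as seen in the diff; the `SimpleCache` stub, `pre_call_rpm_check` helper, and sample deployment fields are illustrative, not litellm's actual implementation.

```python
from datetime import datetime


class SimpleCache:
    """Illustrative stand-in for the router's in-memory cache (not litellm's cache class)."""

    def __init__(self, store=None):
        self._store = store or {}

    def get_cache(self, key, local_only=True):
        # local_only mirrors the call signature used in the diff; this stub is memory-only.
        return self._store.get(key)


def pre_call_rpm_check(model, deployments, cache):
    """Read the model-group RPM bucket once, then compare each deployment against it."""
    current_minute = datetime.now().strftime("%H-%M")
    rpm_key = f"{model}:rpm:{current_minute}"
    # One-time fetch of the per-minute usage map, instead of repeating it per deployment.
    model_group_cache = cache.get_cache(key=rpm_key, local_only=True) or {}

    usable = []
    for deployment in deployments:
        model_id = deployment["model_info"]["id"]
        local_count = cache.get_cache(key=model_id, local_only=True) or 0
        group_count = model_group_cache.get(model_id, 0)
        current_request = max(local_count, group_count)
        if current_request < deployment["litellm_params"].get("rpm", float("inf")):
            usable.append(deployment)
    return usable


if __name__ == "__main__":
    minute = datetime.now().strftime("%H-%M")
    cache = SimpleCache({f"gpt-3.5-turbo:rpm:{minute}": {"id-1": 3}})
    deployments = [{"model_info": {"id": "id-1"}, "litellm_params": {"rpm": 10}}]
    print(pre_call_rpm_check("gpt-3.5-turbo", deployments, cache))
```

Reading the group bucket once per check keeps the per-deployment work down to a couple of dictionary/cache lookups and avoids re-formatting the same `rpm_key` string on every iteration.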


@@ -2380,6 +2380,13 @@ class Router:
         _context_window_error = False
         _rate_limit_error = False
+        ## get model group RPM ##
+        current_minute = datetime.now().strftime("%H-%M")
+        rpm_key = f"{model}:rpm:{current_minute}"
+        model_group_cache = (
+            self.cache.get_cache(key=rpm_key, local_only=True) or {}
+        )  # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
         for idx, deployment in enumerate(_returned_deployments):
             # see if we have the info for this model
             try:
@@ -2414,12 +2421,7 @@
             current_request_cache_local = (
                 self.cache.get_cache(key=model_id, local_only=True) or 0
             )
-            ### get usage-based routing cache ###
-            current_minute = datetime.now().strftime("%H-%M")
-            rpm_key = f"{deployment['model_name']}:rpm:{current_minute}"
-            model_group_cache = (
-                self.cache.get_cache(key=rpm_key, local_only=True) or {}
-            )  # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
+            ### get usage based cache ###
             model_group_cache[model_id] = model_group_cache.get(model_id, 0)
             current_request = max(