[Perf Proxy] parallel request limiter - use one cache update call (#5932)

* fix parallel request limiter - use one cache update call
* ci/cd run again
* run ci/cd again
* use docker username password
* fix config.yml
* fix config
* fix config
* fix config.yml
* ci/cd run again
* use correct typing for batch set cache
* fix async_set_cache_pipeline
* fix only check user id tpm / rpm limits when limits set
* fix test_openai_azure_embedding_with_oidc_and_cf
2025-04-25 18:54:30 +00:00 · 2024-09-27 10:26:15 -07:00 · 2024-09-27 10:26:15 -07:00 · f4613a100d
commit f4613a100d
parent 71f68ac185
7 changed files with 56 additions and 36 deletions
--- a/litellm/proxy/hooks/parallel_request_limiter.py
+++ b/litellm/proxy/hooks/parallel_request_limiter.py
@@ -327,8 +327,13 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                user_api_key_dict=user_api_key_dict,
            )
            # get user tpm/rpm limits
-            if _user_id_rate_limits is not None and isinstance(
-                _user_id_rate_limits, dict
+            if (
+                _user_id_rate_limits is not None
+                and isinstance(_user_id_rate_limits, dict)
+                and (
+                    _user_id_rate_limits.get("tpm_limit", None) is not None
+                    or _user_id_rate_limits.get("rpm_limit", None) is not None
+                )
            ):
                user_tpm_limit = _user_id_rate_limits.get("tpm_limit", None)
                user_rpm_limit = _user_id_rate_limits.get("rpm_limit", None)
@@ -472,6 +477,8 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
            # Update usage - API Key
            # ------------

+            values_to_update_in_cache = []
+
            if user_api_key is not None:
                request_count_api_key = (
                    f"{user_api_key}::{precise_minute}::request_count"
@@ -495,12 +502,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
-                    request_count_api_key,
-                    new_val,
-                    ttl=60,
-                    litellm_parent_otel_span=litellm_parent_otel_span,
-                )  # store in cache for 1 min.
+                values_to_update_in_cache.append((request_count_api_key, new_val))

            # ------------
            # Update usage - model group + API Key
@@ -536,12 +538,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
-                    request_count_api_key,
-                    new_val,
-                    ttl=60,
-                    litellm_parent_otel_span=litellm_parent_otel_span,
-                )
+                values_to_update_in_cache.append((request_count_api_key, new_val))

            # ------------
            # Update usage - User
@@ -574,12 +571,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
-                    request_count_api_key,
-                    new_val,
-                    ttl=60,
-                    litellm_parent_otel_span=litellm_parent_otel_span,
-                )  # store in cache for 1 min.
+                values_to_update_in_cache.append((request_count_api_key, new_val))

            # ------------
            # Update usage - Team
@@ -612,12 +604,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
-                    request_count_api_key,
-                    new_val,
-                    ttl=60,
-                    litellm_parent_otel_span=litellm_parent_otel_span,
-                )  # store in cache for 1 min.
+                values_to_update_in_cache.append((request_count_api_key, new_val))

            # ------------
            # Update usage - End User
@@ -650,13 +637,13 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
-                    request_count_api_key,
-                    new_val,
-                    ttl=60,
-                    litellm_parent_otel_span=litellm_parent_otel_span,
-                )  # store in cache for 1 min.
+                values_to_update_in_cache.append((request_count_api_key, new_val))

+            await self.internal_usage_cache.async_batch_set_cache(
+                cache_list=values_to_update_in_cache,
+                ttl=60,
+                litellm_parent_otel_span=litellm_parent_otel_span,
+            )
        except Exception as e:
            self.print_verbose(e)  # noqa