[Perf Proxy] parallel request limiter - use one cache update call (#5932)

* fix parallel request limiter - use one cache update call * ci/cd run again * run ci/cd again * use docker username password * fix config.yml * fix config * fix config * fix config.yml * ci/cd run again * use correct typing for batch set cache * fix async_set_cache_pipeline * fix only check user id tpm / rpm limits when limits set * fix test_openai_azure_embedding_with_oidc_and_cf
2024-09-27 10:26:15 -07:00 · 2024-09-27 10:26:15 -07:00 · f4613a100d
commit f4613a100d
parent 71f68ac185
7 changed files with 56 additions and 36 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -3,6 +3,9 @@ jobs:
  local_testing:
    docker:
      - image: cimg/python:3.11
        auth:
          username: ${DOCKERHUB_USERNAME}
          password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
@ -114,6 +117,9 @@ jobs:
  ui_endpoint_testing:
    docker:
      - image: cimg/python:3.11
        auth:
          username: ${DOCKERHUB_USERNAME}
          password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
@ -152,6 +158,9 @@ jobs:
  litellm_router_testing: # Runs all tests with the "router" keyword
    docker:
        - image: cimg/python:3.11
          auth:
            username: ${DOCKERHUB_USERNAME}
            password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
@ -179,6 +188,9 @@ jobs:
  litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword
    docker:
        - image: cimg/python:3.11
          auth:
            username: ${DOCKERHUB_USERNAME}
            password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
@ -206,6 +218,9 @@ jobs:
  load_testing:
    docker:
      - image: cimg/python:3.11
        auth:
          username: ${DOCKERHUB_USERNAME}
          password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
@ -233,6 +248,9 @@ jobs:
  llm_translation_testing:
    docker:
      - image: cimg/python:3.11
        auth:
          username: ${DOCKERHUB_USERNAME}
          password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -123,7 +123,7 @@ class InMemoryCache(BaseCache):
    async def async_set_cache(self, key, value, **kwargs):
        self.set_cache(key=key, value=value, **kwargs)
-    async def async_set_cache_pipeline(self, cache_list, ttl=None):
+    async def async_set_cache_pipeline(self, cache_list, ttl=None, **kwargs):
        for cache_key, cache_value in cache_list:
            if ttl is not None:
                self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
@ -2038,7 +2038,7 @@ class DualCache(BaseCache):
            if self.redis_cache is not None and local_only == False:
                await self.redis_cache.async_set_cache_pipeline(
-                    cache_list=cache_list, ttl=kwargs.get("ttl", None), **kwargs
+                    cache_list=cache_list, ttl=kwargs.pop("ttl", None), **kwargs
                )
        except Exception as e:
            verbose_logger.exception(
--- a/litellm/proxy/hooks/parallel_request_limiter.py
+++ b/litellm/proxy/hooks/parallel_request_limiter.py
@ -327,8 +327,13 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                user_api_key_dict=user_api_key_dict,
            )
            # get user tpm/rpm limits
-            if _user_id_rate_limits is not None and isinstance(
+            if (
-                _user_id_rate_limits, dict
+                _user_id_rate_limits is not None
                and isinstance(_user_id_rate_limits, dict)
                and (
                    _user_id_rate_limits.get("tpm_limit", None) is not None
                    or _user_id_rate_limits.get("rpm_limit", None) is not None
                )
            ):
                user_tpm_limit = _user_id_rate_limits.get("tpm_limit", None)
                user_rpm_limit = _user_id_rate_limits.get("rpm_limit", None)
@ -472,6 +477,8 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
            # Update usage - API Key
            # ------------
            values_to_update_in_cache = []
            if user_api_key is not None:
                request_count_api_key = (
                    f"{user_api_key}::{precise_minute}::request_count"
@ -495,12 +502,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
+                values_to_update_in_cache.append((request_count_api_key, new_val))
                    request_count_api_key,
                    new_val,
                    ttl=60,
                    litellm_parent_otel_span=litellm_parent_otel_span,
                )  # store in cache for 1 min.
            # ------------
            # Update usage - model group + API Key
@ -536,12 +538,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
+                values_to_update_in_cache.append((request_count_api_key, new_val))
                    request_count_api_key,
                    new_val,
                    ttl=60,
                    litellm_parent_otel_span=litellm_parent_otel_span,
                )
            # ------------
            # Update usage - User
@ -574,12 +571,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
+                values_to_update_in_cache.append((request_count_api_key, new_val))
                    request_count_api_key,
                    new_val,
                    ttl=60,
                    litellm_parent_otel_span=litellm_parent_otel_span,
                )  # store in cache for 1 min.
            # ------------
            # Update usage - Team
@ -612,12 +604,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
+                values_to_update_in_cache.append((request_count_api_key, new_val))
                    request_count_api_key,
                    new_val,
                    ttl=60,
                    litellm_parent_otel_span=litellm_parent_otel_span,
                )  # store in cache for 1 min.
            # ------------
            # Update usage - End User
@ -650,13 +637,13 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                self.print_verbose(
                    f"updated_value in success call: {new_val}, precise_minute: {precise_minute}"
                )
-                await self.internal_usage_cache.async_set_cache(
+                values_to_update_in_cache.append((request_count_api_key, new_val))
-                    request_count_api_key,
+
-                    new_val,
+            await self.internal_usage_cache.async_batch_set_cache(
                cache_list=values_to_update_in_cache,
                ttl=60,
                litellm_parent_otel_span=litellm_parent_otel_span,
-                )  # store in cache for 1 min.
+            )
        except Exception as e:
            self.print_verbose(e)  # noqa
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -1,5 +1,5 @@
 model_list:
- - model_name: gpt-3.5-turbo
+ - model_name: db-openai-endpoint
   litellm_params:
     model: openai/gpt-3.5-turbo
     api_key: fake-key
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -242,6 +242,20 @@ class InternalUsageCache:
            **kwargs,
        )
    async def async_batch_set_cache(
        self,
        cache_list: List,
        litellm_parent_otel_span: Union[Span, None],
        local_only: bool = False,
        **kwargs,
    ) -> None:
        return await self.dual_cache.async_batch_set_cache(
            cache_list=cache_list,
            local_only=local_only,
            litellm_parent_otel_span=litellm_parent_otel_span,
            **kwargs,
        )
    async def async_increment_cache(
        self,
        key,
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@ -316,6 +316,7 @@ def test_openai_azure_embedding():
    os.environ.get("CIRCLE_OIDC_TOKEN") is None,
    reason="Cannot run without being in CircleCI Runner",
 )
@pytest.mark.skip(reason="Azure east us 2 has a temp outage")
 def test_openai_azure_embedding_with_oidc_and_cf():
    # TODO: Switch to our own Azure account, currently using ai.moda's account
    os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"