Litellm dev 02 18 2025 p2 (#8639)

* fix(parallel_request_limiter.py): improve single instance rate limiting by updating in-memory cache instantly

Fixes issue where parallel request limiter had a leak

* fix(parallel_request_limiter.py): fix parallel request limiter to not decrement val on max limit being reached

* test(test_parallel_request_limiter.py): fix test

* test: fix test

* fix(parallel_request_limiter.py): move to using common enum

* test: fix test
This commit is contained in:
Krish Dholakia 2025-02-18 19:12:16 -08:00 committed by GitHub
parent c088442658
commit bf6c013de0
3 changed files with 25 additions and 31 deletions

View file

@@ -146,7 +146,7 @@ async def test_pre_call_hook_rpm_limits():
_api_key = "sk-12345"
_api_key = hash_token(_api_key)
user_api_key_dict = UserAPIKeyAuth(
-        api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=1
+        api_key=_api_key, max_parallel_requests=10, tpm_limit=9, rpm_limit=1
)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler(
@@ -157,16 +157,6 @@ async def test_pre_call_hook_rpm_limits():
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
try:
@@ -202,15 +192,6 @@ async def test_pre_call_hook_rpm_limits_retry_after():
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
@@ -261,13 +242,6 @@ async def test_pre_call_hook_team_rpm_limits():
}
}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}