Litellm dev 02 18 2025 p2 (#8639)

* fix(parallel_request_limiter.py): improve single instance rate limiting by updating in-memory cache instantly

Fixes issue where parallel request limiter had a leak

* fix(parallel_request_limiter.py): fix parallel request limiter to not decrement val on max limit being reached

* test(test_parallel_request_limiter.py): fix test

* test: fix test

* fix(parallel_request_limiter.py): move to using common enum

* test: fix test
This commit is contained in:
Krish Dholakia 2025-02-18 19:12:16 -08:00 committed by GitHub
parent c088442658
commit bf6c013de0
3 changed files with 25 additions and 31 deletions

View file

@@ -146,7 +146,7 @@ async def test_pre_call_hook_rpm_limits():
_api_key = "sk-12345"
_api_key = hash_token(_api_key)
user_api_key_dict = UserAPIKeyAuth(
-        api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=1
+        api_key=_api_key, max_parallel_requests=10, tpm_limit=9, rpm_limit=1
)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler(
@@ -157,16 +157,6 @@ async def test_pre_call_hook_rpm_limits():
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
try:
@@ -202,15 +192,6 @@ async def test_pre_call_hook_rpm_limits_retry_after():
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
@@ -261,13 +242,6 @@ async def test_pre_call_hook_team_rpm_limits():
}
}
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
await asyncio.sleep(2)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}