LiteLLM Minor Fixes & Improvements (12/23/2024) - P2 (#7386)

* fix(main.py): support 'mock_timeout=true' param

allows mock requests on the proxy to be artificially delayed, for testing timeout handling
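
A hedged sketch of exercising this against a running LiteLLM proxy (the URL, key, and model below are illustrative placeholders, and passing litellm params through `extra_body` is an assumption about the proxy's body passthrough):

```python
import openai

client = openai.OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

# mock_timeout=True asks the proxy to stall the mock reply past the
# request's `timeout` budget, so timeout handling can be tested without
# touching a real provider.
client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"mock_timeout": True, "timeout": 0.5},
)
```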

* fix(main.py): ensure mock timeouts raise a litellm.Timeout error

so that retries/fallbacks are triggered
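
In SDK terms, the fix means a mock timeout surfaces as the same exception type a real provider timeout would; a minimal sketch (the model name and the need for `mock_response` alongside `mock_timeout` are assumptions):

```python
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",  # illustrative
        messages=[{"role": "user", "content": "Hello!"}],
        mock_response="never returned",  # assumed to still be required on the mock path
        mock_timeout=True,
        timeout=0.1,  # seconds
    )
except litellm.Timeout as e:
    # Router retry/fallback logic is keyed on this exception type, so a
    # mock timeout now exercises the same code path as a real one.
    print(f"caught expected timeout: {e}")
```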

* fix: fix fallback + mock timeout testing

* fix(router.py): always return remaining tpm/rpm limits when the limits are known, even if current usage is unknown

guarantees rate limit headers are set on responses
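
In isolation, the new header logic (from the router.py hunk below) behaves like this; the numbers are illustrative:

```python
tpm_limit, current_tpm = 1000, None  # limit configured, no usage recorded yet

headers = {}
if tpm_limit is not None:
    # Unknown usage now defaults to 0, so the headers are emitted whenever
    # a limit is configured (previously they were skipped if usage was None).
    headers["x-ratelimit-remaining-tokens"] = tpm_limit - (current_tpm or 0)
    headers["x-ratelimit-limit-tokens"] = tpm_limit

print(headers)
# {'x-ratelimit-remaining-tokens': 1000, 'x-ratelimit-limit-tokens': 1000}
```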

* docs(timeout.md): add docs on mock_timeout=true

* fix(main.py): fix linting errors

* test: fix test
Krish Dholakia · 2024-12-23 17:41:27 -08:00 · committed by GitHub
parent db59e08958
commit 48316520f4
7 changed files with 223 additions and 54 deletions

litellm/router.py

@@ -2613,6 +2613,8 @@ class Router:
             "content_policy_fallbacks", self.content_policy_fallbacks
         )
+        mock_timeout = kwargs.pop("mock_timeout", None)
+
         try:
             self._handle_mock_testing_fallbacks(
                 kwargs=kwargs,
@@ -2622,7 +2624,9 @@
                 content_policy_fallbacks=content_policy_fallbacks,
             )
-            response = await self.async_function_with_retries(*args, **kwargs)
+            response = await self.async_function_with_retries(
+                *args, **kwargs, mock_timeout=mock_timeout
+            )
             verbose_router_logger.debug(f"Async Response: {response}")
             return response
         except Exception as e:
@@ -2993,7 +2997,9 @@
         if inspect.iscoroutinefunction(response) or inspect.isawaitable(response):
             response = await response
         ## PROCESS RESPONSE HEADERS
-        await self.set_response_headers(response=response, model_group=model_group)
+        response = await self.set_response_headers(
+            response=response, model_group=model_group
+        )
         return response
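
The reassignment above matters if `set_response_headers` returns a new or wrapped response rather than mutating the one passed in; a generic sketch of the principle (stand-in helper, not LiteLLM's actual implementation):

```python
# Stand-in for Router.set_response_headers (the real method is async);
# it returns the response carrying the metadata it attached.
def set_response_headers(response: dict, model_group: str) -> dict:
    return {**response, "_hidden_params": {"model_group": model_group}}

response = {"choices": []}
# Capture the return value; calling without reassigning would discard
# whatever the helper attached.
response = set_response_headers(response, "gpt-4o")
```
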
@@ -4567,11 +4573,15 @@
         rpm_limit = None
         returned_dict = {}
-        if tpm_limit is not None and current_tpm is not None:
-            returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - current_tpm
+        if tpm_limit is not None:
+            returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - (
+                current_tpm or 0
+            )
             returned_dict["x-ratelimit-limit-tokens"] = tpm_limit
-        if rpm_limit is not None and current_rpm is not None:
-            returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - current_rpm
+        if rpm_limit is not None:
+            returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - (
+                current_rpm or 0
+            )
             returned_dict["x-ratelimit-limit-requests"] = rpm_limit
         return returned_dict