mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-26 03:04:13 +00:00
LiteLLM Minor Fixes & Improvements (12/23/2024) - P2 (#7386)
* fix(main.py): support 'mock_timeout=true' param — allows mock requests on the proxy to have a time delay, for testing
* fix(main.py): ensure mock timeouts raise litellm.Timeout error — this triggers retries/fallbacks
* fix: fix fallback + mock timeout testing
* fix(router.py): always return remaining tpm/rpm limits, if limits are known — allows rate limit headers to be guaranteed
* docs(timeout.md): add docs on mock timeout = true
* fix(main.py): fix linting errors
* test: fix test
This commit is contained in:
parent
db59e08958
commit
48316520f4
7 changed files with 223 additions and 54 deletions
|
@ -2613,6 +2613,8 @@ class Router:
|
|||
"content_policy_fallbacks", self.content_policy_fallbacks
|
||||
)
|
||||
|
||||
mock_timeout = kwargs.pop("mock_timeout", None)
|
||||
|
||||
try:
|
||||
self._handle_mock_testing_fallbacks(
|
||||
kwargs=kwargs,
|
||||
|
@ -2622,7 +2624,9 @@ class Router:
|
|||
content_policy_fallbacks=content_policy_fallbacks,
|
||||
)
|
||||
|
||||
response = await self.async_function_with_retries(*args, **kwargs)
|
||||
response = await self.async_function_with_retries(
|
||||
*args, **kwargs, mock_timeout=mock_timeout
|
||||
)
|
||||
verbose_router_logger.debug(f"Async Response: {response}")
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -2993,7 +2997,9 @@ class Router:
|
|||
if inspect.iscoroutinefunction(response) or inspect.isawaitable(response):
|
||||
response = await response
|
||||
## PROCESS RESPONSE HEADERS
|
||||
await self.set_response_headers(response=response, model_group=model_group)
|
||||
response = await self.set_response_headers(
|
||||
response=response, model_group=model_group
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
|
@ -4567,11 +4573,15 @@ class Router:
|
|||
rpm_limit = None
|
||||
|
||||
returned_dict = {}
|
||||
if tpm_limit is not None and current_tpm is not None:
|
||||
returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - current_tpm
|
||||
if tpm_limit is not None:
|
||||
returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - (
|
||||
current_tpm or 0
|
||||
)
|
||||
returned_dict["x-ratelimit-limit-tokens"] = tpm_limit
|
||||
if rpm_limit is not None and current_rpm is not None:
|
||||
returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - current_rpm
|
||||
if rpm_limit is not None:
|
||||
returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - (
|
||||
current_rpm or 0
|
||||
)
|
||||
returned_dict["x-ratelimit-limit-requests"] = rpm_limit
|
||||
|
||||
return returned_dict
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue