Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 10:44:24 +00:00
Merge pull request #3954 from BerriAI/litellm_simple_request_prioritization
feat(scheduler.py): add request prioritization scheduler
Commit 8375e9621c
12 changed files with 612 additions and 149 deletions
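The headline file in this PR, scheduler.py, is not among the router.py hunks shown below, so what follows is only a hedged sketch of what request prioritization along these lines can look like: each request carries an integer priority, and a worker drains an asyncio.PriorityQueue so lower numbers are dispatched first. All names here (QueuedRequest, worker) are illustrative, not litellm's actual API.

import asyncio
import itertools
from dataclasses import dataclass, field
from typing import Any

_counter = itertools.count()  # tie-breaker so equal priorities stay FIFO

@dataclass(order=True)
class QueuedRequest:
    # Hypothetical request wrapper; ordering uses (priority, sequence) only.
    priority: int
    sequence: int = field(default_factory=lambda: next(_counter))
    payload: Any = field(compare=False, default=None)

async def worker(queue: "asyncio.PriorityQueue[QueuedRequest]") -> None:
    # Dispatches whatever the queue yields next, lowest priority number first.
    while True:
        request = await queue.get()
        print(f"dispatching priority={request.priority}: {request.payload}")
        queue.task_done()

async def main() -> None:
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
    consumer = asyncio.create_task(worker(queue))
    await queue.put(QueuedRequest(priority=5, payload="background batch job"))
    await queue.put(QueuedRequest(priority=0, payload="interactive request"))
    await queue.join()  # priority 0 is dispatched before priority 5
    consumer.cancel()

asyncio.run(main())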
@@ -641,7 +641,6 @@ class Router:
             kwargs=kwargs,
             client_type="max_parallel_requests",
         )

         if rpm_semaphore is not None and isinstance(
             rpm_semaphore, asyncio.Semaphore
         ):
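The hunk above shows the router fetching a per-deployment asyncio.Semaphore (client_type="max_parallel_requests") and, when one exists, dispatching inside it. Below is a minimal, self-contained sketch of that pattern; fake_llm_call is a stand-in for the actual request, not a litellm function.

import asyncio

async def fake_llm_call(i: int) -> str:
    await asyncio.sleep(0.1)  # pretend network latency
    return f"response {i}"

async def guarded_call(semaphore: asyncio.Semaphore, i: int) -> str:
    async with semaphore:  # blocks once the concurrency cap is reached
        return await fake_llm_call(i)

async def main() -> None:
    semaphore = asyncio.Semaphore(2)  # at most 2 requests in flight
    results = await asyncio.gather(*(guarded_call(semaphore, i) for i in range(5)))
    print(results)

asyncio.run(main())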
@@ -1987,6 +1986,7 @@ class Router:
                     error=e,
                     healthy_deployments=_healthy_deployments,
                     context_window_fallbacks=context_window_fallbacks,
+                    regular_fallbacks=fallbacks,
                 )

                 # decides how long to sleep before retry
@@ -1996,7 +1996,6 @@ class Router:
                     num_retries=num_retries,
                     healthy_deployments=_healthy_deployments,
                 )

                 # sleeps for the length of the timeout
                 await asyncio.sleep(_timeout)

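The two hunks above are the retry loop: a helper decides how long to wait, then asyncio.sleep(_timeout) waits it out. The body of litellm's _time_to_sleep_before_retry is not shown in this excerpt; the sketch below assumes plain exponential backoff with jitter, which is one common choice for such a helper.

import random

def time_to_sleep_before_retry(
    attempt: int, base: float = 0.5, cap: float = 30.0
) -> float:
    # Exponential backoff, capped, with jitter to avoid thundering herds.
    delay = min(cap, base * (2 ** attempt))
    return delay * random.uniform(0.5, 1.0)

for attempt in range(4):
    print(f"attempt {attempt}: sleep {time_to_sleep_before_retry(attempt):.2f}s")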
@@ -2041,6 +2040,7 @@ class Router:
                         healthy_deployments=_healthy_deployments,
                     )
                     await asyncio.sleep(_timeout)

             try:
                 cooldown_deployments = await self._async_get_cooldown_deployments()
                 original_exception.message += f"\nNumber Retries = {current_attempt + 1}, Max Retries={num_retries}\nCooldown Deployments={cooldown_deployments}"
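This hunk enriches the final exception with retry and cooldown state before it propagates, so callers can see why every attempt failed. A toy restatement follows; note that litellm's exception types expose a mutable .message attribute, which plain Python exceptions do not, so this sketch rebuilds the message instead.

num_retries = 2
cooldown_deployments = ["azure/gpt-4-eu"]  # hypothetical cooled-down deployment

try:
    raise RuntimeError("RateLimitError: 429 from all deployments")
except RuntimeError as original_exception:
    # Append retry/cooldown context to the message before re-raising.
    detail = (
        f"\nNumber Retries = {num_retries + 1}, Max Retries={num_retries}"
        f"\nCooldown Deployments={cooldown_deployments}"
    )
    enriched = RuntimeError(str(original_exception) + detail)
    print(enriched)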
@@ -2053,6 +2053,7 @@ class Router:
         error: Exception,
         healthy_deployments: Optional[List] = None,
         context_window_fallbacks: Optional[List] = None,
+        regular_fallbacks: Optional[List] = None,
     ):
         """
         1. raise an exception for ContextWindowExceededError if context_window_fallbacks is not None
@@ -2069,7 +2070,7 @@ class Router:
         ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR w/ fallbacks available / Bad Request Error
         if (
             isinstance(error, litellm.ContextWindowExceededError)
-            and context_window_fallbacks is None
+            and context_window_fallbacks is not None
         ):
             raise error

@@ -2077,7 +2078,11 @@ class Router:
         if isinstance(error, openai.RateLimitError) or isinstance(
             error, openai.AuthenticationError
         ):
-            if _num_healthy_deployments <= 0:
+            if (
+                _num_healthy_deployments <= 0
+                and regular_fallbacks is not None
+                and len(regular_fallbacks) > 0
+            ):
                 raise error

         return True
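Read together, the last three hunks redefine should_retry_this_error around one rule: fail fast, by raising, whenever a configured fallback can handle the error better than another retry would. Below is a condensed standalone restatement with litellm's and openai's exception types stubbed out; it simplifies the real signature and omits any checks not visible in this excerpt.

from typing import List, Optional

class ContextWindowExceededError(Exception): ...  # stub for litellm's type
class RateLimitError(Exception): ...              # stub for openai's type

def should_retry_this_error(
    error: Exception,
    healthy_deployments: Optional[List] = None,
    context_window_fallbacks: Optional[List] = None,
    regular_fallbacks: Optional[List] = None,
) -> bool:
    num_healthy = len(healthy_deployments or [])

    # 1. Context-window errors: retrying the same model cannot help,
    #    so raise immediately and let the context-window fallback run.
    if isinstance(error, ContextWindowExceededError) and context_window_fallbacks is not None:
        raise error

    # 2. Rate-limit errors with no healthy deployment left: retrying just
    #    burns time, so raise immediately and let the regular fallbacks run.
    if isinstance(error, RateLimitError):
        if num_healthy <= 0 and regular_fallbacks:
            raise error

    return True  # otherwise a retry is worthwhile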
@@ -2252,6 +2257,7 @@ class Router:
                     error=e,
                     healthy_deployments=_healthy_deployments,
                     context_window_fallbacks=context_window_fallbacks,
+                    regular_fallbacks=fallbacks,
                 )

                 # decides how long to sleep before retry
@@ -2460,7 +2466,7 @@ class Router:

         the exception is not one that should be immediately retried (e.g. 401)
         """
         args = locals()

         if deployment is None:
             return

@@ -2631,7 +2637,17 @@ class Router:
         """
         for _callback in litellm.callbacks:
             if isinstance(_callback, CustomLogger):
-                response = await _callback.async_pre_call_check(deployment)
+                try:
+                    response = await _callback.async_pre_call_check(deployment)
+                except litellm.RateLimitError as e:
+                    self._set_cooldown_deployments(
+                        exception_status=e.status_code,
+                        deployment=deployment["model_info"]["id"],
+                        time_to_cooldown=self.cooldown_time,
+                    )
+                    raise e
+                except Exception as e:
+                    raise e

     def set_client(self, model: dict):
         """
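The final hunk wraps the pre-call check so that a RateLimitError raised by a callback puts the deployment on cooldown before the error is re-raised. _set_cooldown_deployments itself is not shown in this excerpt; the sketch below assumes the simplest possible bookkeeping, a map from deployment id to the time at which it becomes usable again.

import time
from typing import Dict, Optional

class CooldownTracker:
    """Hypothetical stand-in for litellm's cooldown bookkeeping."""

    def __init__(self, cooldown_time: float = 60.0) -> None:
        self.cooldown_time = cooldown_time
        self._usable_at: Dict[str, float] = {}

    def set_cooldown(self, deployment_id: str, time_to_cooldown: Optional[float] = None) -> None:
        # Mark the deployment unusable for the given (or default) duration.
        duration = time_to_cooldown if time_to_cooldown is not None else self.cooldown_time
        self._usable_at[deployment_id] = time.monotonic() + duration

    def is_cooling_down(self, deployment_id: str) -> bool:
        return time.monotonic() < self._usable_at.get(deployment_id, 0.0)

tracker = CooldownTracker(cooldown_time=5.0)
tracker.set_cooldown("azure/gpt-4-deployment-1")  # e.g. after a 429
print(tracker.is_cooling_down("azure/gpt-4-deployment-1"))  # True for the next 5s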