Merge pull request #1534 from BerriAI/litellm_custom_cooldown_times
[Feat] Litellm.Router set custom cooldown times
commit 435d4b9279
3 changed files with 138 additions and 6 deletions
@@ -603,10 +603,11 @@ def __init__(
     timeout: Optional[float] = None,
     default_litellm_params={},  # default params for Router.chat.completion.create
     fallbacks: List = [],
-    allowed_fails: Optional[int] = None,
+    allowed_fails: Optional[int] = None,  # Number of times a deployment can fail before being added to cooldown
+    cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
     context_window_fallbacks: List = [],
     model_group_alias: Optional[dict] = {},
-    retry_after: int = 0,  # min time to wait before retrying a failed request
+    retry_after: int = 0,  # (min) time to wait before retrying a failed request
     routing_strategy: Literal[
         "simple-shuffle",
         "least-busy",
@@ -96,10 +96,13 @@ class Router:
         set_verbose: bool = False,
         debug_level: Literal["DEBUG", "INFO"] = "INFO",
         fallbacks: List = [],
-        allowed_fails: Optional[int] = None,
         context_window_fallbacks: List = [],
         model_group_alias: Optional[dict] = {},
         retry_after: int = 0,  # min time to wait before retrying a failed request
+        allowed_fails: Optional[
+            int
+        ] = None,  # Number of times a deployment can fail before being added to cooldown
+        cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
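
For context (not part of the diff), the two new constructor knobs would be used roughly like this. A minimal sketch: the model name and API key below are placeholders.

from litellm import Router

# Minimal sketch: deployments that fail more than allowed_fails times are
# placed in cooldown for cooldown_time seconds (model/key are placeholders).
router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-placeholder"},
        }
    ],
    allowed_fails=1,   # failures tolerated before a deployment is cooled down
    cooldown_time=30,  # (seconds) how long a failed deployment stays cooled down
)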
@@ -108,6 +111,36 @@ class Router:
         ] = "simple-shuffle",
         routing_strategy_args: dict = {},  # just for latency-based routing
     ) -> None:
+        """
+        Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
+
+        Args:
+            model_list (Optional[list]): List of models to be used. Defaults to None.
+            redis_url (Optional[str]): URL of the Redis server. Defaults to None.
+            redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
+            redis_port (Optional[int]): Port of the Redis server. Defaults to None.
+            redis_password (Optional[str]): Password of the Redis server. Defaults to None.
+            cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
+            cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
+            caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
+            client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
+            num_retries (int): Number of retries for failed requests. Defaults to 0.
+            timeout (Optional[float]): Timeout for requests. Defaults to None.
+            default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
+            set_verbose (bool): Flag to set verbose mode. Defaults to False.
+            debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
+            fallbacks (List): List of fallback options. Defaults to [].
+            context_window_fallbacks (List): List of context window fallback options. Defaults to [].
+            model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
+            retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
+            allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
+            cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
+            routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
+            routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
+
+        Returns:
+            Router: An instance of the litellm.Router class.
+        """
         self.set_verbose = set_verbose
         if self.set_verbose:
             if debug_level == "INFO":
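
As a companion to the docstring above, here is a sketch wiring the documented Redis/caching parameters together (not part of the diff). Assumptions: a reachable Redis instance, and illustrative env var names.

import os

from litellm import Router

# Sketch only: a Router backed by Redis, per the docstring above.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
    }
]

router = Router(
    model_list=model_list,
    redis_host=os.getenv("REDIS_HOST"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
    redis_password=os.getenv("REDIS_PASSWORD"),
    cache_responses=True,  # enable response caching (defaults to False)
    num_retries=2,         # retry failed requests twice
    timeout=30.0,          # per-request timeout in seconds
)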
@@ -163,6 +196,7 @@ class Router:
             self.deployment_latency_map[m["litellm_params"]["model"]] = 0

         self.allowed_fails = allowed_fails or litellm.allowed_fails
+        self.cooldown_time = cooldown_time or 1
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
@@ -1365,6 +1399,7 @@ class Router:
         verbose_router_logger.debug(
             f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
+        cooldown_time = self.cooldown_time or 1
         if updated_fails > self.allowed_fails:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
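
To make the cooldown bookkeeping above concrete: failing deployments are grouped under a per-minute cache key, and with this change the cache entry expires after cooldown_time instead of a hardcoded 1 second. A self-contained toy sketch of that pattern follows; ToyTTLCache is a stand-in written for illustration, not litellm's cache class, and the minute-key format is assumed.

import time


class ToyTTLCache:
    """Toy stand-in for an in-memory TTL cache (not litellm's implementation)."""

    def __init__(self):
        self._store = {}  # key -> (value, expiry_timestamp)

    def set_cache(self, key, value, ttl):
        self._store[key] = (value, time.time() + ttl)

    def get_cache(self, key):
        value, expiry = self._store.get(key, (None, 0.0))
        return value if time.time() < expiry else None


cooldown_time = 0.1  # seconds, matching the test below
cache = ToyTTLCache()

# group cooled-down models by minute to reduce the number of cache/redis calls
current_minute = time.strftime("%H-%M")
cooldown_key = f"{current_minute}:cooldown_models"

cached_value = cache.get_cache(cooldown_key) or []
cache.set_cache(key=cooldown_key, value=cached_value + ["azure/chatgpt-v-2"], ttl=cooldown_time)

print(cache.get_cache(cooldown_key))  # ['azure/chatgpt-v-2'] - still cooling down
time.sleep(cooldown_time)
print(cache.get_cache(cooldown_key))  # None - cooldown expired, deployment usable again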
@@ -1378,13 +1413,19 @@ class Router:
                 else:
                     cached_value = cached_value + [deployment]
                     # save updated value
-                    self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+                    self.cache.set_cache(
+                        value=cached_value, key=cooldown_key, ttl=cooldown_time
+                    )
             except:
                 cached_value = [deployment]
                 # save updated value
-                self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+                self.cache.set_cache(
+                    value=cached_value, key=cooldown_key, ttl=cooldown_time
+                )
         else:
-            self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1)
+            self.failed_calls.set_cache(
+                key=deployment, value=updated_fails, ttl=cooldown_time
+            )

     def _get_cooldown_deployments(self):
         """
@@ -796,3 +796,93 @@ def test_usage_based_routing_fallbacks():

     except Exception as e:
         pytest.fail(f"An exception occurred {e}")
+
+
+def test_custom_cooldown_times():
+    try:
+        # set a custom cooldown: the failed model enters cooldown_models, and after the custom cooldown elapses it is no longer in cooldown_models
+
+        model_list = [
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 24000000,
+            },
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 1,
+            },
+        ]
+
+        litellm.set_verbose = False
+
+        router = Router(
+            model_list=model_list,
+            set_verbose=True,
+            debug_level="INFO",
+            cooldown_time=0.1,
+            redis_host=os.getenv("REDIS_HOST"),
+            redis_password=os.getenv("REDIS_PASSWORD"),
+            redis_port=int(os.getenv("REDIS_PORT")),
+        )
+
+        # make a request - expect it to fail
+        try:
+            response = router.completion(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "content": "Tell me a joke.",
+                        "role": "user",
+                    }
+                ],
+            )
+        except:
+            pass
+
+        # expect 1 model to be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print("cooldown_deployments after failed call: ", cooldown_deployments)
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        selected_cooldown_model = cooldown_deployments[0]
+
+        # wait for 1/2 of the cooldown time
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the cooldown model to still be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting 1/2 of cooldown: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        # wait for 1/2 of the cooldown time again; now we've waited the full cooldown
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the cooldown model to be removed from cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting cooldown time: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 0
+        ), "Expected 0 models to be in cooldown models"
+
+    except Exception as e:
+        print(e)