diff --git a/litellm/router.py b/litellm/router.py
index 1d751930e9..5d1adf2b54 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1705,7 +1705,6 @@ class Router:
             deployment = self.leastbusy_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            return deployment
         elif self.routing_strategy == "simple-shuffle":
             # if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
             ############## Check if we can do a RPM/TPM based weighted pick #################
@@ -1744,24 +1743,24 @@ class Router:
             self.routing_strategy == "latency-based-routing"
             and self.lowestlatency_logger is not None
         ):
-            min_deployment = self.lowestlatency_logger.get_available_deployments(
+            deployment = self.lowestlatency_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            if min_deployment is None:
-                min_deployment = random.choice(healthy_deployments)
-            return min_deployment
         elif (
             self.routing_strategy == "usage-based-routing"
             and self.lowesttpm_logger is not None
         ):
-            min_deployment = self.lowesttpm_logger.get_available_deployments(
-                model_group=model, healthy_deployments=healthy_deployments
+            deployment = self.lowesttpm_logger.get_available_deployments(
+                model_group=model,
+                healthy_deployments=healthy_deployments,
+                messages=messages,
+                input=input,
             )
 
-        raise ValueError("No models available.")
+        if deployment is None:
+            raise ValueError("No models available.")
+
+        return deployment
 
     def flush_cache(self):
         litellm.cache = None
diff --git a/litellm/router_strategy/lowest_tpm_rpm.py b/litellm/router_strategy/lowest_tpm_rpm.py
index 1c287b28e9..4b492cdedb 100644
--- a/litellm/router_strategy/lowest_tpm_rpm.py
+++ b/litellm/router_strategy/lowest_tpm_rpm.py
@@ -2,11 +2,12 @@
 # identifies lowest tpm deployment
 
 import dotenv, os, requests, random
-from typing import Optional
+from typing import Optional, Union, List, Dict
 from datetime import datetime
 
 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
+from litellm import token_counter
 from litellm.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 
@@ -118,7 +119,13 @@ class LowestTPMLoggingHandler(CustomLogger):
             traceback.print_exc()
             pass
 
-    def get_available_deployments(self, model_group: str, healthy_deployments: list):
+    def get_available_deployments(
+        self,
+        model_group: str,
+        healthy_deployments: list,
+        messages: Optional[List[Dict[str, str]]] = None,
+        input: Optional[Union[str, List]] = None,
+    ):
         """
         Returns a deployment with the lowest TPM/RPM usage.
         """
@@ -145,6 +152,7 @@ class LowestTPMLoggingHandler(CustomLogger):
             if d["model_info"]["id"] not in all_deployments:
                 all_deployments[d["model_info"]["id"]] = 0
 
+        input_tokens = token_counter(messages=messages, text=input)
         for item, item_tpm in all_deployments.items():
             ## get the item from model list
             _deployment = None
@@ -173,12 +181,11 @@ class LowestTPMLoggingHandler(CustomLogger):
                 deployment = _deployment
                 break
             elif (
-                item_tpm > _deployment_tpm or rpm_dict[item] + 1 >= _deployment_rpm
+                item_tpm + input_tokens > _deployment_tpm
+                or rpm_dict[item] + 1 >= _deployment_rpm
             ):  # if user passed in tpm / rpm in the model_list
                 continue
             elif item_tpm < lowest_tpm:
                 lowest_tpm = item_tpm
                 deployment = _deployment
-        if deployment is None:
-            deployment = random.choice(healthy_deployments)
         return deployment
diff --git a/litellm/tests/test_tpm_rpm_routing.py b/litellm/tests/test_tpm_rpm_routing.py
index 4c48a1c9d4..6f45d1658a 100644
--- a/litellm/tests/test_tpm_rpm_routing.py
+++ b/litellm/tests/test_tpm_rpm_routing.py
@@ -215,11 +215,64 @@ def test_router_get_available_deployments():
 
 # test_get_available_deployments()
 
-
-
 # test_router_get_available_deployments()
 
 
+def test_router_skip_rate_limited_deployments():
+    """
+    Test that the router's 'get_available_deployment' raises a 'No models available' error when the message would push the deployment past its max TPM
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="usage-based-routing",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": deployment_id},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 1439}}
+    end_time = time.time()
+    router.lowesttpm_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ## - the only deployment is at its TPM limit, so no model should be returned
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    try:
+        router.get_available_deployment(
+            model="azure-model",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+        pytest.fail(f"Should have raised No Models Available error")
+    except Exception as e:
+        pass
+
+
 @pytest.mark.asyncio
 async def test_router_completion_streaming():
     messages = [
diff --git a/litellm/utils.py b/litellm/utils.py
index 1dc2b8470d..46fdd87f45 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2504,7 +2504,11 @@ def openai_token_counter(
     return num_tokens
 
 
-def token_counter(model="", text=None, messages: Optional[List] = None):
+def token_counter(
+    model="",
+    text: Optional[Union[str, List[str]]] = None,
+    messages: Optional[List] = None,
+):
     """
     Count the number of tokens in a given text using a specified model.
 
@@ -2533,6 +2537,8 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
                             text += function_arguments
         else:
             raise ValueError("text and messages cannot both be None")
+    elif isinstance(text, List):
+        text = "".join(t for t in text if isinstance(t, str))
     num_tokens = 0
     if model is not None:
         tokenizer_json = _select_tokenizer(model=model)
@@ -2545,13 +2551,13 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
                 or model in litellm.azure_llms
             ):
                 num_tokens = openai_token_counter(
-                    text=text, model=model, messages=messages, is_tool_call=is_tool_call
+                    text=text, model=model, messages=messages, is_tool_call=is_tool_call  # type: ignore
                 )
             else:
                 enc = tokenizer_json["tokenizer"].encode(text)
                 num_tokens = len(enc)
     else:
-        num_tokens = len(encoding.encode(text))
+        num_tokens = len(encoding.encode(text))  # type: ignore
     return num_tokens
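
A rough usage sketch of the behavior this patch introduces (illustrative only, not part of the patch itself; the model name and strings are placeholders):

from litellm import token_counter

# token_counter now also accepts a list of strings for `text` (e.g. an embedding
# input) and joins the string items before counting.
assert token_counter(model="gpt-3.5-turbo", text=["hello ", "world"]) == token_counter(
    model="gpt-3.5-turbo", text="hello world"
)

# Usage-based routing now estimates the prospective request's tokens with token_counter
# and skips any deployment whose TPM limit would be exceeded; if every deployment is
# skipped, Router.get_available_deployment raises ValueError("No models available.")
# instead of silently falling back to a random healthy deployment, as exercised by
# test_router_skip_rate_limited_deployments above.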