fix(router.py): correctly raise no model available error

https://github.com/BerriAI/litellm/issues/1289
Krrish Dholakia 2024-01-01 21:22:42 +05:30
parent 91e6b2e79a
commit a83e2e07cf
4 changed files with 86 additions and 21 deletions

View file

@@ -1705,7 +1705,6 @@ class Router:
             deployment = self.leastbusy_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            return deployment
         elif self.routing_strategy == "simple-shuffle":
             # if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
             ############## Check if we can do a RPM/TPM based weighted pick #################
@@ -1744,24 +1743,24 @@ class Router:
             self.routing_strategy == "latency-based-routing"
             and self.lowestlatency_logger is not None
         ):
-            min_deployment = self.lowestlatency_logger.get_available_deployments(
+            deployment = self.lowestlatency_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            if min_deployment is None:
-                min_deployment = random.choice(healthy_deployments)
-            return min_deployment
         elif (
             self.routing_strategy == "usage-based-routing"
             and self.lowesttpm_logger is not None
         ):
-            min_deployment = self.lowesttpm_logger.get_available_deployments(
-                model_group=model, healthy_deployments=healthy_deployments
+            deployment = self.lowesttpm_logger.get_available_deployments(
+                model_group=model,
+                healthy_deployments=healthy_deployments,
+                messages=messages,
+                input=input,
             )
-        raise ValueError("No models available.")
+        if deployment is None:
+            raise ValueError("No models available.")
+        return deployment

     def flush_cache(self):
         litellm.cache = None
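Taken together, the router change gives callers a single, explicit failure mode instead of a silent fallback to random.choice(healthy_deployments). A minimal sketch of what that looks like from the caller's side, reusing the deployment config from the test added later in this commit (the key and endpoint values are the test's placeholders):

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "tpm": 1440,
            },
            "model_info": {"id": 1},
        },
    ],
    routing_strategy="usage-based-routing",
)

try:
    deployment = router.get_available_deployment(
        model="azure-model",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
except ValueError as e:
    # Every routing strategy now funnels through the same
    # `if deployment is None: raise ValueError("No models available.")` check.
    print(f"No deployment has capacity for this request: {e}")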

View file

@@ -2,11 +2,12 @@
 # identifies lowest tpm deployment
 import dotenv, os, requests, random
-from typing import Optional
+from typing import Optional, Union, List, Dict
 from datetime import datetime

 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
+from litellm import token_counter
 from litellm.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
@@ -118,7 +119,13 @@ class LowestTPMLoggingHandler(CustomLogger):
             traceback.print_exc()
             pass

-    def get_available_deployments(self, model_group: str, healthy_deployments: list):
+    def get_available_deployments(
+        self,
+        model_group: str,
+        healthy_deployments: list,
+        messages: Optional[List[Dict[str, str]]] = None,
+        input: Optional[Union[str, List]] = None,
+    ):
         """
         Returns a deployment with the lowest TPM/RPM usage.
         """
@@ -145,6 +152,7 @@ class LowestTPMLoggingHandler(CustomLogger):
             if d["model_info"]["id"] not in all_deployments:
                 all_deployments[d["model_info"]["id"]] = 0
+        input_tokens = token_counter(messages=messages, text=input)
         for item, item_tpm in all_deployments.items():
             ## get the item from model list
             _deployment = None
@@ -173,12 +181,11 @@ class LowestTPMLoggingHandler(CustomLogger):
                 deployment = _deployment
                 break
             elif (
-                item_tpm > _deployment_tpm or rpm_dict[item] + 1 >= _deployment_rpm
+                item_tpm + input_tokens > _deployment_tpm
+                or rpm_dict[item] + 1 >= _deployment_rpm
             ):  # if user passed in tpm / rpm in the model_list
                 continue
             elif item_tpm < lowest_tpm:
                 lowest_tpm = item_tpm
                 deployment = _deployment
-        if deployment is None:
-            deployment = random.choice(healthy_deployments)
         return deployment
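For readers skimming the diff, a standalone sketch of the selection rule the handler now applies; the names (pick_lowest_tpm, usage, limits) are illustrative, not the handler's internals. The incoming request's token count is charged against each deployment's tpm limit before that deployment can be chosen, and a None result is what the router turns into the "No models available." error above.

from typing import Optional


def pick_lowest_tpm(
    usage: dict,        # deployment id -> tokens already used this minute
    limits: dict,       # deployment id -> configured tpm limit (None = no limit)
    input_tokens: int,  # tokens the incoming request would add
) -> Optional[str]:
    """Return the least-loaded deployment that can still absorb the request."""
    chosen, lowest = None, float("inf")
    for dep_id, used in usage.items():
        limit = limits.get(dep_id)
        # The key change in this commit: count the *incoming* request's tokens
        # against the limit, so a deployment at 1439/1440 tpm gets skipped.
        if limit is not None and used + input_tokens > limit:
            continue
        if used < lowest:
            lowest, chosen = used, dep_id
    return chosen  # None -> caller raises "No models available."


assert pick_lowest_tpm({"d1": 1439}, {"d1": 1440}, input_tokens=10) is None
assert pick_lowest_tpm({"d1": 100, "d2": 50}, {"d1": 1440, "d2": 1440}, 10) == "d2"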

View file

@@ -215,11 +215,64 @@ def test_router_get_available_deployments():
 # test_get_available_deployments()
 # test_router_get_available_deployments()

+def test_router_skip_rate_limited_deployments():
+    """
+    Test that the router's 'get_available_deployments' raises a No Models Available error
+    when the incoming message would push a deployment past its max tpm.
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="usage-based-routing",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": deployment_id},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 1439}}
+    end_time = time.time()
+    router.lowesttpm_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ## - the only deployment is at its tpm limit, so nothing should be returned
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    try:
+        router.get_available_deployment(
+            model="azure-model",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+        pytest.fail("Should have raised No Models Available error")
+    except Exception as e:
+        pass
+

 @pytest.mark.asyncio
 async def test_router_completion_streaming():
     messages = [
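An optional, tighter variant of the same assertion, sketched with the same fixture values as the test above; the only new piece is pytest.raises(..., match=...), which pins the failure to the "No models available." ValueError so an unrelated exception can't make the test pass by accident.

import time

import pytest
from litellm import Router


def _rate_limited_router() -> Router:
    """Build the same single-deployment router as above and use up its tpm budget."""
    router = Router(
        model_list=[
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "azure/gpt-turbo",
                    "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                    "api_base": "https://openai-france-1234.openai.azure.com",
                    "tpm": 1440,
                },
                "model_info": {"id": 1},
            },
        ],
        routing_strategy="usage-based-routing",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore
    # Log 1439 of the 1440 tpm budget, exactly as the test above does.
    router.lowesttpm_logger.log_success_event(
        response_obj={"usage": {"total_tokens": 1439}},
        kwargs={
            "litellm_params": {
                "metadata": {"model_group": "azure-model"},
                "model_info": {"id": 1},
            }
        },
        start_time=time.time(),
        end_time=time.time(),
    )
    return router


def test_router_skip_rate_limited_deployments_strict():
    with pytest.raises(ValueError, match="No models available"):
        _rate_limited_router().get_available_deployment(
            model="azure-model",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )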

View file

@@ -2504,7 +2504,11 @@ def openai_token_counter(
     return num_tokens

-def token_counter(model="", text=None, messages: Optional[List] = None):
+def token_counter(
+    model="",
+    text: Optional[Union[str, List[str]]] = None,
+    messages: Optional[List] = None,
+):
     """
     Count the number of tokens in a given text using a specified model.
@@ -2533,6 +2537,8 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
                             text += function_arguments
         else:
             raise ValueError("text and messages cannot both be None")
+    elif isinstance(text, List):
+        text = "".join(t for t in text if isinstance(t, str))
     num_tokens = 0
     if model is not None:
         tokenizer_json = _select_tokenizer(model=model)
@@ -2545,13 +2551,13 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
                 or model in litellm.azure_llms
             ):
                 num_tokens = openai_token_counter(
-                    text=text, model=model, messages=messages, is_tool_call=is_tool_call
+                    text=text, model=model, messages=messages, is_tool_call=is_tool_call  # type: ignore
                 )
             else:
                 enc = tokenizer_json["tokenizer"].encode(text)
                 num_tokens = len(enc)
     else:
-        num_tokens = len(encoding.encode(text))
+        num_tokens = len(encoding.encode(text))  # type: ignore
     return num_tokens
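The new List[str] branch simply concatenates the strings before tokenizing, so counting a list and counting its joined string should agree. A small illustration; the model name is only an example, any model litellm can tokenize works the same way.

from litellm import token_counter

chunks = ["Hello world", ", how are you?"]

# The isinstance(text, List) branch joins the strings before encoding,
# so these two calls report the same count.
as_list = token_counter(model="gpt-3.5-turbo", text=chunks)
as_string = token_counter(model="gpt-3.5-turbo", text="".join(chunks))
assert as_list == as_string

# messages= still works as before; it is what the router's usage-based
# strategy passes through when sizing an incoming chat request.
chat_tokens = token_counter(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(as_list, chat_tokens)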