fix(router.py): correctly raise no model available error
https://github.com/BerriAI/litellm/issues/1289
commit a83e2e07cf (parent 91e6b2e79a)
4 changed files with 86 additions and 21 deletions
@@ -1705,7 +1705,6 @@ class Router:
             deployment = self.leastbusy_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            return deployment
         elif self.routing_strategy == "simple-shuffle":
             # if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
             ############## Check if we can do a RPM/TPM based weighted pick #################

@@ -1744,24 +1743,24 @@ class Router:
             self.routing_strategy == "latency-based-routing"
             and self.lowestlatency_logger is not None
         ):
-            min_deployment = self.lowestlatency_logger.get_available_deployments(
+            deployment = self.lowestlatency_logger.get_available_deployments(
                 model_group=model, healthy_deployments=healthy_deployments
             )
-            if min_deployment is None:
-                min_deployment = random.choice(healthy_deployments)
-            return min_deployment
         elif (
             self.routing_strategy == "usage-based-routing"
             and self.lowesttpm_logger is not None
         ):
-            min_deployment = self.lowesttpm_logger.get_available_deployments(
-                model_group=model, healthy_deployments=healthy_deployments
+            deployment = self.lowesttpm_logger.get_available_deployments(
+                model_group=model,
+                healthy_deployments=healthy_deployments,
+                messages=messages,
+                input=input,
             )
-            if min_deployment is None:
-                min_deployment = random.choice(healthy_deployments)
-            return min_deployment
 
-        raise ValueError("No models available.")
+        if deployment is None:
+            raise ValueError("No models available.")
+
+        return deployment
 
     def flush_cache(self):
         litellm.cache = None
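The net effect of the router.py change: every routing-strategy branch now only assigns a shared deployment variable, and one check at the end raises the error, instead of each branch silently falling back to random.choice(healthy_deployments) and returning early. A minimal, self-contained sketch of that control-flow pattern (the strategy names are real, but pick_deployment and the latency / tpm_used keys are illustrative stand-ins, not the router's actual internals):

from typing import Optional


def pick_deployment(strategy: str, healthy_deployments: list) -> dict:
    """Every branch only assigns `deployment`; a single final check raises."""
    deployment: Optional[dict] = None
    if strategy == "latency-based-routing":
        deployment = min(healthy_deployments, key=lambda d: d["latency"], default=None)
    elif strategy == "usage-based-routing":
        deployment = min(healthy_deployments, key=lambda d: d["tpm_used"], default=None)

    if deployment is None:
        # Previously each branch fell back to random.choice(healthy_deployments);
        # now an empty or fully filtered pool surfaces as an explicit error (issue #1289).
        raise ValueError("No models available.")
    return deployment


# e.g. pick_deployment("usage-based-routing", []) raises ValueError

The single exit point is the fix for the linked issue: a pool with no usable deployment always surfaces as "No models available." rather than a random pick that may already be rate limited.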
@@ -2,11 +2,12 @@
 # identifies lowest tpm deployment
 
 import dotenv, os, requests, random
-from typing import Optional
+from typing import Optional, Union, List, Dict
 from datetime import datetime
 
 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
+from litellm import token_counter
 from litellm.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 

@@ -118,7 +119,13 @@ class LowestTPMLoggingHandler(CustomLogger):
             traceback.print_exc()
             pass
 
-    def get_available_deployments(self, model_group: str, healthy_deployments: list):
+    def get_available_deployments(
+        self,
+        model_group: str,
+        healthy_deployments: list,
+        messages: Optional[List[Dict[str, str]]] = None,
+        input: Optional[Union[str, List]] = None,
+    ):
         """
         Returns a deployment with the lowest TPM/RPM usage.
         """

@@ -145,6 +152,7 @@ class LowestTPMLoggingHandler(CustomLogger):
             if d["model_info"]["id"] not in all_deployments:
                 all_deployments[d["model_info"]["id"]] = 0
 
+        input_tokens = token_counter(messages=messages, text=input)
         for item, item_tpm in all_deployments.items():
             ## get the item from model list
             _deployment = None

@@ -173,12 +181,11 @@ class LowestTPMLoggingHandler(CustomLogger):
                 deployment = _deployment
                 break
             elif (
-                item_tpm > _deployment_tpm or rpm_dict[item] + 1 >= _deployment_rpm
+                item_tpm + input_tokens > _deployment_tpm
+                or rpm_dict[item] + 1 >= _deployment_rpm
             ):  # if user passed in tpm / rpm in the model_list
                 continue
             elif item_tpm < lowest_tpm:
                 lowest_tpm = item_tpm
                 deployment = _deployment
-        if deployment is None:
-            deployment = random.choice(healthy_deployments)
         return deployment
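The lowest-TPM handler change adds a pre-check: it counts the tokens the incoming request would contribute (token_counter(messages=messages, text=input)) and skips any deployment whose current usage plus those tokens would exceed its configured tpm. A simplified, self-contained sketch of that selection rule (pick_lowest_tpm, usage, and limits are illustrative names, not the handler's internal state):

from typing import Dict, Optional


def pick_lowest_tpm(
    usage: Dict[str, int],    # deployment id -> tokens already used this minute
    limits: Dict[str, int],   # deployment id -> configured tpm limit
    input_tokens: int,        # tokens the incoming request would add
) -> Optional[str]:
    """Return the least-used deployment that can still absorb the request."""
    lowest_tpm = float("inf")
    chosen = None
    for item, item_tpm in usage.items():
        if item_tpm + input_tokens > limits.get(item, float("inf")):
            continue  # request would push this deployment past its tpm limit
        if item_tpm < lowest_tpm:
            lowest_tpm = item_tpm
            chosen = item
    return chosen  # None now means "No models available." for the caller


# A 100-token request cannot fit on either deployment below, so the result is None.
print(pick_lowest_tpm({"a": 1439, "b": 1400}, {"a": 1440, "b": 1440}, input_tokens=100))

Returning None mirrors the handler after this commit: the strategy reports that nothing fits, and the Router decides to raise.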
@@ -215,11 +215,64 @@ def test_router_get_available_deployments():
 
 
 # test_get_available_deployments()
 
 
 # test_router_get_available_deployments()
 
 
+def test_router_skip_rate_limited_deployments():
+    """
+    Test if routers 'get_available_deployments' raises No Models Available error if max tpm would be reached by message
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="usage-based-routing",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": deployment_id},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 1439}}
+    end_time = time.time()
+    router.lowesttpm_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ## - should skip 2, and pick 1
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    try:
+        router.get_available_deployment(
+            model="azure-model",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+        pytest.fail(f"Should have raised No Models Available error")
+    except Exception as e:
+        pass
+
+
 @pytest.mark.asyncio
 async def test_router_completion_streaming():
     messages = [
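Not part of the commit: since the router now raises ValueError("No models available."), the same scenario could be asserted more tightly with pytest.raises, assuming the error propagates unwrapped from get_available_deployment. A sketch reusing the setup from the test above:

import time

import pytest
from litellm import Router


def test_rate_limited_deployment_raises():
    # Same single-deployment setup as the test in this commit: a tpm budget of 1440,
    # of which 1439 tokens are already recorded for the current window.
    router = Router(
        model_list=[
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "azure/gpt-turbo",
                    "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                    "api_base": "https://openai-france-1234.openai.azure.com",
                    "tpm": 1440,
                },
                "model_info": {"id": 1},
            },
        ],
        routing_strategy="usage-based-routing",
    )
    router.lowesttpm_logger.log_success_event(
        response_obj={"usage": {"total_tokens": 1439}},
        kwargs={
            "litellm_params": {
                "metadata": {"model_group": "azure-model"},
                "model_info": {"id": 1},
            }
        },
        start_time=time.time(),
        end_time=time.time(),
    )
    # The next request would exceed the limit; pytest.raises makes the expected
    # failure explicit instead of swallowing every Exception.
    with pytest.raises(ValueError, match="No models available"):
        router.get_available_deployment(
            model="azure-model",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )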
@@ -2504,7 +2504,11 @@ def openai_token_counter(
     return num_tokens
 
 
-def token_counter(model="", text=None, messages: Optional[List] = None):
+def token_counter(
+    model="",
+    text: Optional[Union[str, List[str]]] = None,
+    messages: Optional[List] = None,
+):
     """
     Count the number of tokens in a given text using a specified model.
 

@@ -2533,6 +2537,8 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
                 text += function_arguments
         else:
             raise ValueError("text and messages cannot both be None")
+    elif isinstance(text, List):
+        text = "".join(t for t in text if isinstance(t, str))
     num_tokens = 0
     if model is not None:
         tokenizer_json = _select_tokenizer(model=model)

@@ -2545,13 +2551,13 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
             or model in litellm.azure_llms
         ):
             num_tokens = openai_token_counter(
-                text=text, model=model, messages=messages, is_tool_call=is_tool_call
+                text=text, model=model, messages=messages, is_tool_call=is_tool_call  # type: ignore
            )
         else:
             enc = tokenizer_json["tokenizer"].encode(text)
             num_tokens = len(enc)
     else:
-        num_tokens = len(encoding.encode(text))
+        num_tokens = len(encoding.encode(text))  # type: ignore
     return num_tokens
 
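With the widened token_counter signature, text may be a single string or a list of strings (non-string entries are dropped before encoding), which is what lets the TPM router call token_counter(messages=messages, text=input). A usage sketch, assuming litellm is installed and the model name maps to a known tokenizer; the printed counts are whatever the tokenizer returns, not fixed values:

from litellm import token_counter

# `text` as a list of strings is now accepted; entries are joined before encoding.
n_list = token_counter(model="gpt-3.5-turbo", text=["Hey, ", "how's it going?"])

# Message-based counting (the path the usage-based router relies on) is unchanged.
n_msgs = token_counter(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)

print(n_list, n_msgs)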