(fix) router - allow users to call a specific_model explicitly

commit 8e6c4c5310 (parent acef6bd58d)

2 changed files with 27 additions and 16 deletions
@@ -861,7 +861,7 @@ async def chat_completion(request: Request, model: Optional[str] = None, user_ap
        if llm_router is not None and data["model"] in router_model_names:  # model in router model list
            response = await llm_router.acompletion(**data)
        elif llm_router is not None and data["model"] in llm_router.deployment_names:  # model in router deployments, calling a specific deployment on the router
-            response = await llm_router.acompletion(**data)
+            response = await llm_router.acompletion(**data, specific_deployment=True)
        else:  # router is not set
            response = await litellm.acompletion(**data)
        if 'stream' in data and data['stream'] == True:  # use generate_responses to stream responses
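With this change a client can pin a chat completion request to one deployment by sending the deployment name (rather than the model group) as "model"; the proxy finds it in llm_router.deployment_names and forwards specific_deployment=True. A minimal client-side sketch, assuming the proxy runs at http://localhost:8000 and its config.yaml defines a deployment whose litellm_params model is azure/chat-gpt-v-2 (both placeholders):

import requests

# "model" names an exact deployment from config.yaml, not a model group,
# so the router must route to that deployment instead of load-balancing.
resp = requests.post(
    "http://localhost:8000/chat/completions",
    json={
        "model": "azure/chat-gpt-v-2",
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.json())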
@@ -923,7 +923,7 @@ async def embeddings(request: Request, user_api_key_dict: UserAPIKeyAuth = Depen
        if llm_router is not None and data["model"] in router_model_names:  # model in router model list
            response = await llm_router.aembedding(**data)
        elif llm_router is not None and data["model"] in llm_router.deployment_names:  # model in router deployments, calling a specific deployment on the router
-            response = await llm_router.aembedding(**data)
+            response = await llm_router.aembedding(**data, specific_deployment=True)
        else:
            response = await litellm.aembedding(**data)
        background_tasks.add_task(log_input_output, request, response)  # background task for logging to OTEL
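The embeddings path behaves the same way. A sketch under the same assumptions, with azure/azure-embedding-model as a hypothetical deployment name from config.yaml:

import requests

# Sending a deployment name as "model" pins the embedding call to that deployment.
resp = requests.post(
    "http://localhost:8000/embeddings",
    json={
        "model": "azure/azure-embedding-model",
        "input": ["hello world"],
    },
)
print(resp.json())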
@@ -184,7 +184,7 @@ class Router:

        try:
            # pick the one that is available (lowest TPM/RPM)
-            deployment = self.get_available_deployment(model=model, messages=messages)
+            deployment = self.get_available_deployment(model=model, messages=messages, specific_deployment=kwargs.pop("specific_deployment", None))
            kwargs.setdefault("metadata", {}).update({"deployment": deployment["litellm_params"]["model"]})
            data = deployment["litellm_params"].copy()
            for k, v in self.default_litellm_params.items():
@@ -232,7 +232,7 @@ class Router:
        try:
            self.print_verbose(f"Inside _acompletion()- model: {model}; kwargs: {kwargs}")
            original_model_string = None  # set a default for this variable
-            deployment = self.get_available_deployment(model=model, messages=messages)
+            deployment = self.get_available_deployment(model=model, messages=messages, specific_deployment=kwargs.pop("specific_deployment", None))
            kwargs.setdefault("metadata", {}).update({"deployment": deployment["litellm_params"]["model"]})
            data = deployment["litellm_params"].copy()
            for k, v in self.default_litellm_params.items():
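At the Router level the flag is an ordinary kwarg, popped out of kwargs before the underlying completion call. A minimal sketch of calling the router directly, with placeholder keys and endpoints:

import asyncio
from litellm import Router

model_list = [
    {   # two deployments behind one model group, "gpt-3.5-turbo"
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "azure/chat-gpt-v-2", "api_key": "...", "api_base": "..."},
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "..."},
    },
]
router = Router(model_list=model_list)

async def main():
    # Without specific_deployment the router load-balances across the group;
    # with it, the call must hit exactly the azure/chat-gpt-v-2 deployment.
    response = await router.acompletion(
        model="azure/chat-gpt-v-2",
        messages=[{"role": "user", "content": "hello"}],
        specific_deployment=True,
    )
    print(response)

asyncio.run(main())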
@@ -268,7 +268,7 @@ class Router:
        kwargs.setdefault("metadata", {}).update({"model_group": model})
        messages = [{"role": "user", "content": prompt}]
        # pick the one that is available (lowest TPM/RPM)
-        deployment = self.get_available_deployment(model=model, messages=messages)
+        deployment = self.get_available_deployment(model=model, messages=messages, specific_deployment=kwargs.pop("specific_deployment", None))

        data = deployment["litellm_params"].copy()
        for k, v in self.default_litellm_params.items():
@@ -301,7 +301,7 @@ class Router:
                  is_async: Optional[bool] = False,
                  **kwargs) -> Union[List[float], None]:
        # pick the one that is available (lowest TPM/RPM)
-        deployment = self.get_available_deployment(model=model, input=input)
+        deployment = self.get_available_deployment(model=model, input=input, specific_deployment=kwargs.pop("specific_deployment", None))
        kwargs.setdefault("metadata", {}).update({"deployment": deployment["litellm_params"]["model"]})
        data = deployment["litellm_params"].copy()
        for k, v in self.default_litellm_params.items():
@@ -326,7 +326,7 @@ class Router:
                  is_async: Optional[bool] = True,
                  **kwargs) -> Union[List[float], None]:
        # pick the one that is available (lowest TPM/RPM)
-        deployment = self.get_available_deployment(model=model, input=input)
+        deployment = self.get_available_deployment(model=model, input=input, specific_deployment=kwargs.pop("specific_deployment", None))
        kwargs.setdefault("metadata", {}).update({"deployment": deployment["litellm_params"]["model"]})
        data = deployment["litellm_params"].copy()
        for k, v in self.default_litellm_params.items():
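The same kwarg works on the embedding methods. A sketch, again with placeholder credentials and a hypothetical deployment name:

import asyncio
from litellm import Router

router = Router(model_list=[{
    "model_name": "embeddings",
    "litellm_params": {"model": "azure/azure-embedding-model", "api_key": "...", "api_base": "..."},
}])

async def main():
    # specific_deployment=True pins the call to this exact deployment
    # instead of load-balancing across the "embeddings" model group.
    response = await router.aembedding(
        model="azure/azure-embedding-model",
        input=["good morning"],
        specific_deployment=True,
    )
    print(response)

asyncio.run(main())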
@@ -358,7 +358,7 @@ class Router:
            self.print_verbose(f'Async Response: {response}')
            return response
        except Exception as e:
-            self.print_verbose(f"An exception occurs")
+            self.print_verbose(f"An exception occurs: {e}")
            original_exception = e
            try:
                self.print_verbose(f"Trying to fallback b/w models")
@@ -1031,6 +1031,8 @@ class Router:
                model_id += str(model["litellm_params"][key])
            model["litellm_params"]["model"] += "-ModelID-" + model_id

+        self.print_verbose(f"\n Initialized Model List {self.model_list}")
+
        ############ Users can either pass tpm/rpm as a litellm_param or a router param ###########
        # for get_available_deployment, we use the litellm_param["rpm"]
        # in this snippet we also set rpm to be a litellm_param
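For context: at init the router appends a "-ModelID-" suffix to every deployment's model string, and the lookup added in the next hunk strips it via litellm.utils.remove_model_id before comparing against the user-supplied name. A sketch of the assumed suffix contract (the real helper lives in litellm.utils; this stand-in only illustrates the behavior the comparison relies on):

def remove_model_id(model: str) -> str:
    # assumed behavior: drop everything from "-ModelID-" onward,
    # recovering the deployment name the user wrote in config.yaml
    return model.split("-ModelID-")[0]

assert remove_model_id("azure/chat-gpt-v-2-ModelID-4729") == "azure/chat-gpt-v-2"
assert remove_model_id("gpt-3.5-turbo") == "gpt-3.5-turbo"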
@@ -1074,16 +1076,32 @@ class Router:
    def get_available_deployment(self,
                                 model: str,
                                 messages: Optional[List[Dict[str, str]]] = None,
-                                 input: Optional[Union[str, List]] = None):
+                                 input: Optional[Union[str, List]] = None,
+                                 specific_deployment: Optional[bool] = False
+                                 ):
        """
        Returns the deployment based on routing strategy
        """
+        # users need to explicitly ask for a specific deployment, by passing specific_deployment=True as a completion()/embedding() kwarg
+        # when this was not explicit, we hit several issues with fallbacks timing out
+        if specific_deployment == True:
+            # users can also specify a specific deployment name; at this point, check whether they are just trying to call that deployment
+            for deployment in self.model_list:
+                cleaned_model = litellm.utils.remove_model_id(deployment.get("litellm_params").get("model"))
+                if cleaned_model == model:
+                    # user passed a specific deployment name on their config.yaml, e.g. azure/chat-gpt-v-2
+                    # return the first deployment whose `model` matches the specified deployment name
+                    return deployment
+            raise ValueError(f"LiteLLM Router: Trying to call specific deployment, but Model:{model} does not exist in Model List: {self.model_list}")
+
        ## get healthy deployments
        ### get all deployments
        ### filter out the deployments currently cooling down
        healthy_deployments = [m for m in self.model_list if m["model_name"] == model]
        if len(healthy_deployments) == 0:
            # check if the user sent in a deployment name instead
            healthy_deployments = [m for m in self.model_list if m["litellm_params"]["model"] == model]

        self.print_verbose(f"initial list of deployments: {healthy_deployments}")

        deployments_to_remove = []
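A quick behavioral sketch of the new fast path: a matching deployment name returns immediately, and an unknown one now fails fast with a ValueError instead of drifting into fallback timeouts. Placeholder credentials again:

from litellm import Router

router = Router(model_list=[{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {"model": "azure/chat-gpt-v-2", "api_key": "...", "api_base": "..."},
}])

# A known deployment name short-circuits routing and returns that deployment.
deployment = router.get_available_deployment(
    model="azure/chat-gpt-v-2", specific_deployment=True,
)
assert deployment["model_name"] == "gpt-3.5-turbo"

# An unknown deployment name raises instead of hanging in fallbacks.
try:
    router.get_available_deployment(model="no/such-deployment", specific_deployment=True)
except ValueError as e:
    print(e)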
@@ -1099,13 +1117,6 @@ class Router:
                    healthy_deployments.remove(deployment)
        self.print_verbose(f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}")
        if len(healthy_deployments) == 0:
-            # users can also specify a specific deployment name; at this point, check whether they are just trying to call a specific deployment
-            for deployment in self.model_list:
-                cleaned_model = litellm.utils.remove_model_id(deployment.get("litellm_params").get("model"))
-                if cleaned_model == model:
-                    # user passed a specific deployment name on their config.yaml, e.g. azure/chat-gpt-v-2
-                    # return the first deployment whose `model` matches the specified deployment name
-                    return deployment
            raise ValueError("No models available")
        if litellm.model_alias_map and model in litellm.model_alias_map:
            model = litellm.model_alias_map[