fix - set latency stats in kwargs

This commit is contained in:
Ishaan Jaff 2024-04-24 20:13:45 -07:00
parent 1c109086ac
commit 5dae1cf303
2 changed files with 19 additions and 1 deletions

View file

@ -454,6 +454,7 @@ class Router:
model=model, model=model,
messages=messages, messages=messages,
specific_deployment=kwargs.pop("specific_deployment", None), specific_deployment=kwargs.pop("specific_deployment", None),
request_kwargs=kwargs,
) )
# debug how often this deployment picked # debug how often this deployment picked
@ -2818,6 +2819,7 @@ class Router:
messages: Optional[List[Dict[str, str]]] = None, messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None, input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False, specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
): ):
""" """
Async implementation of 'get_available_deployments'. Async implementation of 'get_available_deployments'.
@ -2833,6 +2835,7 @@ class Router:
messages=messages, messages=messages,
input=input, input=input,
specific_deployment=specific_deployment, specific_deployment=specific_deployment,
request_kwargs=request_kwargs,
) )
model, healthy_deployments = self._common_checks_available_deployment( model, healthy_deployments = self._common_checks_available_deployment(
@ -2936,6 +2939,7 @@ class Router:
messages: Optional[List[Dict[str, str]]] = None, messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None, input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False, specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
): ):
""" """
Returns the deployment based on routing strategy Returns the deployment based on routing strategy
@ -3022,7 +3026,9 @@ class Router:
and self.lowestlatency_logger is not None and self.lowestlatency_logger is not None
): ):
deployment = self.lowestlatency_logger.get_available_deployments( deployment = self.lowestlatency_logger.get_available_deployments(
model_group=model, healthy_deployments=healthy_deployments model_group=model,
healthy_deployments=healthy_deployments,
request_kwargs=request_kwargs,
) )
elif ( elif (
self.routing_strategy == "usage-based-routing" self.routing_strategy == "usage-based-routing"

View file

@ -272,12 +272,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
healthy_deployments: list, healthy_deployments: list,
messages: Optional[List[Dict[str, str]]] = None, messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None, input: Optional[Union[str, List]] = None,
request_kwargs: Optional[Dict] = None,
): ):
""" """
Returns a deployment with the lowest latency Returns a deployment with the lowest latency
""" """
# get list of potential deployments # get list of potential deployments
latency_key = f"{model_group}_map" latency_key = f"{model_group}_map"
_latency_per_deployment = {}
request_count_dict = self.router_cache.get_cache(key=latency_key) or {} request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
@ -354,4 +356,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
elif item_latency < lowest_latency: elif item_latency < lowest_latency:
lowest_latency = item_latency lowest_latency = item_latency
deployment = _deployment deployment = _deployment
# _latency_per_deployment is used for debugging
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
_latency_per_deployment[_deployment_api_base] = item_latency
if request_kwargs is not None and "metadata" in request_kwargs:
request_kwargs["metadata"][
"_latency_per_deployment"
] = _latency_per_deployment
return deployment return deployment