Mirror of https://github.com/BerriAI/litellm.git
fix - set latency stats in kwargs
This commit is contained in:
parent 1c109086ac
commit 5dae1cf303

2 changed files with 19 additions and 1 deletion
@@ -454,6 +454,7 @@ class Router:
     model=model,
     messages=messages,
     specific_deployment=kwargs.pop("specific_deployment", None),
+    request_kwargs=kwargs,
 )

 # debug how often this deployment picked
@@ -2818,6 +2819,7 @@ class Router:
     messages: Optional[List[Dict[str, str]]] = None,
     input: Optional[Union[str, List]] = None,
     specific_deployment: Optional[bool] = False,
+    request_kwargs: Optional[Dict] = None,
 ):
     """
     Async implementation of 'get_available_deployments'.
@@ -2833,6 +2835,7 @@ class Router:
     messages=messages,
     input=input,
     specific_deployment=specific_deployment,
+    request_kwargs=request_kwargs,
 )

 model, healthy_deployments = self._common_checks_available_deployment(
@@ -2936,6 +2939,7 @@ class Router:
     messages: Optional[List[Dict[str, str]]] = None,
     input: Optional[Union[str, List]] = None,
     specific_deployment: Optional[bool] = False,
+    request_kwargs: Optional[Dict] = None,
 ):
     """
     Returns the deployment based on routing strategy
@@ -3022,7 +3026,9 @@ class Router:
     and self.lowestlatency_logger is not None
 ):
     deployment = self.lowestlatency_logger.get_available_deployments(
-        model_group=model, healthy_deployments=healthy_deployments
+        model_group=model,
+        healthy_deployments=healthy_deployments,
+        request_kwargs=request_kwargs,
     )
 elif (
     self.routing_strategy == "usage-based-routing"
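Taken together, the Router hunks above thread the caller's completion kwargs through deployment selection: the completion path forwards its kwargs as request_kwargs, both deployment-picker methods gain a request_kwargs parameter, and the lowest-latency strategy receives it when asked to pick a deployment. Below is a minimal sketch of that call chain, with simplified signatures and hypothetical StrategySketch / RouterSketch classes standing in for the real litellm classes (not the actual library code):

    from typing import Dict, List, Optional


    class StrategySketch:
        def get_available_deployments(
            self,
            model_group: str,
            healthy_deployments: list,
            request_kwargs: Optional[Dict] = None,  # newly threaded-through kwargs
        ) -> Optional[dict]:
            # the strategy can now read or annotate the caller's request metadata
            ...


    class RouterSketch:
        def __init__(self, strategy: StrategySketch):
            self.lowestlatency_logger = strategy

        def get_available_deployment(
            self,
            model: str,
            messages: Optional[List[Dict[str, str]]] = None,
            request_kwargs: Optional[Dict] = None,
        ) -> Optional[dict]:
            healthy_deployments: list = []  # placeholder for the real health checks
            return self.lowestlatency_logger.get_available_deployments(
                model_group=model,
                healthy_deployments=healthy_deployments,
                request_kwargs=request_kwargs,
            )

        def completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
            # kwargs (including any "metadata" dict) are forwarded so the strategy
            # can attach per-deployment latency stats to them
            return self.get_available_deployment(
                model=model, messages=messages, request_kwargs=kwargs
            )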
@@ -272,12 +272,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
     healthy_deployments: list,
     messages: Optional[List[Dict[str, str]]] = None,
     input: Optional[Union[str, List]] = None,
+    request_kwargs: Optional[Dict] = None,
 ):
     """
     Returns a deployment with the lowest latency
     """
     # get list of potential deployments
     latency_key = f"{model_group}_map"
+    _latency_per_deployment = {}

     request_count_dict = self.router_cache.get_cache(key=latency_key) or {}

@@ -354,4 +356,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
         elif item_latency < lowest_latency:
             lowest_latency = item_latency
             deployment = _deployment
+
+        # _latency_per_deployment is used for debuggig
+        _deployment_api_base = _deployment.get("litellm_params", {}).get(
+            "api_base", ""
+        )
+        _latency_per_deployment[_deployment_api_base] = item_latency
+    if request_kwargs is not None and "metadata" in request_kwargs:
+        request_kwargs["metadata"][
+            "_latency_per_deployment"
+        ] = _latency_per_deployment
     return deployment
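With the handler change above, each candidate deployment's observed latency is recorded under its api_base in _latency_per_deployment, and when the incoming request carries a metadata dict, the whole map is attached to it as request_kwargs["metadata"]["_latency_per_deployment"]. One way to inspect those stats is from a custom callback; the following is a minimal sketch assuming litellm's CustomLogger interface and that the request metadata surfaces in the callback's kwargs under litellm_params (the logger class name and print output are illustrative):

    import litellm
    from litellm.integrations.custom_logger import CustomLogger


    class LatencyDebugLogger(CustomLogger):
        async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
            # request metadata is expected under kwargs["litellm_params"]["metadata"]
            metadata = (kwargs.get("litellm_params") or {}).get("metadata") or {}
            latency_map = metadata.get("_latency_per_deployment")
            if latency_map:
                # e.g. {"https://my-endpoint.openai.azure.com/": 0.42, ...}
                print("per-deployment latency stats:", latency_map)


    # register the callback so successful router calls report the stats
    litellm.callbacks = [LatencyDebugLogger()]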