feat(lowest_latency.py): support expanded time window for latency based routing

uses a 1-hour average of each deployment's latency to decide which deployment to route to

https://github.com/BerriAI/litellm/issues/1361
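
In outline, the idea (a hedged sketch, not the actual litellm routing code): keep a list of latencies per deployment inside a cache entry that expires after the window, then route to the deployment with the lowest average.

from statistics import mean

def pick_lowest_latency(latency_map: dict) -> str:
    # latency_map: deployment id -> latencies (seconds) observed within the window
    return min(latency_map, key=lambda dep: mean(latency_map[dep]))

assert pick_lowest_latency({"east": [0.9, 1.1], "west": [0.4, 0.6]}) == "west"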
Krrish Dholakia 2024-01-08 22:52:32 +05:30 committed by ishaan-jaff
parent c6b9cf55b5
commit 9ddcdc4716
2 changed files with 21 additions and 10 deletions

lowest_latency.py

@@ -50,9 +50,12 @@ class LowestLatencyLoggingHandler(CustomLogger):
             ## Latency
             request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
-            request_count_dict[id] = response_ms
+            if id in request_count_dict and isinstance(request_count_dict[id], list):
+                request_count_dict[id] = request_count_dict[id] + [response_ms]
+            else:
+                request_count_dict[id] = [response_ms]
-            self.router_cache.set_cache(key=latency_key, value=request_count_dict)
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds)  # reset map within window
             ### TESTING ###
             if self.test_flag:
@@ -90,9 +93,12 @@ class LowestLatencyLoggingHandler(CustomLogger):
             ## Latency
             request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
-            request_count_dict[id] = response_ms
+            if id in request_count_dict and isinstance(request_count_dict[id], list):
+                request_count_dict[id] = request_count_dict[id] + [response_ms]
+            else:
+                request_count_dict[id] = [response_ms]
-            self.router_cache.set_cache(key=latency_key, value=request_count_dict)
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds)  # reset map within window
             ### TESTING ###
             if self.test_flag:
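
Both hunks above apply the same append-or-initialize pattern. A minimal standalone sketch, with a plain dict standing in for self.router_cache and illustrative names (record_latency is not a litellm function); the TTL reset is noted in a comment rather than simulated:

from typing import Dict, List

def record_latency(cache: Dict[str, Dict[str, List[float]]], latency_key: str, deployment_id: str, response_ms: float) -> None:
    latency_map = cache.get(latency_key) or {}
    # Append to the deployment's latency list if one exists, else start it.
    if isinstance(latency_map.get(deployment_id), list):
        latency_map[deployment_id] = latency_map[deployment_id] + [response_ms]
    else:
        latency_map[deployment_id] = [response_ms]
    # The real code passes ttl=self.default_cache_time_seconds here, so the
    # whole map expires and the measurement window resets.
    cache[latency_key] = latency_map

cache: Dict[str, Dict[str, List[float]]] = {}
record_latency(cache, "gpt-4_latency_map", "deployment-1", 250.0)
record_latency(cache, "gpt-4_latency_map", "deployment-1", 350.0)
assert cache["gpt-4_latency_map"]["deployment-1"] == [250.0, 350.0]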
@@ -123,7 +129,7 @@ class LowestLatencyLoggingHandler(CustomLogger):
         for d in healthy_deployments:
             ## if healthy deployment not yet used
             if d["model_info"]["id"] not in all_deployments:
-                all_deployments[d["model_info"]["id"]] = 0
+                all_deployments[d["model_info"]["id"]] = [0]

         for item, item_latency in all_deployments.items():
             ## get the item from model list
@@ -134,10 +140,15 @@ class LowestLatencyLoggingHandler(CustomLogger):
             if _deployment is None:
                 continue  # skip to next one
-            if isinstance(item_latency, timedelta):
-                item_latency = float(item_latency.total_seconds())
+            # get average latency
+            total = 0
+            for _call_latency in item_latency:
+                if isinstance(_call_latency, timedelta):
+                    total += float(_call_latency.total_seconds())
+                elif isinstance(_call_latency, float):
+                    total += _call_latency
+            item_latency = total / len(item_latency)
             if item_latency == 0:
                 deployment = _deployment
                 break
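
The averaging loop above accepts both timedelta objects (as the test below stores) and plain floats. A self-contained restatement of that computation, mirroring the loop line for line:

from datetime import timedelta

def average_latency(samples: list) -> float:
    # Sum timedeltas as seconds and floats as-is, then divide by the count.
    total = 0.0
    for sample in samples:
        if isinstance(sample, timedelta):
            total += float(sample.total_seconds())
        elif isinstance(sample, float):
            total += sample
    return total / len(samples)

assert average_latency([timedelta(seconds=1), 3.0]) == 2.0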


@@ -49,7 +49,7 @@ def test_latency_updated():
         end_time=end_time,
     )
     latency_key = f"{model_group}_latency_map"
-    assert end_time - start_time == test_cache.get_cache(key=latency_key)[deployment_id]
+    assert end_time - start_time == test_cache.get_cache(key=latency_key)[deployment_id][0]
 # test_tpm_rpm_updated()
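
The [0] index exists because each deployment now maps to a list of latencies; a hedged restatement of what the updated assertion checks, with a plain dict in place of test_cache:

from datetime import datetime, timedelta

start_time = datetime(2024, 1, 8, 22, 0, 0)
end_time = start_time + timedelta(seconds=2)
latency_map = {"deployment-1": [end_time - start_time]}
assert end_time - start_time == latency_map["deployment-1"][0]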