feat(lowest_latency.py): support expanded time window for latency based routing

Uses a 1hr average of latency per deployment to decide which deployment to route to. Addresses https://github.com/BerriAI/litellm/issues/1361

parent c6b9cf55b5
commit 9ddcdc4716
2 changed files with 21 additions and 10 deletions
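The core idea of the change, sketched below, is to keep a per-deployment list of recent response latencies (rather than a single last value) and route to the deployment with the lowest average over the window. This is a minimal illustration only, not the litellm implementation; the function name and data shapes are made up for the example:

from datetime import timedelta

def pick_lowest_latency_deployment(latency_map: dict, healthy_ids: list):
    # latency_map maps deployment id -> list of latencies recorded within the window
    best_id, best_avg = None, float("inf")
    for deployment_id in healthy_ids:
        latencies = latency_map.get(deployment_id) or [0]
        total = 0.0
        for call_latency in latencies:
            # latencies may be stored as timedeltas (end_time - start_time) or floats
            if isinstance(call_latency, timedelta):
                total += call_latency.total_seconds()
            else:
                total += float(call_latency)
        avg = total / len(latencies)
        if avg < best_avg:
            best_id, best_avg = deployment_id, avg
    return best_id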
@@ -50,9 +50,12 @@ class LowestLatencyLoggingHandler(CustomLogger):
             ## Latency
             request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
-            request_count_dict[id] = response_ms
+            if id in request_count_dict and isinstance(request_count_dict[id], list):
+                request_count_dict[id] = request_count_dict[id] + [response_ms]
+            else:
+                request_count_dict[id] = [response_ms]

-            self.router_cache.set_cache(key=latency_key, value=request_count_dict)
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds)  # reset map within window

             ### TESTING ###
             if self.test_flag:
@@ -90,9 +93,12 @@ class LowestLatencyLoggingHandler(CustomLogger):
             ## Latency
             request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
-            request_count_dict[id] = response_ms
+            if id in request_count_dict and isinstance(request_count_dict[id], list):
+                request_count_dict[id] = request_count_dict[id] + [response_ms]
+            else:
+                request_count_dict[id] = [response_ms]

-            self.router_cache.set_cache(key=latency_key, value=request_count_dict)
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds)  # reset map within window

             ### TESTING ###
             if self.test_flag:
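The set_cache(..., ttl=self.default_cache_time_seconds) call above is what bounds the window: once the cache entry expires, the accumulated latency list resets. A minimal sketch of that behaviour with a TTL'd in-memory cache follows; it is illustrative only and not litellm's actual cache class, and the class and attribute names are made up:

import time

class SimpleTTLCache:
    # minimal in-memory cache: entries expire ttl seconds after being set
    def __init__(self):
        self._store = {}  # key -> (value, expiry timestamp or None)

    def set_cache(self, key, value, ttl=None):
        expiry = time.time() + ttl if ttl is not None else None
        self._store[key] = (value, expiry)

    def get_cache(self, key):
        item = self._store.get(key)
        if item is None:
            return None
        value, expiry = item
        if expiry is not None and time.time() > expiry:
            # entry outlived its window; drop it so the latency list starts fresh
            del self._store[key]
            return None
        return value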
@@ -123,7 +129,7 @@ class LowestLatencyLoggingHandler(CustomLogger):
         for d in healthy_deployments:
             ## if healthy deployment not yet used
             if d["model_info"]["id"] not in all_deployments:
-                all_deployments[d["model_info"]["id"]] = 0
+                all_deployments[d["model_info"]["id"]] = [0]

         for item, item_latency in all_deployments.items():
             ## get the item from model list
@@ -134,10 +140,15 @@ class LowestLatencyLoggingHandler(CustomLogger):
             if _deployment is None:
                 continue  # skip to next one

-            if isinstance(item_latency, timedelta):
-                item_latency = float(item_latency.total_seconds())
-
+            # get average latency
+            total = 0
+            for _call_latency in item_latency:
+                if isinstance(_call_latency, timedelta):
+                    total += float(_call_latency.total_seconds())
+                elif isinstance(_call_latency, float):
+                    total += _call_latency
+            item_latency = total / len(item_latency)
             if item_latency == 0:
                 deployment = _deployment
                 break
@@ -49,7 +49,7 @@ def test_latency_updated():
         end_time=end_time,
     )
     latency_key = f"{model_group}_latency_map"
-    assert end_time - start_time == test_cache.get_cache(key=latency_key)[deployment_id]
+    assert end_time - start_time == test_cache.get_cache(key=latency_key)[deployment_id][0]


 # test_tpm_rpm_updated()
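For completeness, latency-based routing is selected on the Router through its routing_strategy option. The deployment entries below are placeholders, and the exact strategy string is taken from litellm's router docs around the time of this commit, so treat the specific values as an assumption:

from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "..."},  # placeholder credentials
        "model_info": {"id": "deployment-1"},
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "..."},  # placeholder credentials
        "model_info": {"id": "deployment-2"},
    },
]

# routes each request to the deployment with the lowest recorded latency
router = Router(model_list=model_list, routing_strategy="latency-based-routing")
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)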