forked from phoenix/litellm-mirror
fix(router.py): allow user to control the latency routing time window
parent 2b3fc15fa9
commit fe632c08a4
3 changed files with 59 additions and 7 deletions
@@ -105,6 +105,7 @@ class Router:
             "usage-based-routing",
             "latency-based-routing",
         ] = "simple-shuffle",
+        routing_strategy_args: dict = {}, # just for latency-based routing
     ) -> None:
         self.set_verbose = set_verbose
         self.deployment_names: List = (
@@ -217,7 +218,7 @@ class Router:
                 litellm.callbacks.append(self.lowesttpm_logger) # type: ignore
         elif routing_strategy == "latency-based-routing":
             self.lowestlatency_logger = LowestLatencyLoggingHandler(
-                router_cache=self.cache, model_list=self.model_list
+                router_cache=self.cache, model_list=self.model_list, routing_args=routing_strategy_args
             )
             if isinstance(litellm.callbacks, list):
                 litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
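With these two changes a caller can tune the latency time window when constructing the Router. A minimal usage sketch of the new parameter (the model_list entry and API key below are placeholders, not part of this commit):

from litellm import Router

# placeholder deployment config -- values are illustrative only
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."},
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="latency-based-routing",
    # keep per-deployment latency samples for 10 seconds instead of the 1 hour default
    routing_strategy_args={"ttl": 10},
)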
@@ -1,6 +1,6 @@
 #### What this does ####
 # picks based on response time (for streaming, this is time to first token)

+from pydantic import BaseModel, Extra, Field, root_validator
 import dotenv, os, requests, random
 from typing import Optional
 from datetime import datetime, timedelta
@@ -10,16 +10,30 @@ import traceback
 from litellm.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger

+class LiteLLMBase(BaseModel):
+    """
+    Implements default functions, all pydantic objects should have.
+    """
+
+    def json(self, **kwargs):
+        try:
+            return self.model_dump() # noqa
+        except:
+            # if using pydantic v1
+            return self.dict()
+
+class RoutingArgs(LiteLLMBase):
+    ttl: int = 1 * 60 * 60 # 1 hour
+
 class LowestLatencyLoggingHandler(CustomLogger):
     test_flag: bool = False
     logged_success: int = 0
     logged_failure: int = 0
     default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour

-    def __init__(self, router_cache: DualCache, model_list: list):
+    def __init__(self, router_cache: DualCache, model_list: list, routing_args: dict={}):
         self.router_cache = router_cache
         self.model_list = model_list
+        self.routing_args = RoutingArgs(**routing_args)

     def log_success_event(self, kwargs, response_obj, start_time, end_time):
         try:
@@ -55,7 +69,7 @@ class LowestLatencyLoggingHandler(CustomLogger):
             else:
                 request_count_dict[id] = [response_ms]

-            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds) # reset map within window
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.routing_args.ttl) # reset map within window

             ### TESTING ###
             if self.test_flag:
@@ -98,7 +112,7 @@ class LowestLatencyLoggingHandler(CustomLogger):
             else:
                 request_count_dict[id] = [response_ms]

-            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.default_cache_time_seconds) # reset map within window
+            self.router_cache.set_cache(key=latency_key, value=request_count_dict, ttl=self.routing_args.ttl) # reset map within window

             ### TESTING ###
             if self.test_flag:
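The RoutingArgs model turns the raw routing_strategy_args dict into a typed setting while keeping the old 1 hour default. A standalone sketch of the intended behavior (pydantic defaulting only, not part of the diff):

from pydantic import BaseModel

class RoutingArgs(BaseModel):
    ttl: int = 1 * 60 * 60  # 1 hour, mirrors default_cache_time_seconds

# no override -> the old one-hour window
assert RoutingArgs(**{}).ttl == 3600
# user-supplied window, e.g. routing_strategy_args={"ttl": 10}
assert RoutingArgs(**{"ttl": 10}).ttl == 10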
@@ -51,9 +51,46 @@ def test_latency_updated():
     latency_key = f"{model_group}_latency_map"
     assert end_time - start_time == test_cache.get_cache(key=latency_key)[deployment_id][0]


 # test_tpm_rpm_updated()

+def test_latency_updated_custom_ttl():
+    """
+    Invalidate the cached request.
+
+    Test that the cache is empty
+    """
+    test_cache = DualCache()
+    model_list = []
+    cache_time = 3
+    lowest_latency_logger = LowestLatencyLoggingHandler(
+        router_cache=test_cache, model_list=model_list, routing_args={"ttl": cache_time}
+    )
+    model_group = "gpt-3.5-turbo"
+    deployment_id = "1234"
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "gpt-3.5-turbo",
+                "deployment": "azure/chatgpt-v-2",
+            },
+            "model_info": {"id": deployment_id},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 50}}
+    time.sleep(5)
+    end_time = time.time()
+    lowest_latency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+    latency_key = f"{model_group}_latency_map"
+    assert isinstance(test_cache.get_cache(key=latency_key), dict)
+    time.sleep(cache_time)
+    assert test_cache.get_cache(key=latency_key) is None
+
+
 def test_get_available_deployments():
     test_cache = DualCache()
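The new test relies on DualCache honoring the ttl passed to set_cache, so the latency map disappears once the window elapses. A minimal sketch of that expectation (key and value mirror the test but are illustrative):

import time
from litellm.caching import DualCache

cache = DualCache()
# the entry should expire roughly 3 seconds after being written
cache.set_cache(key="gpt-3.5-turbo_latency_map", value={"1234": [5.0]}, ttl=3)
assert cache.get_cache(key="gpt-3.5-turbo_latency_map") is not None
time.sleep(4)  # a bit past the 3s ttl
assert cache.get_cache(key="gpt-3.5-turbo_latency_map") is None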