forked from phoenix/litellm-mirror
feat(scheduler.py): support redis caching for req. prioritization
enables req. prioritization to work across multiple instances of litellm
parent 422b4b1728
commit b590e6607c
2 changed files with 19 additions and 10 deletions
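What this enables, sketched below: several Router instances pointed at the same Redis share one scheduler queue, so request prioritization holds across deployments. A minimal sketch, assuming the Router's redis_host/redis_port/redis_password kwargs; the model entry and env var names are placeholders, not taken from this commit.

import os

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "azure/chatgpt-v-2",  # example deployment
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_base": os.getenv("AZURE_API_BASE"),
            },
        }
    ],
    # with redis configured, the Router builds a RedisCache internally and,
    # after this commit, hands it to the Scheduler as well
    redis_host=os.getenv("REDIS_HOST"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
    redis_password=os.getenv("REDIS_PASSWORD"),
)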
litellm/router.py

@@ -220,8 +220,6 @@ class Router:
             []
         )  # names of models under litellm_params. ex. azure/chatgpt-v-2
         self.deployment_latency_map = {}
-        ### SCHEDULER ###
-        self.scheduler = Scheduler(polling_interval=polling_interval)
         ### CACHING ###
         cache_type: Literal["local", "redis"] = "local"  # default to an in-memory cache
         redis_cache = None
@@ -259,6 +257,10 @@ class Router:
             redis_cache=redis_cache, in_memory_cache=InMemoryCache()
         )  # use a dual cache (Redis+In-Memory) for tracking cooldowns, usage, etc.

+        ### SCHEDULER ###
+        self.scheduler = Scheduler(
+            polling_interval=polling_interval, redis_cache=redis_cache
+        )
         self.default_deployment = None  # use this to track the users default deployment, when they want to use model = *
         self.default_max_parallel_requests = default_max_parallel_requests

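Net effect of the two router.py hunks: the Scheduler is now constructed after the caching block instead of before it, so it can receive the same RedisCache handle that backs the router's DualCache. A condensed paraphrase of the resulting __init__ ordering (not verbatim router code; the RedisCache constructor args are assumptions):

from litellm.caching import DualCache, InMemoryCache, RedisCache
from litellm.scheduler import Scheduler

cache_type = "redis"  # placeholder for the Router's cache_type decision
redis_cache = None
if cache_type == "redis":
    redis_cache = RedisCache(host="localhost", port=6379)  # assumed ctor args

# dual cache (Redis+In-Memory) for cooldowns, usage, etc. is built first...
cache = DualCache(redis_cache=redis_cache, in_memory_cache=InMemoryCache())
# ...so the scheduler can reuse the exact same Redis handle
scheduler = Scheduler(polling_interval=None, redis_cache=redis_cache)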
litellm/scheduler.py

@@ -1,13 +1,14 @@
-import heapq, time
+import heapq
 from pydantic import BaseModel
 from typing import Optional
 import enum
-from litellm.caching import DualCache
+from litellm.caching import DualCache, RedisCache
 from litellm import print_verbose


 class SchedulerCacheKeys(enum.Enum):
     queue = "scheduler:queue"
+    default_in_memory_ttl = 5  # cache queue in-memory for 5s when redis cache available


 class DefaultPriorities(enum.Enum):
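The new default_in_memory_ttl enum value makes the in-memory side of the scheduler's DualCache a short-lived snapshot: with Redis attached, a locally cached queue expires after 5 seconds and the next read falls through to the shared store. A hedged illustration of a queue update against the "scheduler:queue" cache key (the key composition and the DualCache method names are assumptions, not shown in this diff):

import heapq

from litellm.scheduler import Scheduler, SchedulerCacheKeys

async def enqueue(scheduler: Scheduler, priority: int, request_id: str) -> None:
    key = SchedulerCacheKeys.queue.value
    # read falls through to Redis once the 5s in-memory snapshot expires
    queue = await scheduler.cache.async_get_cache(key=key) or []
    heapq.heappush(queue, (priority, request_id))  # min-heap: low value = high priority
    await scheduler.cache.async_set_cache(key=key, value=queue)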
@@ -25,18 +26,24 @@ class FlowItem(BaseModel):
 class Scheduler:
     cache: DualCache

-    def __init__(self, polling_interval: Optional[float] = None):
+    def __init__(
+        self,
+        polling_interval: Optional[float] = None,
+        redis_cache: Optional[RedisCache] = None,
+    ):
         """
         polling_interval: float or null - frequency of polling queue. Default is 3ms.
         """
         self.queue: list = []
-        self.cache = DualCache()
+        default_in_memory_ttl: Optional[float] = None
+        if redis_cache is not None:
+            # if redis-cache available frequently poll that instead of using in-memory.
+            default_in_memory_ttl = SchedulerCacheKeys.default_in_memory_ttl.value
+        self.cache = DualCache(
+            redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
+        )
         self.polling_interval = polling_interval or 0.03  # default to 3ms

-    def update_variables(self, cache: Optional[DualCache] = None):
-        if cache is not None:
-            self.cache = cache
-
     async def add_request(self, request: FlowItem):
         # We use the priority directly, as lower values indicate higher priority
         # get the queue
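A hedged end-to-end sketch of the new constructor: every instance built against the same Redis sees the same queue. The litellm.scheduler import path follows the file changed above; FlowItem's fields (priority, request_id, model_name) and the RedisCache constructor args are assumptions:

import asyncio

from litellm.caching import RedisCache
from litellm.scheduler import FlowItem, Scheduler

async def main() -> None:
    scheduler = Scheduler(
        polling_interval=0.03,  # seconds between queue polls
        redis_cache=RedisCache(host="localhost", port=6379),  # assumed ctor args
    )
    # lower priority value = served first (see DefaultPriorities)
    await scheduler.add_request(
        FlowItem(priority=0, request_id="req-1", model_name="gpt-3.5-turbo")
    )

asyncio.run(main())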