feat(router.py): add routing capabilities for multiple deployments

2025-04-26 03:04:13 +00:00 · 2023-10-17 22:28:17 -07:00 · 2023-10-17 22:28:17 -07:00 · b20cf3171b
commit b20cf3171b
parent 0aa3563ccf
3 changed files with 251 additions and 0 deletions
--- a/litellm/router.py
+++ b/litellm/router.py
@ -0,0 +1,199 @@
+from typing import Union, List, Dict, Optional
+from datetime import datetime
+import litellm
+
+class Cache: 
+    """
+    Underlying dictionary for Router. This can either be a dictionary or a Redis Cache (if credentials are set).
+    """
+    def __init__(self, cache_config: dict) -> None:
+        self.cache_config = cache_config
+        if cache_config["type"] == "redis":
+            pass
+        elif cache_config["type"] == "local": 
+            self.usage_dict = {} 
+    def get(self, key: str):
+        return self.usage_dict.get(key, 0)
+    
+    def increment(self, key: str, increment_value: int, expiry: int): 
+        if self.cache_config["type"] == "redis":
+            pass
+        elif self.cache_config["type"] == "local": 
+            self.usage_dict[key] = self.usage_dict.get(key, 0)  + increment_value
+
+class Router: 
+    """
+    Example usage:
+    from litellm import Router
+    model_list = [{
+        "model_name": "gpt-3.5-turbo", # openai model name 
+        "litellm_params": { # params for litellm completion/embedding call 
+            "model": "azure/<your-deployment-name>", 
+            "api_key": <your-api-key>,
+            "api_version": <your-api-version>,
+            "api_base": <your-api-base>
+        },
+        "tpm": <your-model-tpm>, e.g. 240000
+        "rpm": <your-model-rpm>, e.g. 1800
+    }]
+
+    router = Router(model_list=model_list)
+    """
+    def __init__(self, 
+                 model_list: list,
+                 redis_host: Optional[str] = None,
+                 redis_port: Optional[int] = None,
+                 redis_password: Optional[str] = None) -> None:
+        self.model_list = model_list
+        if redis_host is not None and redis_port is not None and redis_password is not None: 
+            cache_config = {
+                    'type': 'redis',
+                    'host': redis_host,
+                    'port': redis_port,
+                    'password': redis_password
+            }
+        else: 
+            cache_config = {
+                "type": "local"
+            }
+        self.cache = Cache(cache_config)
+        litellm.cache = litellm.Cache(**cache_config)
+        litellm.success_callback = [self.deployment_callback]
+    
+    def completion(self,
+                   model: str,
+                   messages: List[Dict[str, str]],
+                   is_retry: Optional[bool] = False,
+                   is_fallback: Optional[bool] = False,
+                   is_async: Optional[bool] = False,
+                   **kwargs): 
+        """
+        Example usage: 
+        response = router.completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]
+        """
+
+        # pick the one that is available (lowest TPM/RPM)
+        deployment = self.get_available_deployment(model=model, messages=messages)
+
+        data = deployment["litellm_params"]
+        data["messages"] = messages
+        # call via litellm.completion() 
+        return litellm.completion(**data)
+
+    def embedding(self, 
+                  model: str,
+                  input: Union[str, List],
+                  is_async: Optional[bool] = False,
+                  **kwargs) -> Union[List[float], None]:
+        # pick the one that is available (lowest TPM/RPM)
+        deployment = self.get_available_deployment(model=model)
+
+        data = deployment["litellm_params"]
+        data["input"] = input
+        # call via litellm.embedding() 
+        return litellm.embedding(**data)
+    
+    def deployment_callback(
+        self,
+        kwargs,                 # kwargs to completion
+        completion_response,    # response from completion
+        start_time, end_time    # start/end time
+    ):
+        """
+        Function LiteLLM submits a callback to after a successful
+        completion. Purpose of this is ti update TPM/RPM usage per model
+        """
+        model_name = kwargs.get('model', None)  # i.e. azure/gpt35turbo
+        total_tokens = completion_response['usage']['total_tokens']
+        self._set_deployment_usage(model_name, total_tokens)
+
+    def get_available_deployment(self, 
+                               model: str, 
+                               messages: List[Dict[str, str]]): 
+        """
+        Returns a deployment with the lowest TPM/RPM usage.
+        """
+        # get list of potential deployments 
+        potential_deployments = [] 
+        for item in self.model_list: 
+            if item["model_name"] == model: 
+                potential_deployments.append(item)
+        
+        # set first model as current model
+        deployment = potential_deployments[0] 
+
+
+        # get model tpm, rpm limits
+        tpm = deployment["tpm"]
+        rpm = deployment["rpm"]
+
+        # get deployment current usage
+        current_tpm, current_rpm = self._get_deployment_usage(deployment_name=deployment["litellm_params"]["model"])
+
+        # get encoding 
+        token_count = litellm.token_counter(model=deployment["model_name"], messages=messages)
+        
+        
+        # if at model limit, return lowest used
+        if current_tpm + token_count > tpm or current_rpm + 1 >= rpm: 
+            # -----------------------
+            # Find lowest used model
+            # ----------------------
+            lowest_tpm = float('inf')
+            deployment = None
+
+            # Go through all the models to get tpm, rpm
+            for item in potential_deployments:
+                item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])
+
+                if item_tpm == 0:
+                    return item
+                elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]: 
+                    continue
+                elif item_tpm < lowest_tpm:
+                    lowest_tpm = item_tpm
+                    deployment = item
+        
+            # if none, raise exception 
+            if deployment is None: 
+                raise ValueError(f"No models available.")
+
+        # return model 
+        return deployment
+
+    def _get_deployment_usage(
+        self,
+        deployment_name: str
+    ):
+        # ------------
+        # Setup values
+        # ------------
+        current_minute = datetime.now().strftime("%H-%M")
+        tpm_key = f'{deployment_name}:tpm:{current_minute}'
+        rpm_key = f'{deployment_name}:rpm:{current_minute}'
+
+        # ------------
+        # Return usage
+        # ------------
+        tpm = self.cache.get(tpm_key)
+        rpm = self.cache.get(rpm_key)
+        return int(tpm), int(rpm)
+    
+    def _set_deployment_usage(
+        self,
+        model_name: str,
+        total_tokens: int
+    ):
+        # ------------
+        # Setup values
+        # ------------
+        current_minute = datetime.now().strftime("%H-%M")
+        ttl = 120  # 2 minutes
+        tpm_key = f'{model_name}:tpm:{current_minute}'
+        rpm_key = f'{model_name}:rpm:{current_minute}'
+
+        # ------------
+        # Update usage
+        # ------------
+        self.cache.increment(tpm_key, total_tokens, expiry=ttl)
+        self.cache.increment(rpm_key, 1, expiry=ttl)