fix(router.py): add support for cooldowns with redis

2025-04-24 18:24:20 +00:00 · 2023-11-22 19:54:15 -08:00 · 2023-11-22 19:54:15 -08:00 · 5d5ca9f7ef
commit 5d5ca9f7ef
parent cb41b14cc2
3 changed files with 161 additions and 121 deletions
--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -9,7 +9,7 @@

 import litellm
 import time, logging
-import json, traceback
+import json, traceback, ast

 def get_prompt(*args, **kwargs):
    # make this safe checks, it should not throw any exceptions
@ -53,8 +53,12 @@ class RedisCache(BaseCache):
            if cached_response != None:
                # cached_response is in `b{} convert it to ModelResponse
                cached_response = cached_response.decode("utf-8")  # Convert bytes to string
-                cached_response = json.loads(cached_response)  # Convert string to dictionary
-                cached_response['cache'] = True  # set cache-hit flag to True
+                try: 
+                    cached_response = json.loads(cached_response)  # Convert string to dictionary
+                except: 
+                    cached_response = ast.literal_eval(cached_response)
+                if isinstance(cached_response, dict): 
+                    cached_response['cache'] = True  # set cache-hit flag to True
                return cached_response
        except Exception as e:
            # NON blocking - notify users Redis is throwing an exception
@ -224,5 +228,5 @@ class Cache:
                if isinstance(result, litellm.ModelResponse):
                    result = result.model_dump_json()
                self.cache.set_cache(cache_key, result, **kwargs)
-        except:
+        except Exception as e:
            pass
--- a/litellm/router.py
+++ b/litellm/router.py
@ -83,14 +83,14 @@ class Router:
        if cache_responses:
            litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests
            self.cache_responses = cache_responses
-        self.cache = litellm.Cache(cache_config) # use Redis for tracking load balancing
+        self.cache = litellm.Cache(**cache_config) # use Redis for tracking load balancing
        ## USAGE TRACKING ## 
-        if type(litellm.success_callback) == list:
+        if isinstance(litellm.success_callback, list):
            litellm.success_callback.append(self.deployment_callback)
        else:
            litellm.success_callback = [self.deployment_callback]
        
-        if type(litellm.failure_callback) == list:
+        if isinstance(litellm.failure_callback, list):
            litellm.failure_callback.append(self.deployment_callback_on_failure)
        else:
            litellm.failure_callback = [self.deployment_callback_on_failure]
@ -169,14 +169,12 @@ class Router:
        current_time = time.time() 
        iter = 0
        deployments_to_remove = [] 
+        cooldown_deployments = self._get_cooldown_deployments()
        ### FIND UNHEALTHY DEPLOYMENTS
        for deployment in healthy_deployments: 
            deployment_name = deployment["litellm_params"]["model"]
-            if deployment_name in self.cooldown_deployments: 
-                if current_time >= self.cooldown_deployments[deployment_name] + 60: 
-                    self.cooldown_deployments.pop(deployment_name)
-                else: 
-                    deployments_to_remove.append(deployment)
+            if deployment_name in cooldown_deployments: 
+                deployments_to_remove.append(deployment)
            iter += 1
        ### FILTER OUT UNHEALTHY DEPLOYMENTS
        for deployment in deployments_to_remove:
@ -245,36 +243,31 @@ class Router:
                raise e
    
    def function_with_retries(self, *args, **kwargs): 
-        try:
-            import tenacity
-        except Exception as e:
-            raise Exception(f"tenacity import failed please run `pip install tenacity`. Error{e}")
-        
-        retry_info = {"attempts": 0, "final_result": None}
+        """
+        Re-run a failed call up to `self.num_retries` times.
+
+        Expects two keyword arguments, which are popped before the call is replayed:
+        - original_exception: the error that triggered the retry
+        - original_function: the callable to re-invoke with the remaining args/kwargs
+
+        Returns the first successful response. Raises KeyError if either keyword is
+        missing, re-raises immediately on any exception that is not an
+        openai.RateLimitError / openai.APIError, and re-raises the most recent error
+        once the attempts are exhausted.
+        """
+        # we'll backoff exponentially with each retry
+        backoff_factor = 1
+        original_exception = kwargs.pop("original_exception")
+        original_function = kwargs.pop("original_function")
+        # most recent failure; starts as the error that sent us here
+        last_exception = original_exception
+        for _ in range(self.num_retries):
+            # NOTE(review): this consumes the instance-wide retry budget, not a per-call one.
+            # Presumably it bounds nested retries when original_function re-enters this method - confirm.
+            self.num_retries -= 1 # decrement the number of retries
+            try:
+                # if the function call is successful, no exception will be raised and we'll break out of the loop
+                response = original_function(*args, **kwargs)
+                return response

-        def after_callback(retry_state):
-            retry_info["attempts"] = retry_state.attempt_number
-            retry_info["final_result"] = retry_state.outcome.result()
+            except openai.RateLimitError as e:
+                # on RateLimitError we'll wait for an exponential time before trying again
+                last_exception = e
+                time.sleep(backoff_factor)

-        if 'model' not in kwargs or 'messages' not in kwargs:
-            raise ValueError("'model' and 'messages' must be included as keyword arguments")
-        
-        try: 
-            original_exception = kwargs.pop("original_exception")
-            original_function = kwargs.pop("original_function")
-            if isinstance(original_exception, openai.RateLimitError):
-                retryer = tenacity.Retrying(wait=tenacity.wait_exponential(multiplier=1, max=10), 
-                                            stop=tenacity.stop_after_attempt(self.num_retries), 
-                                            reraise=True,
-                                            after=after_callback)
-            elif isinstance(original_exception, openai.APIError):
-                retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(self.num_retries), 
-                                            reraise=True,
-                                            after=after_callback)
-                
-            return retryer(original_function, *args, **kwargs)
-        except Exception as e: 
-            raise Exception(f"Error in function_with_retries: {e}\n\nRetry Info: {retry_info}")
+                # increase backoff factor for next run
+                backoff_factor *= 2
+
+            except openai.APIError as e:
+                # on APIError we immediately retry without any wait, change this if necessary
+                last_exception = e
+
+            except Exception:
+                # for any other exception types, don't retry
+                raise
+
+        # every attempt failed (or no retries were left): surface the error instead of
+        # implicitly returning None, which callers would mistake for a response
+        if isinstance(last_exception, BaseException):
+            raise last_exception

    ### COMPLETION + EMBEDDING FUNCTIONS

@ -422,7 +415,48 @@ class Router:
        custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None)  # i.e. azure
        if custom_llm_provider:
            model_name = f"{custom_llm_provider}/{model_name}"
-        self.cooldown_deployments[model_name] = time.time() # put deployment in cooldown mode
+        
+        self._set_cooldown_deployments(model_name)
+
+    def _set_cooldown_deployments(self, 
+                                  deployment: str):
+        """
+        Add a model to the list of models being cooled down for that minute
+
+        The list is stored in `self.cache` under a key derived from the current
+        wall-clock minute ("%H-%M") and written with ttl=60. Nothing is written
+        when the deployment is already on the list for this minute.
+        """
+        current_minute = datetime.now().strftime("%H-%M")
+        # group cooldown models by minute to reduce number of redis calls
+        cooldown_key = f"{current_minute}:cooldown_models"
+        cached_value = self.cache.get_cache(cache_key=cooldown_key)
+
+        if isinstance(cached_value, list):
+            if deployment in cached_value:
+                return  # already cooling down this minute, nothing to update
+            updated_value = cached_value + [deployment]
+        else:
+            # cache miss (None) or an unexpected payload: start a fresh list
+            updated_value = [deployment]
+
+        # save updated value
+        self.cache.add_cache(result=updated_value, cache_key=cooldown_key, ttl=60)
+
+    def _get_cooldown_deployments(self):
+        """
+        Return the models recorded as cooling down during the current minute.
+
+        Looks up the "<HH-MM>:cooldown_models" key in `self.cache`; an empty list is
+        returned when the lookup yields nothing (or any other falsy value).
+        """
+        minute_bucket = datetime.now().strftime("%H-%M")
+        cached_models = self.cache.get_cache(cache_key=f"{minute_bucket}:cooldown_models")
+        if cached_models:
+            return cached_models
+        return []

    def get_usage_based_available_deployment(self,
                               model: str,
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@ -16,103 +16,105 @@ from dotenv import load_dotenv

 load_dotenv()

-# def test_multiple_deployments(): 
-# 	import concurrent, time
-# 	# litellm.set_verbose=True
-# 	futures = {}
-# 	model_list = [{ # list of model deployments 
-# 		"model_name": "gpt-3.5-turbo", # openai model name 
-# 		"litellm_params": { # params for litellm completion/embedding call 
-# 			"model": "azure/chatgpt-v-2", 
-# 			"api_key": os.getenv("AZURE_API_KEY"),
-# 			"api_version": os.getenv("AZURE_API_VERSION"),
-# 			"api_base": os.getenv("AZURE_API_BASE")
-# 		},
-# 		"tpm": 240000,
-# 		"rpm": 1800
-# 	}, {
-# 		"model_name": "gpt-3.5-turbo", # openai model name 
-# 		"litellm_params": { # params for litellm completion/embedding call 
-# 			"model": "azure/chatgpt-functioncalling", 
-# 			"api_key": os.getenv("AZURE_API_KEY"),
-# 			"api_version": os.getenv("AZURE_API_VERSION"),
-# 			"api_base": os.getenv("AZURE_API_BASE")
-# 		},
-# 		"tpm": 240000,
-# 		"rpm": 1800
-# 	}, {
-# 		"model_name": "gpt-3.5-turbo", # openai model name 
-# 		"litellm_params": { # params for litellm completion/embedding call 
-# 			"model": "gpt-3.5-turbo", 
-# 			"api_key": os.getenv("OPENAI_API_KEY"),
-# 		},
-# 		"tpm": 1000000,
-# 		"rpm": 9000
-# 	}]
+def test_multiple_deployments(): 
+	import concurrent, time
+	# litellm.set_verbose=True
+	futures = {}
+	model_list = [{ # list of model deployments 
+		"model_name": "gpt-3.5-turbo", # openai model name 
+		"litellm_params": { # params for litellm completion/embedding call 
+			"model": "azure/chatgpt-v-2", 
+			"api_key": "bad-key",
+			"api_version": os.getenv("AZURE_API_VERSION"),
+			"api_base": os.getenv("AZURE_API_BASE")
+		},
+		"tpm": 240000,
+		"rpm": 1800
+	}, 
+	# {
+	# 	"model_name": "gpt-3.5-turbo", # openai model name 
+	# 	"litellm_params": { # params for litellm completion/embedding call 
+	# 		"model": "azure/chatgpt-functioncalling", 
+	# 		"api_key": "bad-key",
+	# 		"api_version": os.getenv("AZURE_API_VERSION"),
+	# 		"api_base": os.getenv("AZURE_API_BASE")
+	# 	},
+	# 	"tpm": 240000,
+	# 	"rpm": 1800
+	# }, 
+	{
+		"model_name": "gpt-3.5-turbo", # openai model name 
+		"litellm_params": { # params for litellm completion/embedding call 
+			"model": "gpt-3.5-turbo", 
+			"api_key": os.getenv("OPENAI_API_KEY"),
+		},
+		"tpm": 1000000,
+		"rpm": 9000
+	}
+	]

-# 	router = Router(model_list=model_list, 
-# 				 redis_host=os.getenv("REDIS_HOST"), 
-# 				 redis_password=os.getenv("REDIS_PASSWORD"), 
-# 				 redis_port=int(os.getenv("REDIS_PORT")), 
-# 				 routing_strategy="simple-shuffle",
-# 				 num_retries=3) # type: ignore
-# 	# router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore
-# 	kwargs = {
-# 			"model": "gpt-3.5-turbo",
-# 			"messages": [{"role": "user", "content": """Context:
+	router = Router(model_list=model_list, 
+				 redis_host=os.getenv("REDIS_HOST"), 
+				 redis_password=os.getenv("REDIS_PASSWORD"), 
+				 redis_port=int(os.getenv("REDIS_PORT")), 
+				 routing_strategy="simple-shuffle",
+				 num_retries=1) # type: ignore
+	# router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore
+	kwargs = {
+			"model": "gpt-3.5-turbo",
+			"messages": [{"role": "user", "content": """Context:

-# In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.
+In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.

-# Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.
+Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.

-# In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs. 
+In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs. 

-# Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.
+Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.

-# The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.
+The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.

-# ---
-# Question: 
+---
+Question: 

-# Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
-# 	}
+Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
+	}
 	
-# 	results = [] 
+	results = [] 

-# 	# for _ in range(10): 
-# 	# 	print(f"starting!!!")
-# 	# 	response = router.completion(**kwargs)
-# 	# 	results.append(response)
+	for _ in range(10): 
+		print(f"starting!!!")
+		response = router.completion(**kwargs)
+		results.append(response)
 	
-# 	# print(len(results))
-# 	with ThreadPoolExecutor(max_workers=100) as executor:
+	# print(len(results))
+	# with ThreadPoolExecutor(max_workers=100) as executor:

-# 		start_time = time.time()
-# 		for _ in range(1000):
-# 			future = executor.submit(router.completion, **kwargs)
-# 			futures[future] = future
+	# 	start_time = time.time()
+	# 	for _ in range(1000):
+	# 		future = executor.submit(router.completion, **kwargs)
+	# 		futures[future] = future

-# 		# Retrieve the results from the futures
-# 		while futures:
-# 			done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
-# 			for future in done:
-# 				try:
-# 					result = future.result()
-# 					results.append(result)
-# 					futures.pop(future)  # Remove the done future
-# 				except Exception as e:
-# 					print(f"Exception: {e}; traceback: {traceback.format_exc()}")
-# 					futures.pop(future)  # Remove the done future with exception
+	# 	# Retrieve the results from the futures
+	# 	while futures:
+	# 		done, not_done = concurrent.futures.wait(futures, timeout=10, return_when=concurrent.futures.FIRST_COMPLETED)
+	# 		for future in done:
+	# 			try:
+	# 				result = future.result()
+	# 				results.append(result)
+	# 				futures.pop(future)  # Remove the done future
+	# 			except Exception as e:
+	# 				print(f"Exception: {e}; traceback: {traceback.format_exc()}")
+	# 				futures.pop(future)  # Remove the done future with exception

-# 			print(f"Remaining futures: {len(futures)}")
+	# 		print(f"Remaining futures: {len(futures)}")

-# 		end_time = time.time() 
-# 		print(f"ELAPSED TIME: {end_time-start_time}")
-# 		print(f"results: {results}")
-# 		# Check results
+	# 	end_time = time.time() 
+	# 	print(f"ELAPSED TIME: {end_time-start_time}")
+		# Check results


-# test_multiple_deployments()
+test_multiple_deployments()
 ### FUNCTION CALLING 

 def test_function_calling():