Mirror of https://github.com/BerriAI/litellm.git
Synced 2025-04-24 10:14:26 +00:00
feat(router.py): add server cooldown logic
parent 4ece219ec5
commit 3e76d4b422
5 changed files with 67 additions and 82 deletions
@@ -33,9 +33,6 @@ class CustomLogger:
     def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):
         try:
-            print_verbose(
-                f"Custom Logger - Enters logging function for model {kwargs}"
-            )
             kwargs["model"] = model
             kwargs["messages"] = messages
             kwargs["log_event_type"] = "pre_api_call"
@@ -52,9 +49,6 @@ class CustomLogger:
     def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose, callback_func):
         # Method definition
         try:
-            print_verbose(
-                f"Custom Logger - Enters logging function for model {kwargs}"
-            )
             kwargs["log_event_type"] = "post_api_call"
             callback_func(
                 kwargs, # kwargs to func
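Note: log_event forwards the annotated kwargs to a user-supplied callback_func. A hedged sketch of what a receiving callback might look like (the four-argument signature is an assumption inferred from the truncated call above, not a documented contract):

    import time

    def my_callback(kwargs, response_obj, start_time, end_time):
        # kwargs arrives annotated by CustomLogger with "log_event_type"
        # ("pre_api_call" before the request, "post_api_call" after)
        print(f"event={kwargs.get('log_event_type')} model={kwargs.get('model')}")

    # simulated post-call invocation, mirroring the callback_func(...) call in log_event
    my_callback({"log_event_type": "post_api_call", "model": "gpt-3.5-turbo"},
                response_obj=None, start_time=time.time(), end_time=time.time())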
@@ -52,6 +52,7 @@ class Router:
         self.set_model_list(model_list)
         self.healthy_deployments: List = self.model_list
         self.deployment_latency_map = {}
+        self.cooldown_deployments = {} # {"gpt-3.5-turbo": time.time() when it failed / needed a cooldown}
         for m in model_list:
             self.deployment_latency_map[m["litellm_params"]["model"]] = 0
@@ -88,6 +89,11 @@ class Router:
             litellm.success_callback.append(self.deployment_callback)
         else:
             litellm.success_callback = [self.deployment_callback]
+
+        if type(litellm.failure_callback) == list:
+            litellm.failure_callback.append(self.deployment_callback_on_failure)
+        else:
+            litellm.failure_callback = [self.deployment_callback_on_failure]
 
     def _start_health_check_thread(self):
         """
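Note: the append-or-initialize guard preserves any callbacks the caller already registered on litellm. Outside the Router, the same registration idiom (with an illustrative stand-in callback) is:

    import litellm

    def deployment_callback_on_failure(*args, **kwargs):  # stand-in callback
        pass

    # extend an existing callback list, else create one (keeps user callbacks intact)
    if type(litellm.failure_callback) == list:
        litellm.failure_callback.append(deployment_callback_on_failure)
    else:
        litellm.failure_callback = [deployment_callback_on_failure]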
@@ -156,6 +162,25 @@ class Router:
         """
         Returns the deployment based on routing strategy
         """
+        ## get healthy deployments
+        ### get all deployments
+        ### filter out the deployments currently cooling down
+        healthy_deployments = [m for m in self.model_list if m["model_name"] == model]
+        current_time = time.time()
+        iter = 0
+        deployments_to_remove = []
+        ### FIND UNHEALTHY DEPLOYMENTS
+        for deployment in healthy_deployments:
+            deployment_name = deployment["litellm_params"]["model"]
+            if deployment_name in self.cooldown_deployments:
+                if current_time >= self.cooldown_deployments[deployment_name] + 60:
+                    self.cooldown_deployments.pop(deployment_name)
+                else:
+                    deployments_to_remove.append(deployment)
+            iter += 1
+        ### FILTER OUT UNHEALTHY DEPLOYMENTS
+        for deployment in deployments_to_remove:
+            healthy_deployments.remove(deployment)
         if litellm.model_alias_map and model in litellm.model_alias_map:
             model = litellm.model_alias_map[
                 model
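Note: the cooldown filter above is self-contained. A minimal standalone sketch of the same logic, assuming the 60-second window and the {deployment_name: failure_timestamp} map shape shown in this commit (function name is illustrative):

    import time

    COOLDOWN_SECONDS = 60  # matches the hard-coded 60s window above

    def filter_cooled_down(deployments, cooldown_map):
        # deployments: list of {"litellm_params": {"model": ...}} dicts
        # cooldown_map: {deployment_name: unix timestamp of the failure}
        now = time.time()
        healthy = []
        for d in deployments:
            name = d["litellm_params"]["model"]
            failed_at = cooldown_map.get(name)
            if failed_at is not None and now < failed_at + COOLDOWN_SECONDS:
                continue  # still cooling down, skip it
            cooldown_map.pop(name, None)  # cooldown expired, re-admit
            healthy.append(d)
        return healthy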
@@ -168,24 +193,15 @@ class Router:
             else:
                 raise ValueError("No models available.")
         elif self.routing_strategy == "simple-shuffle":
-            potential_deployments = []
-            for item in self.model_list:
-                if item["model_name"] == model:
-                    potential_deployments.append(item)
-            item = random.choice(potential_deployments)
+            item = random.choice(healthy_deployments)
             return item or item[0]
         elif self.routing_strategy == "latency-based-routing":
             returned_item = None
             lowest_latency = float('inf')
-            ### get potential deployments
-            potential_deployments = []
-            for item in self.model_list:
-                if item["model_name"] == model:
-                    potential_deployments.append(item)
             ### shuffles with priority for lowest latency
             # items_with_latencies = [('A', 10), ('B', 20), ('C', 30), ('D', 40)]
             items_with_latencies = []
-            for item in potential_deployments:
+            for item in healthy_deployments:
                 items_with_latencies.append((item, self.deployment_latency_map[item["litellm_params"]["model"]]))
             returned_item = self.weighted_shuffle_by_latency(items_with_latencies)
             return returned_item
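Note: weighted_shuffle_by_latency is referenced but not defined in this diff. One plausible implementation, offered as an assumption rather than the repo's actual code, weights each deployment inversely to its recorded latency so faster deployments are picked more often. Aside: `return item or item[0]` can never really fall back, since a falsy `item` would make `item[0]` raise before returning.

    import random

    def weighted_shuffle_by_latency(items_with_latencies):
        # items_with_latencies: [(deployment, latency_seconds), ...]
        # Lower latency => higher selection weight; zero latency (no data yet)
        # is kept finite via a small epsilon.
        if not items_with_latencies:
            return None
        weights = [1.0 / (latency + 1e-6) for _, latency in items_with_latencies]
        chosen = random.choices(items_with_latencies, weights=weights, k=1)[0]
        return chosen[0]  # return just the deployment dict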
@@ -256,7 +272,7 @@ class Router:
                     reraise=True,
                     after=after_callback)
 
-            return retryer(self.acompletion, *args, **kwargs)
+            return retryer(original_function, *args, **kwargs)
         except Exception as e:
             raise Exception(f"Error in function_with_retries: {e}\n\nRetry Info: {retry_info}")
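Note: the retryer's reraise/after keyword arguments match tenacity's Retrying API. A minimal sketch of the generalized wrapper under that assumption (after_callback and the retry policy here are illustrative):

    from tenacity import Retrying, stop_after_attempt

    def function_with_retries(original_function, *args, num_retries=3, **kwargs):
        def after_callback(retry_state):
            # runs after every attempt; retry_state carries attempt_number, outcome, etc.
            print(f"attempt {retry_state.attempt_number} finished")

        retryer = Retrying(stop=stop_after_attempt(num_retries),
                           reraise=True,        # surface the last exception unchanged
                           after=after_callback)
        # calling the Retrying object runs original_function with retries
        return retryer(original_function, *args, **kwargs)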
@@ -406,7 +422,7 @@ class Router:
         custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None) # i.e. azure
         if custom_llm_provider:
             model_name = f"{custom_llm_provider}/{model_name}"
-        self.deployment_latency_map[model_name] = float('inf')
+        self.cooldown_deployments[model_name] = time.time() # put deployment in cooldown mode
 
     def get_usage_based_available_deployment(self,
                                              model: str,
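Note: this failure callback is what feeds the cooldown map consumed by get_available_deployment. A condensed standalone sketch of the flow (hypothetical free function; the real code is a Router method mutating self.cooldown_deployments):

    import time

    def deployment_callback_on_failure(kwargs, cooldown_deployments):
        # kwargs mirrors the litellm failure-callback payload
        model_name = kwargs.get("model")
        custom_llm_provider = kwargs.get("litellm_params", {}).get("custom_llm_provider", None)
        if custom_llm_provider:
            model_name = f"{custom_llm_provider}/{model_name}"  # e.g. "azure/chatgpt-v-2"
        cooldown_deployments[model_name] = time.time()  # start the 60s cooldown clock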
@@ -16,46 +16,6 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
-# def test_openai_only():
-#     from litellm import completion
-#     import time
-#     completions = []
-#     max_workers = 1000 # Adjust as needed
-#     start_time = time.time()
-#     print(f"Started test: {start_time}")
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         kwargs = {
-#             "model": "gpt-3.5-turbo",
-#             "messages": [{"role": "user", "content": """Context:
-
-# In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.
-
-# Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.
-
-# In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs.
-
-# Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.
-
-# The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.
-
-# ---
-# Question:
-
-# Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
-#         }
-#         for _ in range(10000):
-#             future = executor.submit(completion, **kwargs)
-#             completions.append(future)
-
-#     # Retrieve the results from the futures
-#     results = [future.result() for future in completions]
-#     end_time = time.time()
-
-#     print(f"Total Duration: {end_time-start_time}")
-
-# test_openai_only()
 
 
 # def test_multiple_deployments():
 #     import concurrent, time
 #     # litellm.set_verbose=True
@@ -90,12 +50,14 @@ load_dotenv()
 #         "rpm": 9000
 #     }]
 
-#     router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT")), routing_strategy="latency-based-routing") # type: ignore
+#     router = Router(model_list=model_list,
+#                     redis_host=os.getenv("REDIS_HOST"),
+#                     redis_password=os.getenv("REDIS_PASSWORD"),
+#                     redis_port=int(os.getenv("REDIS_PORT")),
+#                     routing_strategy="simple-shuffle",
+#                     num_retries=3) # type: ignore
 #     # router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore
 
 #     results = []
 #     with ThreadPoolExecutor(max_workers=100) as executor:
-#         kwargs = {
+#         kwargs = {
 #             "model": "gpt-3.5-turbo",
 #             "messages": [{"role": "user", "content": """Context:
@@ -113,7 +75,17 @@ load_dotenv()
 # Question:
 
 #     Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
-#         }
+#         }
+#     }
 
+#     results = []
 
+#     # for _ in range(10):
+#     #     print(f"starting!!!")
+#     #     response = router.completion(**kwargs)
+#     #     results.append(response)
 
+#     # print(len(results))
 #     with ThreadPoolExecutor(max_workers=100) as executor:
 
 #         start_time = time.time()
 #         for _ in range(1000):
@@ -127,13 +99,16 @@ load_dotenv()
 #                 try:
 #                     result = future.result()
 #                     results.append(result)
-#                     del futures[future]
+#                     futures.pop(future) # Remove the done future
 #                 except Exception as e:
 #                     print(f"Exception: {e}; traceback: {traceback.format_exc()}")
-#                     del futures[future] # remove the done future
+#                     futures.pop(future) # Remove the done future with exception
 
 #             print(f"Remaining futures: {len(futures)}")
 
 #     end_time = time.time()
 #     print(f"ELAPSED TIME: {end_time-start_time}")
 #     print(f"results: {results}")
 #     # Check results
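Note: the commented-out loop polls a dict of futures and evicts each one as it completes. A runnable distillation of that pattern (generic worker function; all names illustrative):

    import concurrent.futures
    import traceback

    def run_batch(fn, arg_list, max_workers=100):
        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(fn, arg): arg for arg in arg_list}
            while futures:
                done, _ = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_COMPLETED)
                for future in done:
                    try:
                        results.append(future.result())
                    except Exception:
                        print(f"Exception; traceback: {traceback.format_exc()}")
                    futures.pop(future)  # evict the finished future either way
        return results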
@@ -246,7 +221,7 @@ def test_acompletion_on_router():
         {
             "model_name": "gpt-3.5-turbo",
             "litellm_params": {
-                "model": "chatgpt-v-2",
+                "model": "azure/chatgpt-v-2",
                 "api_key": os.getenv("AZURE_API_KEY"),
                 "api_base": os.getenv("AZURE_API_BASE"),
                 "api_version": os.getenv("AZURE_API_VERSION")
@@ -260,14 +235,14 @@ def test_acompletion_on_router():
         {"role": "user", "content": "What is the weather like in Boston?"}
     ]
     start_time = time.time()
-    router = Router(model_list=model_list,
-                    redis_host=os.environ["REDIS_HOST"],
-                    redis_password=os.environ["REDIS_PASSWORD"],
-                    redis_port=os.environ["REDIS_PORT"],
-                    cache_responses=True,
-                    timeout=30,
-                    routing_strategy="simple-shuffle")
     async def get_response():
+        router = Router(model_list=model_list,
+                        redis_host=os.environ["REDIS_HOST"],
+                        redis_password=os.environ["REDIS_PASSWORD"],
+                        redis_port=os.environ["REDIS_PORT"],
+                        cache_responses=True,
+                        timeout=30,
+                        routing_strategy="usage-based-routing")
         response1 = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
         print(f"response1: {response1}")
         response2 = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
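Note: with Router construction moved inside the coroutine, the test still needs an event loop to drive get_response. A minimal self-contained sketch of that driver (the coroutine body here is a stand-in):

    import asyncio

    async def get_response():
        # stand-in for the real body, which builds a Router and awaits
        # router.acompletion(model="gpt-3.5-turbo", messages=messages)
        await asyncio.sleep(0)
        return "ok"

    response = asyncio.run(get_response())  # drives the coroutine to completion
    print(response)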
@@ -3579,7 +3579,7 @@ def exception_type(
             )
         elif custom_llm_provider == "anthropic": # one of the anthropics
             if hasattr(original_exception, "message"):
-                if "prompt is too long" in original_exception.message:
+                if "prompt is too long" in original_exception.message or "prompt: length" in original_exception.message:
                     exception_mapping_worked = True
                     raise ContextWindowExceededError(
                         message=original_exception.message,
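Note: the mapping keys on message substrings because Anthropic reports an oversized prompt with more than one phrasing. A reduced sketch of the idea (local exception class as a stand-in for litellm's ContextWindowExceededError):

    class ContextWindowExceededError(Exception):
        pass

    CONTEXT_WINDOW_MARKERS = ("prompt is too long", "prompt: length")

    def map_anthropic_exception(original_exception):
        message = getattr(original_exception, "message", "")
        if any(marker in message for marker in CONTEXT_WINDOW_MARKERS):
            raise ContextWindowExceededError(message)
        raise original_exception  # fall through to other mappings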
@@ -157,24 +157,24 @@
     "claude-instant-1.2": {
         "max_tokens": 100000,
         "max_output_tokens": 8191,
-        "input_cost_per_token": 0.00000163,
-        "output_cost_per_token": 0.00000551,
+        "input_cost_per_token": 0.000000163,
+        "output_cost_per_token": 0.000000551,
         "litellm_provider": "anthropic",
         "mode": "chat"
     },
     "claude-2": {
         "max_tokens": 100000,
         "max_output_tokens": 8191,
-        "input_cost_per_token": 0.00001102,
-        "output_cost_per_token": 0.00003268,
+        "input_cost_per_token": 0.000008,
+        "output_cost_per_token": 0.000024,
         "litellm_provider": "anthropic",
         "mode": "chat"
     },
     "claude-2.1": {
         "max_tokens": 200000,
         "max_output_tokens": 8191,
-        "input_cost_per_token": 0.00001102,
-        "output_cost_per_token": 0.00003268,
+        "input_cost_per_token": 0.000008,
+        "output_cost_per_token": 0.000024,
         "litellm_provider": "anthropic",
         "mode": "chat"
     },
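Note: these per-token prices feed simple cost arithmetic: cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token. With the updated claude-2 prices, a 1,000-token prompt plus a 500-token completion costs 1000 * 0.000008 + 500 * 0.000024 = $0.02. A sketch (price dict copied from this hunk):

    prices = {"input_cost_per_token": 0.000008,
              "output_cost_per_token": 0.000024}  # claude-2, per this diff

    def completion_cost(prompt_tokens, completion_tokens, prices):
        return (prompt_tokens * prices["input_cost_per_token"]
                + completion_tokens * prices["output_cost_per_token"])

    print(completion_cost(1000, 500, prices))  # 0.02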