forked from phoenix/litellm-mirror
bump: version 0.13.6.dev3 → 0.13.6
This commit is contained in:
parent 81d3230651
commit 6a45879932
9 changed files with 126 additions and 143 deletions
BIN dist/litellm-0.13.6.dev1-py3-none-any.whl (vendored, normal file; binary not shown)
BIN dist/litellm-0.13.6.dev1.tar.gz (vendored, normal file; binary not shown)
BIN dist/litellm-0.13.6.dev2-py3-none-any.whl (vendored, normal file; binary not shown)
BIN dist/litellm-0.13.6.dev2.tar.gz (vendored, normal file; binary not shown)
BIN dist/litellm-0.13.6.dev3-py3-none-any.whl (vendored, normal file; binary not shown)
BIN dist/litellm-0.13.6.dev3.tar.gz (vendored, normal file; binary not shown)
@@ -1,8 +1,8 @@
from datetime import datetime
from typing import Dict, List, Optional, Union

import random, threading, time
import litellm

import logging

class Router:
    """
@@ -16,8 +16,6 @@ class Router:
            "api_version": <your-api-version>,
            "api_base": <your-api-base>
        },
        "tpm": <your-model-tpm>, e.g. 240000
        "rpm": <your-model-rpm>, e.g. 1800
    }]

    router = Router(model_list=model_list)
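For reference, a minimal sketch of a model_list matching the shape the docstring above abbreviates; the deployment name, keys, endpoint, and env var names below are placeholders, not values from this commit:

import os
from litellm import Router

model_list = [{
    "model_name": "gpt-3.5-turbo",                # alias callers route on
    "litellm_params": {                           # forwarded to litellm.completion()
        "model": "azure/chatgpt-v-2",             # hypothetical Azure deployment
        "api_key": os.environ["AZURE_API_KEY"],   # illustrative env var
        "api_version": "2023-07-01-preview",
        "api_base": "https://example-endpoint.openai.azure.com",
    },
    "tpm": 240000,  # tokens-per-minute budget for this deployment
    "rpm": 1800,    # requests-per-minute budget
}, {
    "model_name": "gpt-3.5-turbo",                # second deployment, same alias
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.environ["OPENAI_API_KEY"],
    },
    "tpm": 1000000,
    "rpm": 9000,
}]

router = Router(model_list=model_list)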
@@ -34,6 +32,11 @@ class Router:
                 cache_responses: bool = False) -> None:
        if model_list:
            self.set_model_list(model_list)
            self.healthy_deployments = []
        ### HEALTH CHECK THREAD ### - commenting out as further testing required
        # self._start_health_check_thread()

        ### CACHING ###
        if redis_host is not None and redis_port is not None and redis_password is not None:
            cache_config = {
                'type': 'redis',
@@ -45,11 +48,80 @@ class Router:
            cache_config = {
                "type": "local"
            }
        self.cache = litellm.Cache(cache_config) # use Redis for tracking load balancing
        if cache_responses:
            litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests
            self.cache_responses = cache_responses
        litellm.success_callback = [self.deployment_callback]
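As the branch above shows, the shared Redis cache is only wired up when host, port, and password are all provided; otherwise usage tracking falls back to a per-process local cache. A sketch of both construction paths (env var names are illustrative):

import os
from litellm import Router

# Shared Redis: TPM/RPM counters are visible across workers and machines.
router = Router(
    model_list=model_list,  # as sketched above
    redis_host=os.getenv("REDIS_HOST"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
    redis_password=os.getenv("REDIS_PASSWORD"),
    cache_responses=True,   # additionally cache completion responses
)

# Without Redis args the config falls back to {"type": "local"},
# so counters only reflect this process's traffic.
local_router = Router(model_list=model_list)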

    def _start_health_check_thread(self):
        """
        Starts a separate thread to perform health checks periodically.
        """
        health_check_thread = threading.Thread(target=self._perform_health_checks, daemon=True)
        health_check_thread.start()

    def _perform_health_checks(self):
        """
        Periodically performs health checks on the servers.
        Updates the list of healthy servers accordingly.
        """
        while True:
            self.healthy_deployments = self._health_check()
            # Adjust the time interval based on your needs
            time.sleep(15)

    def _health_check(self):
        """
        Performs a health check on the deployments.
        Returns the list of healthy deployments.
        """
        healthy_deployments = []
        for deployment in self.model_list:
            litellm_args = deployment["litellm_params"]
            try:
                start_time = time.time()
                litellm.completion(messages=[{"role": "user", "content": ""}], max_tokens=1, **litellm_args) # hit the server with a blank message to see how long it takes to respond
                end_time = time.time()
                response_time = end_time - start_time
                logging.debug(f"response_time: {response_time}")
                healthy_deployments.append((deployment, response_time))
                healthy_deployments.sort(key=lambda x: x[1])
            except Exception as e:
                pass
        return healthy_deployments
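Since each probe appends a (deployment, response_time) tuple and the list is re-sorted on the second element, the fastest responder always sits at the front. A tiny standalone illustration with made-up latencies:

# Hypothetical (deployment, response_time) pairs as _health_check builds them.
healthy_deployments = [
    ({"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "azure/eu-deployment"}}, 0.92),
    ({"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}}, 0.31),
]
healthy_deployments.sort(key=lambda x: x[1])  # fastest first
print(healthy_deployments[0][1])              # 0.31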

    def set_model_list(self, model_list: list):
        self.model_list = model_list
        self.model_names = [m["model_name"] for m in model_list]

    def get_model_names(self):
        return self.model_names

    def get_available_deployment(self,
                                 model: str,
                                 messages: Optional[List[Dict[str, str]]] = None,
                                 input: Optional[Union[str, List]] = None):
        """
        Returns the deployment with the shortest queue
        """
        ### COMMENTING OUT AS IT NEEDS FURTHER TESTING
        # logging.debug(f"self.healthy_deployments: {self.healthy_deployments}")
        # if len(self.healthy_deployments) > 0:
        #     for item in self.healthy_deployments:
        #         print(f"item: {item}")
        #         if item[0]["model_name"] == model: # first one in queue will be the one with the most availability
        #             return item
        # else:
        potential_deployments = []
        for item in self.model_list:
            if item["model_name"] == model:
                potential_deployments.append(item)
        item = random.choice(potential_deployments)
        return item

        raise ValueError("No models available.")

    ### COMPLETION + EMBEDDING FUNCTIONS

    def completion(self,
                   model: str,
@@ -66,8 +138,12 @@ class Router:
        deployment = self.get_available_deployment(model=model, messages=messages)
        data = deployment["litellm_params"]
        # call via litellm.completion()
        # return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
        # litellm.set_verbose = True
        return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
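The single return line leans on dict-merge precedence: later keys win, so the deployment's litellm_params are applied first and the caller's kwargs can override anything the router set. A standalone illustration:

data = {"model": "azure/chatgpt-v-2", "temperature": 0.0}  # deployment params (illustrative)
kwargs = {"temperature": 0.7, "max_tokens": 256}           # caller overrides

merged = {**data, "messages": [{"role": "user", "content": "hi"}], "caching": False, **kwargs}
print(merged["temperature"])  # 0.7, because caller kwargs are merged last and take precedence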


    async def acompletion(self,
                          model: str,
                          messages: List[Dict[str, str]],
@@ -117,127 +193,3 @@ class Router:

        data = deployment["litellm_params"]
        return await litellm.aembedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})

    def set_model_list(self, model_list: list):
        self.model_list = model_list
        self.model_names = [m["model_name"] for m in model_list]

    def get_model_names(self):
        return self.model_names

    def deployment_callback(
        self,
        kwargs,               # kwargs to completion
        completion_response,  # response from completion
        start_time, end_time  # start/end time
    ):
        """
        Function LiteLLM submits a callback to after a successful
        completion. The purpose of this is to update TPM/RPM usage per model.
        """
        model_name = kwargs.get('model', None)  # i.e. gpt35turbo
        custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None)  # i.e. azure
        if custom_llm_provider:
            model_name = f"{custom_llm_provider}/{model_name}"
        total_tokens = completion_response['usage']['total_tokens']
        self._set_deployment_usage(model_name, total_tokens)
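In other words, usage for a provider-routed request is recorded under a provider-qualified key. A worked example of the derivation, with illustrative stand-ins for what litellm passes to the callback:

kwargs = {"model": "chatgpt-v-2", "litellm_params": {"custom_llm_provider": "azure"}}
completion_response = {"usage": {"total_tokens": 187}}  # made-up token count

model_name = kwargs.get("model", None)
provider = kwargs.get("litellm_params", {}).get("custom_llm_provider", None)
if provider:
    model_name = f"{provider}/{model_name}"
total_tokens = completion_response["usage"]["total_tokens"]
print(model_name, total_tokens)  # azure/chatgpt-v-2 187 -- the key and amount recorded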

    def get_available_deployment(self,
                                 model: str,
                                 messages: Optional[List[Dict[str, str]]] = None,
                                 input: Optional[Union[str, List]] = None):
        """
        Returns a deployment with the lowest TPM/RPM usage.
        """
        # get list of potential deployments
        potential_deployments = []
        for item in self.model_list:
            if item["model_name"] == model:
                potential_deployments.append(item)

        # set first model as current model to calculate token count
        deployment = potential_deployments[0]

        # get encoding
        token_count = 0
        if messages is not None:
            token_count = litellm.token_counter(model=deployment["model_name"], messages=messages)
        elif input is not None:
            if isinstance(input, List):
                input_text = "".join(text for text in input)
            else:
                input_text = input
            token_count = litellm.token_counter(model=deployment["model_name"], text=input_text)

        # -----------------------
        # Find lowest used model
        # -----------------------
        lowest_tpm = float("inf")
        deployment = None

        # Go through all the models to get tpm, rpm
        for item in potential_deployments:
            item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])

            if item_tpm == 0:
                return item
            elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]:
                continue
            elif item_tpm < lowest_tpm:
                lowest_tpm = item_tpm
                deployment = item

        # if none, raise exception
        if deployment is None:
            raise ValueError("No models available.")

        # return model
        return deployment
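A worked pass through that loop, under assumed usage numbers: deployment A has used 10,000 of its 240,000 TPM and 40 of 1,800 RPM; deployment B has used 200,000 of 1,000,000 TPM and 900 of 9,000 RPM; the prompt counts 500 tokens. Neither is idle (so no early return) and neither would breach a limit, so the lower current TPM wins:

token_count = 500
candidates = [  # assumed per-minute usage and limits, not real data
    {"name": "A", "used_tpm": 10_000,  "used_rpm": 40,  "tpm": 240_000,   "rpm": 1_800},
    {"name": "B", "used_tpm": 200_000, "used_rpm": 900, "tpm": 1_000_000, "rpm": 9_000},
]

lowest_tpm, chosen = float("inf"), None
for item in candidates:
    if item["used_tpm"] == 0:          # untouched this minute: use it immediately
        chosen = item
        break
    if item["used_tpm"] + token_count > item["tpm"] or item["used_rpm"] + 1 >= item["rpm"]:
        continue                       # request would breach a limit: skip
    if item["used_tpm"] < lowest_tpm:  # track the least-loaded survivor
        lowest_tpm, chosen = item["used_tpm"], item

print(chosen["name"])  # A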

    def _get_deployment_usage(
        self,
        deployment_name: str
    ):
        # ------------
        # Setup values
        # ------------
        current_minute = datetime.now().strftime("%H-%M")
        tpm_key = f'{deployment_name}:tpm:{current_minute}'
        rpm_key = f'{deployment_name}:rpm:{current_minute}'

        # ------------
        # Return usage
        # ------------
        tpm = self.cache.get_cache(cache_key=tpm_key) or 0
        rpm = self.cache.get_cache(cache_key=rpm_key) or 0

        return int(tpm), int(rpm)
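Counters are bucketed by wall-clock minute: the keys embed %H-%M, so a new minute starts from fresh (empty) keys and old buckets simply age out via the cache TTL. A quick sketch of the key shapes (the deployment name is illustrative):

from datetime import datetime

deployment_name = "azure/chatgpt-v-2"              # illustrative
current_minute = datetime.now().strftime("%H-%M")  # e.g. "14-07"
print(f"{deployment_name}:tpm:{current_minute}")   # azure/chatgpt-v-2:tpm:14-07
print(f"{deployment_name}:rpm:{current_minute}")   # azure/chatgpt-v-2:rpm:14-07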

    def increment(self, key: str, increment_value: int):
        # get value
        cached_value = self.cache.get_cache(cache_key=key)
        # update value
        try:
            cached_value = cached_value + increment_value
        except:
            cached_value = increment_value
        # save updated value
        self.cache.add_cache(result=cached_value, cache_key=key, ttl=self.default_cache_time_seconds)

    def _set_deployment_usage(
        self,
        model_name: str,
        total_tokens: int
    ):
        # ------------
        # Setup values
        # ------------
        current_minute = datetime.now().strftime("%H-%M")
        tpm_key = f'{model_name}:tpm:{current_minute}'
        rpm_key = f'{model_name}:rpm:{current_minute}'

        # ------------
        # Update usage
        # ------------
        self.increment(tpm_key, total_tokens)
        self.increment(rpm_key, 1)
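Putting the pieces together, a single request flows through the router roughly as sketched below. One design note: increment() above is a non-atomic read-then-write, so concurrent workers sharing Redis can occasionally lose an update; presumably acceptable since the counters only steer routing.

# 1. Caller uses the alias; the router picks the lowest-usage deployment.
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)

# 2. On success, litellm fires deployment_callback, which roughly does
#    (key name illustrative):
#      increment("azure/chatgpt-v-2:tpm:<HH-MM>", usage.total_tokens)
#      increment("azure/chatgpt-v-2:rpm:<HH-MM>", 1)
# 3. The next get_available_deployment() call reads those counters back.
print(response["usage"]["total_tokens"])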
@@ -1,7 +1,7 @@
#### What this tests ####
# This tests calling batch_completions by running 100 messages together

import sys, os
import sys, os, time
import traceback, asyncio
import pytest
sys.path.insert(
@@ -57,6 +57,9 @@ load_dotenv()


def test_multiple_deployments():
    import concurrent
    # litellm.set_verbose=True
    futures = {}
    model_list = [{ # list of model deployments
        "model_name": "gpt-3.5-turbo", # openai model name
        "litellm_params": { # params for litellm completion/embedding call
@@ -89,21 +92,49 @@ def test_multiple_deployments():

    router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore

    completions = []
    with ThreadPoolExecutor(max_workers=100) as executor:
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        kwargs = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": "Hey, how's it going?"}]
            "messages": [{"role": "user", "content": """Context:

In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.

Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.

In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs.

Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.

The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.

---
Question:

Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
        }
        for _ in range(20):
            future = executor.submit(router.completion, **kwargs) # type: ignore
            completions.append(future)
        for _ in range(10):
            future = executor.submit(router.completion, **kwargs)
            futures[future] = future

    # Retrieve the results from the futures
    results = [future.result() for future in completions]
    while futures:
        done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            try:
                result = future.result()
                print(f"result: {result}")
                results.append(result)
                del futures[future]
            except Exception as e:
                print(f"Exception: {e}; traceback: {traceback.format_exc()}")
                del futures[future] # remove the done future

    # Check results
    print(results)


test_multiple_deployments()
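The concurrent.futures.wait(..., FIRST_COMPLETED) loop above drains results as they arrive; an equivalent, slightly shorter pattern for the same fan-out, sketched here with as_completed and reusing the router and kwargs from the test:

import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(router.completion, **kwargs) for _ in range(10)]
    for future in as_completed(futures):  # yields each future as it finishes
        try:
            results.append(future.result())
        except Exception:
            print(f"Exception: {traceback.format_exc()}")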

### FUNCTION CALLING

def test_function_calling():
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.13.5"
version = "0.13.6"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
@@ -26,7 +26,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "0.13.5"
version = "0.13.6"
version_files = [
    "pyproject.toml:^version"
]