diff --git a/litellm/router.py b/litellm/router.py
index d645e082fa..2e078f0761 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -301,119 +301,119 @@ class Router:
                 data[k] = v
         return await litellm.aembedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})
 
-    def deployment_callback(
-        self,
-        kwargs,                 # kwargs to completion
-        completion_response,    # response from completion
-        start_time, end_time    # start/end time
-    ):
-        """
-        Function LiteLLM submits a callback to after a successful
-        completion. Purpose of this is ti update TPM/RPM usage per model
-        """
-        model_name = kwargs.get('model', None)  # i.e. gpt35turbo
-        custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None)  # i.e. azure
-        if custom_llm_provider:
-            model_name = f"{custom_llm_provider}/{model_name}"
-        total_tokens = completion_response['usage']['total_tokens']
-        self._set_deployment_usage(model_name, total_tokens)
+    # def deployment_callback(
+    #     self,
+    #     kwargs,                 # kwargs to completion
+    #     completion_response,    # response from completion
+    #     start_time, end_time    # start/end time
+    # ):
+    #     """
+    #     Function LiteLLM submits a callback to after a successful
+    #     completion. Purpose of this is ti update TPM/RPM usage per model
+    #     """
+    #     model_name = kwargs.get('model', None)  # i.e. gpt35turbo
+    #     custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None)  # i.e. azure
+    #     if custom_llm_provider:
+    #         model_name = f"{custom_llm_provider}/{model_name}"
+    #     total_tokens = completion_response['usage']['total_tokens']
+    #     self._set_deployment_usage(model_name, total_tokens)
 
-    def get_available_deployment(self,
-                                 model: str,
-                                 messages: Optional[List[Dict[str, str]]] = None,
-                                 input: Optional[Union[str, List]] = None):
-        """
-        Returns a deployment with the lowest TPM/RPM usage.
-        """
-        # get list of potential deployments
-        potential_deployments = []
-        for item in self.model_list:
-            if item["model_name"] == model:
-                potential_deployments.append(item)
+    # def get_available_deployment(self,
+    #                              model: str,
+    #                              messages: Optional[List[Dict[str, str]]] = None,
+    #                              input: Optional[Union[str, List]] = None):
+    #     """
+    #     Returns a deployment with the lowest TPM/RPM usage.
+    #     """
+    #     # get list of potential deployments
+    #     potential_deployments = []
+    #     for item in self.model_list:
+    #         if item["model_name"] == model:
+    #             potential_deployments.append(item)
 
-        # set first model as current model to calculate token count
-        deployment = potential_deployments[0]
+    #     # set first model as current model to calculate token count
+    #     deployment = potential_deployments[0]
 
-        # get encoding
-        token_count = 0
-        if messages is not None:
-            token_count = litellm.token_counter(model=deployment["model_name"], messages=messages)
-        elif input is not None:
-            if isinstance(input, List):
-                input_text = "".join(text for text in input)
-            else:
-                input_text = input
-            token_count = litellm.token_counter(model=deployment["model_name"], text=input_text)
+    #     # get encoding
+    #     token_count = 0
+    #     if messages is not None:
+    #         token_count = litellm.token_counter(model=deployment["model_name"], messages=messages)
+    #     elif input is not None:
+    #         if isinstance(input, List):
+    #             input_text = "".join(text for text in input)
+    #         else:
+    #             input_text = input
+    #         token_count = litellm.token_counter(model=deployment["model_name"], text=input_text)
 
-        # -----------------------
-        # Find lowest used model
-        # ----------------------
-        lowest_tpm = float("inf")
-        deployment = None
+    #     # -----------------------
+    #     # Find lowest used model
+    #     # ----------------------
+    #     lowest_tpm = float("inf")
+    #     deployment = None
 
-        # Go through all the models to get tpm, rpm
-        for item in potential_deployments:
-            item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])
+    #     # Go through all the models to get tpm, rpm
+    #     for item in potential_deployments:
+    #         item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])
 
-            if item_tpm == 0:
-                return item
-            elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]:
-                continue
-            elif item_tpm < lowest_tpm:
-                lowest_tpm = item_tpm
-                deployment = item
+    #         if item_tpm == 0:
+    #             return item
+    #         elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]:
+    #             continue
+    #         elif item_tpm < lowest_tpm:
+    #             lowest_tpm = item_tpm
+    #             deployment = item
 
-        # if none, raise exception
-        if deployment is None:
-            raise ValueError("No models available.")
+    #     # if none, raise exception
+    #     if deployment is None:
+    #         raise ValueError("No models available.")
 
-        # return model
-        return deployment
+    #     # return model
+    #     return deployment
 
-    def _get_deployment_usage(
-        self,
-        deployment_name: str
-    ):
-        # ------------
-        # Setup values
-        # ------------
-        current_minute = datetime.now().strftime("%H-%M")
-        tpm_key = f'{deployment_name}:tpm:{current_minute}'
-        rpm_key = f'{deployment_name}:rpm:{current_minute}'
+    # def _get_deployment_usage(
+    #     self,
+    #     deployment_name: str
+    # ):
+    #     # ------------
+    #     # Setup values
+    #     # ------------
+    #     current_minute = datetime.now().strftime("%H-%M")
+    #     tpm_key = f'{deployment_name}:tpm:{current_minute}'
+    #     rpm_key = f'{deployment_name}:rpm:{current_minute}'
 
-        # ------------
-        # Return usage
-        # ------------
-        tpm = self.cache.get_cache(cache_key=tpm_key) or 0
-        rpm = self.cache.get_cache(cache_key=rpm_key) or 0
+    #     # ------------
+    #     # Return usage
+    #     # ------------
+    #     tpm = self.cache.get_cache(cache_key=tpm_key) or 0
+    #     rpm = self.cache.get_cache(cache_key=rpm_key) or 0
 
-        return int(tpm), int(rpm)
+    #     return int(tpm), int(rpm)
 
-    def increment(self, key: str, increment_value: int):
-        # get value
-        cached_value = self.cache.get_cache(cache_key=key)
-        # update value
-        try:
-            cached_value = cached_value + increment_value
-        except:
-            cached_value = increment_value
-        # save updated value
-        self.cache.add_cache(result=cached_value, cache_key=key, ttl=self.default_cache_time_seconds)
+    # def increment(self, key: str, increment_value: int):
+    #     # get value
+    #     cached_value = self.cache.get_cache(cache_key=key)
+    #     # update value
+    #     try:
+    #         cached_value = cached_value + increment_value
+    #     except:
+    #         cached_value = increment_value
+    #     # save updated value
+    #     self.cache.add_cache(result=cached_value, cache_key=key, ttl=self.default_cache_time_seconds)
 
-    def _set_deployment_usage(
-        self,
-        model_name: str,
-        total_tokens: int
-    ):
-        # ------------
-        # Setup values
-        # ------------
-        current_minute = datetime.now().strftime("%H-%M")
-        tpm_key = f'{model_name}:tpm:{current_minute}'
-        rpm_key = f'{model_name}:rpm:{current_minute}'
+    # def _set_deployment_usage(
+    #     self,
+    #     model_name: str,
+    #     total_tokens: int
+    # ):
+    #     # ------------
+    #     # Setup values
+    #     # ------------
+    #     current_minute = datetime.now().strftime("%H-%M")
+    #     tpm_key = f'{model_name}:tpm:{current_minute}'
+    #     rpm_key = f'{model_name}:rpm:{current_minute}'
 
-        # ------------
-        # Update usage
-        # ------------
-        self.increment(tpm_key, total_tokens)
-        self.increment(rpm_key, 1)
\ No newline at end of file
+    #     # ------------
+    #     # Update usage
+    #     # ------------
+    #     self.increment(tpm_key, total_tokens)
+    #     self.increment(rpm_key, 1)
\ No newline at end of file
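
For reference, the behavior this patch comments out can be read as a small, self-contained routine: record per-minute token (TPM) and request (RPM) counts per deployment, then route each call to the deployment with the lowest TPM that still has headroom under its `tpm`/`rpm` limits. The sketch below is an illustrative reconstruction of that logic, not litellm's API: the in-memory `usage` dict stands in for the Router's cache, and the helper names (`record_usage`, `pick_deployment`) and example deployments are hypothetical.

```python
from datetime import datetime

# In-memory stand-in for the Router's cache (assumption: the real code uses self.cache).
usage: dict[str, int] = {}

def _bump(key: str, value: int) -> None:
    """Increment a per-minute usage counter."""
    usage[key] = usage.get(key, 0) + value

def record_usage(deployment_name: str, total_tokens: int) -> None:
    """Mirrors _set_deployment_usage: add tokens and one request for the current minute."""
    minute = datetime.now().strftime("%H-%M")
    _bump(f"{deployment_name}:tpm:{minute}", total_tokens)
    _bump(f"{deployment_name}:rpm:{minute}", 1)

def current_usage(deployment_name: str) -> tuple[int, int]:
    """Mirrors _get_deployment_usage: (tokens, requests) used this minute."""
    minute = datetime.now().strftime("%H-%M")
    return (usage.get(f"{deployment_name}:tpm:{minute}", 0),
            usage.get(f"{deployment_name}:rpm:{minute}", 0))

def pick_deployment(model_list: list[dict], model: str, token_count: int) -> dict:
    """Mirrors get_available_deployment: lowest-TPM deployment with headroom."""
    candidates = [m for m in model_list if m["model_name"] == model]
    lowest_tpm = float("inf")
    chosen = None
    for item in candidates:
        tpm, rpm = current_usage(item["litellm_params"]["model"])
        if tpm == 0:                 # an idle deployment wins immediately
            return item
        if tpm + token_count > item["tpm"] or rpm + 1 >= item["rpm"]:
            continue                 # this request would exceed its per-minute limits
        if tpm < lowest_tpm:
            lowest_tpm, chosen = tpm, item
    if chosen is None:
        raise ValueError("No models available.")
    return chosen

# Hypothetical deployments, mirroring the model_list shape used above.
model_list = [
    {"model_name": "gpt-3.5-turbo", "tpm": 240_000, "rpm": 1_800,
     "litellm_params": {"model": "azure/chatgpt-v-2"}},
    {"model_name": "gpt-3.5-turbo", "tpm": 90_000, "rpm": 600,
     "litellm_params": {"model": "gpt-3.5-turbo"}},
]
record_usage("azure/chatgpt-v-2", total_tokens=1_200)
# The second deployment has no usage this minute, so it is returned immediately.
print(pick_deployment(model_list, "gpt-3.5-turbo", token_count=500)["litellm_params"]["model"])
```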