diff --git a/litellm/main.py b/litellm/main.py
index 454f7f716..569418eca 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -387,6 +387,19 @@ def mock_completion(
     - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
     """
     try:
+        ## LOGGING
+        logging.pre_call(
+            input=messages,
+            api_key="mock-key",
+        )
+        if isinstance(mock_response, Exception):
+            raise litellm.APIError(
+                status_code=500,  # type: ignore
+                message=str(mock_response),
+                llm_provider="openai",  # type: ignore
+                model=model,  # type: ignore
+                request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
+            )
         model_response = ModelResponse(stream=stream)
         if stream is True:
             # don't try to access stream object,
diff --git a/litellm/router.py b/litellm/router.py
index 4f1d6b96e..bcf2d2cb6 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1418,13 +1418,6 @@ class Router:
                 traceback.print_exc()
             raise original_exception

-    async def _async_router_should_retry(
-        self, e: Exception, remaining_retries: int, num_retries: int
-    ):
-        """
-        Calculate back-off, then retry
-        """
-
     async def async_function_with_retries(self, *args, **kwargs):
         verbose_router_logger.debug(
             f"Inside async function with retries: args - {args}; kwargs - {kwargs}"
@@ -1674,6 +1667,7 @@ class Router:
             context_window_fallbacks = kwargs.pop(
                 "context_window_fallbacks", self.context_window_fallbacks
             )
+
             try:
                 # if the function call is successful, no exception will be raised and we'll break out of the loop
                 response = original_function(*args, **kwargs)
@@ -1751,10 +1745,11 @@ class Router:
             )  # i.e. azure
             metadata = kwargs.get("litellm_params", {}).get("metadata", None)
             _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
+
             if isinstance(_model_info, dict):
                 deployment_id = _model_info.get("id", None)
                 self._set_cooldown_deployments(
-                    deployment_id
+                    exception_status=exception_status, deployment=deployment_id
                 )  # setting deployment_id in cooldown deployments
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
@@ -1814,9 +1809,15 @@ class Router:
                 key=rpm_key, value=request_count, local_only=True
             )  # don't change existing ttl

-    def _set_cooldown_deployments(self, deployment: Optional[str] = None):
+    def _set_cooldown_deployments(
+        self, exception_status: Union[str, int], deployment: Optional[str] = None
+    ):
         """
         Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
+
+        or
+
+        the exception is not one that should be immediately retried (e.g. 401)
         """
         if deployment is None:
             return
@@ -1833,7 +1834,20 @@ class Router:
             f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
         cooldown_time = self.cooldown_time or 1
-        if updated_fails > self.allowed_fails:
+
+        if isinstance(exception_status, str):
+            try:
+                exception_status = int(exception_status)
+            except Exception as e:
+                verbose_router_logger.debug(
+                    "Unable to cast exception status to int {}. Defaulting to status=500.".format(
+                        exception_status
+                    )
+                )
+                exception_status = 500
+        _should_retry = litellm._should_retry(status_code=exception_status)
+
+        if updated_fails > self.allowed_fails or _should_retry == False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
             cached_value = self.cache.get_cache(key=cooldown_key)
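
Note: with the router.py hunks above, a deployment is now put into cooldown either when it exceeds allowed_fails within the current minute or when the exception status is one that should not be retried at all. The snippet below is an illustrative sketch only; should_cooldown is a hypothetical helper, and it assumes litellm._should_retry (the same function the diff calls) follows the usual OpenAI-style convention of retrying 408/409/429/5xx but not other 4xx errors such as 401.

# Illustrative sketch of the cooldown decision added to _set_cooldown_deployments.
# Assumes litellm._should_retry returns False for non-retryable statuses (e.g. 401)
# and True for transient ones (e.g. 429, 500).
import litellm

def should_cooldown(exception_status, updated_fails, allowed_fails=1):
    if isinstance(exception_status, str):
        try:
            exception_status = int(exception_status)
        except Exception:
            exception_status = 500  # same fallback as the diff
    _should_retry = litellm._should_retry(status_code=exception_status)
    # cool down when the deployment keeps failing, or the error is not worth retrying
    return updated_fails > allowed_fails or not _should_retry

print(should_cooldown(401, updated_fails=1))  # expected True: auth errors cool down immediately
print(should_cooldown(429, updated_fails=1))  # expected False: rate limits are retried first
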
diff --git a/litellm/tests/conftest.py b/litellm/tests/conftest.py
index 4cd277b31..8c2ce781f 100644
--- a/litellm/tests/conftest.py
+++ b/litellm/tests/conftest.py
@@ -19,6 +19,7 @@ def setup_and_teardown():
         0, os.path.abspath("../..")
     )  # Adds the project directory to the system path
     import litellm
+    from litellm import Router

     importlib.reload(litellm)
     import asyncio
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index 8c6b9fa01..2d277d749 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -1154,6 +1154,7 @@ def test_consistent_model_id():
     assert id1 == id2


+@pytest.mark.skip(reason="local test")
 def test_reading_keys_os_environ():
     import openai

@@ -1253,6 +1254,7 @@ def test_reading_keys_os_environ():


 # test_reading_keys_os_environ()
+@pytest.mark.skip(reason="local test")
 def test_reading_openai_keys_os_environ():
     import openai

diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index 364319929..a4110518b 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
     def log_pre_api_call(self, model, messages, kwargs):
         print(f"Pre-API Call")
         print(
-            f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
+            f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
         )
-        self.previous_models += len(
-            kwargs["litellm_params"]["metadata"]["previous_models"]
+        self.previous_models = len(
+            kwargs["litellm_params"]["metadata"].get("previous_models", [])
         )  # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
         print(f"self.previous_models: {self.previous_models}")

@@ -140,7 +140,7 @@ def test_sync_fallbacks():

 @pytest.mark.asyncio
 async def test_async_fallbacks():
-    litellm.set_verbose = False
+    litellm.set_verbose = True
     model_list = [
         {  # list of model deployments
             "model_name": "azure/gpt-3.5-turbo",  # openai model name
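
Note: the new test file below leans on the mock_completion change in litellm/main.py at the top of this diff: passing an Exception as mock_response now raises litellm.APIError with status 500 instead of returning a canned response. A minimal sketch of that path, assuming mock_response is forwarded from litellm.completion to mock_completion as usual; the model name and message here are placeholders.

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        mock_response=Exception("Invalid Request"),  # simulate a provider failure
    )
except litellm.APIError as e:
    # the mocked exception surfaces as a 500-level APIError, which the router treats as retryable
    print(e.status_code, str(e))
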
"exception_string": }]} + print(f"self.previous_models: {self.previous_models}") + + def log_post_api_call(self, kwargs, response_obj, start_time, end_time): + print( + f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}" + ) + + def log_stream_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Stream") + + def async_log_stream_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Stream") + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Success") + + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Success") + + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Failure") + + +""" +Test sync + async + +- Authorization Errors +- Random API Error +""" + + +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"]) +@pytest.mark.asyncio +async def test_router_retries_errors(sync_mode, error_type): + """ + - Auth Error -> 0 retries + - API Error -> 2 retries + """ + + _api_key = ( + "bad-key" if error_type == "Authorization Error" else os.getenv("AZURE_API_KEY") + ) + print(f"_api_key: {_api_key}") + model_list = [ + { + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", + "api_key": _api_key, + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + }, + "tpm": 240000, + "rpm": 1800, + }, + ] + + router = Router(model_list=model_list, allowed_fails=3) + + customHandler = MyCustomHandler() + litellm.callbacks = [customHandler] + user_message = "Hello, how are you?" + messages = [{"content": user_message, "role": "user"}] + + kwargs = { + "model": "azure/gpt-3.5-turbo", + "messages": messages, + "mock_response": ( + None + if error_type == "Authorization Error" + else Exception("Invalid Request") + ), + } + + try: + if sync_mode: + response = router.completion(**kwargs) + else: + response = await router.acompletion(**kwargs) + except Exception as e: + pass + + await asyncio.sleep( + 0.05 + ) # allow a delay as success_callbacks are on a separate thread + print(f"customHandler.previous_models: {customHandler.previous_models}") + + if error_type == "Authorization Error": + assert customHandler.previous_models == 0 # 0 retries + else: + assert customHandler.previous_models == 2 # 2 retries