diff --git a/litellm/router.py b/litellm/router.py
index bcf2d2cb6..14efc2a56 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1512,31 +1512,6 @@ class Router:
                     ## LOGGING
                     kwargs = self.log_retry(kwargs=kwargs, e=e)
                     remaining_retries = num_retries - current_attempt
-                    # if "No models available" in str(e):
-                    #     timeout = litellm._calculate_retry_after(
-                    #         remaining_retries=remaining_retries,
-                    #         max_retries=num_retries,
-                    #         min_timeout=self.retry_after,
-                    #     )
-                    #     await asyncio.sleep(timeout)
-                    # elif (
-                    #     hasattr(e, "status_code")
-                    #     and hasattr(e, "response")
-                    #     and litellm._should_retry(status_code=e.status_code)
-                    # ):
-                    #     if hasattr(e.response, "headers"):
-                    #         timeout = litellm._calculate_retry_after(
-                    #             remaining_retries=remaining_retries,
-                    #             max_retries=num_retries,
-                    #             response_headers=e.response.headers,
-                    #             min_timeout=self.retry_after,
-                    #         )
-                    #     else:
-                    #         timeout = litellm._calculate_retry_after(
-                    #             remaining_retries=remaining_retries,
-                    #             max_retries=num_retries,
-                    #             min_timeout=self.retry_after,
-                    #         )
                     _timeout = self._router_should_retry(
                         e=original_exception,
                         remaining_retries=remaining_retries,
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index a4110518b..7027050e1 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -127,7 +127,7 @@ def test_sync_fallbacks():
         response = router.completion(**kwargs)
         print(f"response: {response}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4

         print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
         router.reset()
@@ -209,12 +209,13 @@ async def test_async_fallbacks():
     user_message = "Hello, how are you?"
     messages = [{"content": user_message, "role": "user"}]
     try:
+        kwargs["model"] = "azure/gpt-3.5-turbo"
         response = await router.acompletion(**kwargs)
         print(f"customHandler.previous_models: {customHandler.previous_models}")
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -258,7 +259,6 @@ def test_sync_fallbacks_embeddings():
         model_list=model_list,
         fallbacks=[{"bad-azure-embedding-model": ["good-azure-embedding-model"]}],
         set_verbose=False,
-        num_retries=0,
     )
     customHandler = MyCustomHandler()
     litellm.callbacks = [customHandler]
@@ -269,7 +269,7 @@ def test_sync_fallbacks_embeddings():
         response = router.embedding(**kwargs)
         print(f"customHandler.previous_models: {customHandler.previous_models}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -323,7 +323,7 @@ async def test_async_fallbacks_embeddings():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -394,7 +394,7 @@ def test_dynamic_fallbacks_sync():
         },
     ]

-    router = Router(model_list=model_list, set_verbose=True, num_retries=0)
+    router = Router(model_list=model_list, set_verbose=True)
     kwargs = {}
     kwargs["model"] = "azure/gpt-3.5-turbo"
     kwargs["messages"] = [{"role": "user", "content": "Hey, how's it going?"}]
@@ -402,7 +402,7 @@ def test_dynamic_fallbacks_sync():
         response = router.completion(**kwargs)
         print(f"response: {response}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except Exception as e:
         pytest.fail(f"An exception occurred - {e}")
@@ -488,7 +488,7 @@ async def test_dynamic_fallbacks_async():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except Exception as e:
         pytest.fail(f"An exception occurred - {e}")
@@ -573,7 +573,7 @@ async def test_async_fallbacks_streaming():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -766,10 +766,10 @@ def test_usage_based_routing_fallbacks():
     load_dotenv()

     # Constants for TPM and RPM allocation
-    AZURE_FAST_TPM = 3
-    AZURE_BASIC_TPM = 4
-    OPENAI_TPM = 400
-    ANTHROPIC_TPM = 100000
+    AZURE_FAST_RPM = 3
+    AZURE_BASIC_RPM = 4
+    OPENAI_RPM = 10
+    ANTHROPIC_RPM = 100000

     def get_azure_params(deployment_name: str):
         params = {
@@ -798,22 +798,26 @@ def test_usage_based_routing_fallbacks():
         {
             "model_name": "azure/gpt-4-fast",
             "litellm_params": get_azure_params("chatgpt-v-2"),
-            "tpm": AZURE_FAST_TPM,
+            "model_info": {"id": 1},
+            "rpm": AZURE_FAST_RPM,
         },
         {
             "model_name": "azure/gpt-4-basic",
             "litellm_params": get_azure_params("chatgpt-v-2"),
-            "tpm": AZURE_BASIC_TPM,
+            "model_info": {"id": 2},
+            "rpm": AZURE_BASIC_RPM,
         },
         {
             "model_name": "openai-gpt-4",
             "litellm_params": get_openai_params("gpt-3.5-turbo"),
-            "tpm": OPENAI_TPM,
+            "model_info": {"id": 3},
+            "rpm": OPENAI_RPM,
         },
         {
             "model_name": "anthropic-claude-instant-1.2",
             "litellm_params": get_anthropic_params("claude-instant-1.2"),
-            "tpm": ANTHROPIC_TPM,
+            "model_info": {"id": 4},
+            "rpm": ANTHROPIC_RPM,
         },
     ]
     # litellm.set_verbose=True
@@ -844,10 +848,10 @@ def test_usage_based_routing_fallbacks():
             mock_response="very nice to meet you",
         )
         print("response: ", response)
-        print("response._hidden_params: ", response._hidden_params)
+        print(f"response._hidden_params: {response._hidden_params}")
         # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
         # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
-        assert response._hidden_params["custom_llm_provider"] == "openai"
+        assert response._hidden_params["model_id"] == "1"

         # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
         for i in range(20):
@@ -861,7 +865,7 @@ def test_usage_based_routing_fallbacks():
             print("response._hidden_params: ", response._hidden_params)
             if i == 19:
                 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
-                assert response._hidden_params["custom_llm_provider"] == "anthropic"
+                assert response._hidden_params["model_id"] == "4"

     except Exception as e:
         pytest.fail(f"An exception occurred {e}")
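
Note: the back-off logic that was previously commented out in router.py is now consolidated behind the self._router_should_retry(...) call shown above. As a rough, illustrative sketch only (the helper's actual name, signature, and defaults live in litellm/router.py and may differ), it can be thought of as a thin wrapper around litellm._calculate_retry_after, mirroring the removed comments:

    import litellm


    def router_should_retry(
        e: Exception,
        remaining_retries: int,
        num_retries: int,
        min_timeout: float = 0.0,
    ) -> float:
        """Return how long to sleep before the next retry attempt (sketch)."""
        response = getattr(e, "response", None)
        if response is not None and hasattr(response, "headers"):
            # Respect provider rate-limit headers (e.g. Retry-After) when they exist.
            return litellm._calculate_retry_after(
                remaining_retries=remaining_retries,
                max_retries=num_retries,
                response_headers=response.headers,
                min_timeout=min_timeout,
            )
        # Otherwise fall back to the default back-off calculation.
        return litellm._calculate_retry_after(
            remaining_retries=remaining_retries,
            max_retries=num_retries,
            min_timeout=min_timeout,
        )

The router then sleeps for the returned timeout before the next attempt, just as the removed commented-out code did with await asyncio.sleep(timeout).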