fix(router.py): log when a call is retried or fallback happens

2025-04-25 18:54:30 +00:00 · 2023-12-05 21:29:51 -08:00 · 2023-12-05 21:29:51 -08:00 · 7b83238cb5
commit 7b83238cb5
parent 642c62f7b7
2 changed files with 126 additions and 39 deletions
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@ -10,6 +10,30 @@ sys.path.insert(

 import litellm
 from litellm import Router
+from litellm.integrations.custom_logger import CustomLogger
+
+class MyCustomHandler(CustomLogger):
+    """
+    Test callback that records how many previous models (retries / fallbacks)
+    were reported in the metadata of successful calls.
+    """
+    success: bool = False  # set to True once a success event has been logged
+    failure: bool = False  # set to True once a failure event has been logged
+    previous_models: int = 0  # running total of `previous_models` entries seen on success
+
+    def log_pre_api_call(self, model, messages, kwargs):
+        print("Pre-API Call")
+
+    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
+        print("Post-API Call")
+
+    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
+        print("On Stream")
+
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        # Expected shape:
+        # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
+        # Any level may be missing or None (e.g. a call made without metadata),
+        # so fall back to an empty list instead of raising inside the callback.
+        litellm_params = kwargs.get("litellm_params") or {}
+        metadata = litellm_params.get("metadata") or {}
+        previous_models = metadata.get("previous_models") or []
+        print(f"previous_models: {previous_models}")
+        self.previous_models += len(previous_models)
+        self.success = True
+        print(f"self.previous_models: {self.previous_models}")
+        print("On Success")
+
+    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
+        self.failure = True
+        print("On Failure")

 model_list = [
    { # list of model deployments 
@ -27,7 +51,7 @@ model_list = [
 		"model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name 
 		"litellm_params": { # params for litellm completion/embedding call 
 			"model": "azure/chatgpt-v-2", 
-			"api_key": "bad-key",
+			"api_key": os.getenv("AZURE_API_KEY"),
 			"api_version": os.getenv("AZURE_API_VERSION"),
 			"api_base": os.getenv("AZURE_API_BASE")
 		},
@ -67,52 +91,74 @@ model_list = [



-router = Router(model_list=model_list, 
-                fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], 
-                context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
-                set_verbose=True)
-
 kwargs = {"model": "azure/gpt-3.5-turbo", "messages": [{"role": "user", "content":"Hey, how's it going?"}]}

 def test_sync_fallbacks():
+    """
+    Check that a synchronous router.completion() call reports exactly one
+    previous model (0 retries, 1 fallback) to the success callback.
+    """
    try:
        litellm.set_verbose = True
+        customHandler = MyCustomHandler()
+        litellm.callbacks = [customHandler]
+        router = Router(model_list=model_list, 
+                fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], 
+                context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
+                set_verbose=False)
        response = router.completion(**kwargs)
        print(f"response: {response}")
-        router.flush_cache()
+        time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
+        assert customHandler.previous_models == 1 # 0 retries, 1 fallback
+        router.reset()
    except Exception as e:
-        print(e)
+        # AssertionError is an Exception subclass: only printing it here would let
+        # the test pass even when the fallback assertion above fails.
+        pytest.fail(f"An exception occurred: {e}")
 # test_sync_fallbacks() 

 def test_async_fallbacks(): 
+    """
+    Check that an async router.acompletion() call reports exactly one
+    previous model (0 retries, 1 fallback) to the success callback.
+    """
    litellm.set_verbose = False
+    router = Router(model_list=model_list, 
+                fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], 
+                context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
+                set_verbose=False)
    async def test_get_response():
+        customHandler = MyCustomHandler()
+        litellm.callbacks = [customHandler]
-        user_message = "Hello, how are you?"
-        messages = [{"content": user_message, "role": "user"}]
        try:
            response = await router.acompletion(**kwargs)
-            # response = await response
-            print(f"response: {response}")
-            router.flush_cache()
+            print(f"customHandler.previous_models: {customHandler.previous_models}")
+            # Yield to the event loop instead of blocking it with time.sleep();
+            # still allows a delay as success_callbacks are on a separate thread.
+            await asyncio.sleep(0.05)
+            assert customHandler.previous_models == 1 # 0 retries, 1 fallback
        except litellm.Timeout as e: 
            pass
        except Exception as e:
            pytest.fail(f"An exception occurred: {e}")
-
+        finally:
+            # Single reset point: runs on success, timeout and failure alike.
+            router.reset()
    asyncio.run(test_get_response())

 # test_async_fallbacks()

-def test_sync_context_window_fallbacks(): 
-    try:
-        sample_text = "Say error 50 times" * 10000
-        kwargs["model"] = "azure/gpt-3.5-turbo-context-fallback"
-        kwargs["messages"] = [{"role": "user", "content": sample_text}]
-        response = router.completion(**kwargs)
-        print(f"response: {response}")
-        router.reset()
-    except Exception as e:
-        print(e)
+## Commented out: the sample text exceeds the context window of both gpt-3.5-turbo and gpt-3.5-turbo-16k. TODO: need a better message here before re-enabling.
+# def test_sync_context_window_fallbacks(): 
+#     try:
+#         customHandler = MyCustomHandler()
+#         litellm.callbacks = [customHandler]
+#         sample_text = "Say error 50 times" * 10000
+#         kwargs["model"] = "azure/gpt-3.5-turbo-context-fallback"
+#         kwargs["messages"] = [{"role": "user", "content": sample_text}]
+#         router = Router(model_list=model_list, 
+#                 fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], 
+#                 context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
+#                 set_verbose=False)
+#         response = router.completion(**kwargs)
+#         print(f"response: {response}")
+#         time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
+#         assert customHandler.previous_models == 1 # 0 retries, 1 fallback
+#         router.reset()
+#     except Exception as e:
+#         print(f"An exception occurred - {e}")
+#     finally: 
+#         router.reset()

 # test_sync_context_window_fallbacks()

@ -121,6 +167,8 @@ def test_dynamic_fallbacks_sync():
    Allow setting the fallback in the router.completion() call. 
    """
    try:
+          customHandler = MyCustomHandler()
+          litellm.callbacks = [customHandler]
          router = Router(model_list=model_list, set_verbose=True)
          kwargs = {}
          kwargs["model"] = "azure/gpt-3.5-turbo"
@ -128,6 +176,8 @@ def test_dynamic_fallbacks_sync():
          kwargs["fallbacks"] = [{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}]
          response = router.completion(**kwargs)
          print(f"response: {response}")
+          time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
+          assert customHandler.previous_models == 1 # 0 retries, 1 fallback
          router.reset()
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")
@ -140,6 +190,8 @@ def test_dynamic_fallbacks_async():
    """
    async def test_get_response():
         try: 
+            customHandler = MyCustomHandler()
+            litellm.callbacks = [customHandler]
            router = Router(model_list=model_list, set_verbose=True)
            kwargs = {}
            kwargs["model"] = "azure/gpt-3.5-turbo"
@ -147,6 +199,8 @@ def test_dynamic_fallbacks_async():
            kwargs["fallbacks"] = [{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}]
            response = await router.acompletion(**kwargs)
            print(f"response: {response}")
+            time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
+            assert customHandler.previous_models == 1 # 0 retries, 1 fallback
            router.reset()
         except Exception as e:
              pytest.fail(f"An exception occurred - {e}")