Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-26 19:24:27 +00:00
fix(route_llm_request.py): move to using common router, even for client-side credentials (#8966)
* fix(route_llm_request.py): move to using common router, even for client-side credentials. Ensures fallbacks / cooldown logic still works.
* test(test_route_llm_request.py): add unit test for route request
* feat(router.py): generate unique model id when a clientside credential is passed in. Prevents cooldowns for api key 1 from impacting api key 2.
* test(test_router.py): update testing to ensure original litellm params are not mutated
* fix(router.py): upsert clientside call into the llm router model list. Enables cooldown logic to work accurately.
* fix: fix linting error
* test(test_router_utils.py): add direct test for new util on router
parent bd2231400f
commit ae6f91a56d
9 changed files with 273 additions and 36 deletions
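To make the "generate unique model id" change concrete: the sketch below is a hypothetical illustration of the idea, not litellm's actual implementation. The helper name _make_clientside_deployment_id and the hashing scheme are assumptions; only the behavior it models (per-credential deployment ids, so one credential's cooldown never touches another's) comes from the commit message above.

# Hypothetical sketch, NOT litellm's real code: derive a deployment id
# from the base model id plus a digest of the request's own credential,
# so cooldown state is tracked per credential rather than per shared model.
import hashlib

def _make_clientside_deployment_id(base_id: str, api_key: str) -> str:
    # Hash the key so the raw credential never appears in the model id.
    digest = hashlib.sha256(api_key.encode()).hexdigest()[:8]
    return f"{base_id}-clientside-{digest}"

# A cooldown recorded against key 1's deployment id can never match key 2's:
id_for_key_1 = _make_clientside_deployment_id("123", "my-bad-key-1")
id_for_key_2 = _make_clientside_deployment_id("123", "my-good-key-2")
assert id_for_key_1 != id_for_key_2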
@@ -692,3 +692,50 @@ def test_router_fallbacks_with_cooldowns_and_model_id():
         model="gpt-3.5-turbo",
         messages=[{"role": "user", "content": "hi"}],
     )
+
+
+@pytest.mark.asyncio()
+async def test_router_fallbacks_with_cooldowns_and_dynamic_credentials():
+    """
+    Ensure cooldown on credential 1 does not affect credential 2
+    """
+    from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
+
+    litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1},
+                "model_info": {
+                    "id": "123",
+                },
+            }
+        ]
+    )
+
+    ## trigger ratelimit
+    try:
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "hi"}],
+            api_key="my-bad-key-1",
+            mock_response="litellm.RateLimitError",
+        )
+        pytest.fail("Expected RateLimitError")
+    except litellm.RateLimitError:
+        pass
+
+    await asyncio.sleep(1)
+
+    cooldown_list = await _async_get_cooldown_deployments(
+        litellm_router_instance=router, parent_otel_span=None
+    )
+    print("cooldown_list: ", cooldown_list)
+    assert len(cooldown_list) == 1
+
+    await router.acompletion(
+        model="gpt-3.5-turbo",
+        api_key=os.getenv("OPENAI_API_KEY"),
+        messages=[{"role": "user", "content": "hi"}],
+    )
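Note on running the new test: it can be invoked as pytest test_router_utils.py::test_router_fallbacks_with_cooldowns_and_dynamic_credentials (the exact path within the repo's test tree may vary by version). The first acompletion call is mocked via mock_response, so "my-bad-key-1" is never sent to any provider; the final acompletion call has no mock and reads os.getenv("OPENAI_API_KEY"), so a valid key must be set in the environment for the test to pass end to end.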