From ae6f91a56d407330d8496f4c093ef5a8c1271a7c Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Mon, 3 Mar 2025 22:57:08 -0800
Subject: [PATCH] =?UTF-8?q?fix(route=5Fllm=5Frequest.py):=20move=20to=20us?=
 =?UTF-8?q?ing=20common=20router,=20even=20for=20clie=E2=80=A6=20(#8966)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(route_llm_request.py): move to using common router, even for client-side credentials

ensures fallbacks / cooldown logic still works

* test(test_route_llm_request.py): add unit test for route request

* feat(router.py): generate unique model id when clientside credential passed in

Prevents cooldowns for api key 1 from impacting api key 2

* test(test_router.py): update testing to ensure original litellm params not mutated

* fix(router.py): upsert clientside call into llm router model list

enables cooldown logic to work accurately

* fix: fix linting error

* test(test_router_utils.py): add direct test for new util on router
---
 litellm/proxy/_new_secret_config.yaml         | 39 +++----------
 litellm/proxy/route_llm_request.py            |  2 +-
 litellm/router.py                             | 57 +++++++++++++++++--
 .../clientside_credential_handler.py          | 37 ++++++++++++
 litellm/router_utils/cooldown_handlers.py     | 16 ++++++
 tests/litellm/proxy/test_route_llm_request.py | 46 +++++++++++++++
 tests/local_testing/test_router.py            | 43 ++++++++++++++
 tests/local_testing/test_router_cooldowns.py  | 47 +++++++++++++++
 tests/local_testing/test_router_utils.py      | 22 +++++++
 9 files changed, 273 insertions(+), 36 deletions(-)
 create mode 100644 litellm/router_utils/clientside_credential_handler.py
 create mode 100644 tests/litellm/proxy/test_route_llm_request.py

diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 649d5a25d0..ba355c0dc4 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,34 +1,11 @@
 model_list:
-  - model_name: claude-3.7
+  - model_name: openai-gpt-4o-mini-2024-07-18
     litellm_params:
-      model: openai/gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-      api_base: http://0.0.0.0:8090
-  - model_name: deepseek-r1
-    litellm_params:
-      model: bedrock/deepseek_r1/arn:aws:bedrock:us-west-2:888602223428:imported-model/bnnr6463ejgf
-  - model_name: deepseek-r1-api
-    litellm_params:
-      model: deepseek/deepseek-reasoner
-  - model_name: cohere.embed-english-v3
-    litellm_params:
-      model: bedrock/cohere.embed-english-v3
-      api_key: os.environ/COHERE_API_KEY
-  - model_name: bedrock-claude-3-7
-    litellm_params:
-      model: bedrock/invoke/us.anthropic.claude-3-7-sonnet-20250219-v1:0
-  - model_name: bedrock-claude-3-5-sonnet
-    litellm_params:
-      model: bedrock/invoke/us.anthropic.claude-3-5-sonnet-20240620-v1:0
-  - model_name: bedrock-nova
-    litellm_params:
-      model: bedrock/us.amazon.nova-pro-v1:0
-  - model_name: gpt-4o
-    litellm_params:
-      model: openai/gpt-4o
+      model: openai/gpt-4o-mini-2024-07-18
+      configurable_clientside_auth_params: ["api_key"]
+      api_key: "my-bad-key"
+  # - model_name: openai-fallback-model
+  #   litellm_params:
+  #     model: openai/gpt-3.5-turbo
+
 
-litellm_settings:
-  cache: true
-  cache_params: # set cache params for redis
-    type: redis
-    namespace: "litellm.caching"
diff --git a/litellm/proxy/route_llm_request.py b/litellm/proxy/route_llm_request.py
index 6102a26b23..6683a18b9a 100644
--- a/litellm/proxy/route_llm_request.py
+++ b/litellm/proxy/route_llm_request.py
@@ -53,7 +53,7 @@ async def route_request(
     """
     router_model_names = llm_router.model_names if llm_router is not None else []
     if "api_key" in data or "api_base" in data:
"api_key" in data or "api_base" in data: - return getattr(litellm, f"{route_type}")(**data) + return getattr(llm_router, f"{route_type}")(**data) elif "user_config" in data: router_config = data.pop("user_config") diff --git a/litellm/router.py b/litellm/router.py index 4e93baf328..880950b227 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -67,6 +67,10 @@ from litellm.router_utils.batch_utils import ( replace_model_in_jsonl, ) from litellm.router_utils.client_initalization_utils import InitalizeOpenAISDKClient +from litellm.router_utils.clientside_credential_handler import ( + get_dynamic_litellm_params, + is_clientside_credential, +) from litellm.router_utils.cooldown_cache import CooldownCache from litellm.router_utils.cooldown_handlers import ( DEFAULT_COOLDOWN_TIME_SECONDS, @@ -1067,20 +1071,61 @@ class Router: elif k == "metadata": kwargs[k].update(v) + def _handle_clientside_credential( + self, deployment: dict, kwargs: dict + ) -> Deployment: + """ + Handle clientside credential + """ + model_info = deployment.get("model_info", {}).copy() + litellm_params = deployment["litellm_params"].copy() + dynamic_litellm_params = get_dynamic_litellm_params( + litellm_params=litellm_params, request_kwargs=kwargs + ) + metadata = kwargs.get("metadata", {}) + model_group = cast(str, metadata.get("model_group")) + _model_id = self._generate_model_id( + model_group=model_group, litellm_params=dynamic_litellm_params + ) + original_model_id = model_info.get("id") + model_info["id"] = _model_id + model_info["original_model_id"] = original_model_id + deployment_pydantic_obj = Deployment( + model_name=model_group, + litellm_params=LiteLLM_Params(**dynamic_litellm_params), + model_info=model_info, + ) + self.upsert_deployment( + deployment=deployment_pydantic_obj + ) # add new deployment to router + return deployment_pydantic_obj + def _update_kwargs_with_deployment(self, deployment: dict, kwargs: dict) -> None: """ 2 jobs: - Adds selected deployment, model_info and api_base to kwargs["metadata"] (used for logging) - Adds default litellm params to kwargs, if set. 
""" + model_info = deployment.get("model_info", {}).copy() + deployment_model_name = deployment["litellm_params"]["model"] + deployment_api_base = deployment["litellm_params"].get("api_base") + if is_clientside_credential(request_kwargs=kwargs): + deployment_pydantic_obj = self._handle_clientside_credential( + deployment=deployment, kwargs=kwargs + ) + model_info = deployment_pydantic_obj.model_info.model_dump() + deployment_model_name = deployment_pydantic_obj.litellm_params.model + deployment_api_base = deployment_pydantic_obj.litellm_params.api_base + kwargs.setdefault("metadata", {}).update( { - "deployment": deployment["litellm_params"]["model"], - "model_info": deployment.get("model_info", {}), - "api_base": deployment.get("litellm_params", {}).get("api_base"), + "deployment": deployment_model_name, + "model_info": model_info, + "api_base": deployment_api_base, } ) - kwargs["model_info"] = deployment.get("model_info", {}) + kwargs["model_info"] = model_info + kwargs["timeout"] = self._get_timeout( kwargs=kwargs, data=deployment["litellm_params"] ) @@ -3601,6 +3646,7 @@ class Router: - True if the deployment should be put in cooldown - False if the deployment should not be put in cooldown """ + verbose_router_logger.debug("Router: Entering 'deployment_callback_on_failure'") try: exception = kwargs.get("exception", None) exception_status = getattr(exception, "status_code", "") @@ -3642,6 +3688,9 @@ class Router: return result else: + verbose_router_logger.debug( + "Router: Exiting 'deployment_callback_on_failure' without cooldown. No model_info found." + ) return False except Exception as e: diff --git a/litellm/router_utils/clientside_credential_handler.py b/litellm/router_utils/clientside_credential_handler.py new file mode 100644 index 0000000000..c98f614335 --- /dev/null +++ b/litellm/router_utils/clientside_credential_handler.py @@ -0,0 +1,37 @@ +""" +Utils for handling clientside credentials + +Supported clientside credentials: +- api_key +- api_base +- base_url + +If given, generate a unique model_id for the deployment. + +Ensures cooldowns are applied correctly. +""" + +clientside_credential_keys = ["api_key", "api_base", "base_url"] + + +def is_clientside_credential(request_kwargs: dict) -> bool: + """ + Check if the credential is a clientside credential. + """ + return any(key in request_kwargs for key in clientside_credential_keys) + + +def get_dynamic_litellm_params(litellm_params: dict, request_kwargs: dict) -> dict: + """ + Generate a unique model_id for the deployment. + + Returns + - litellm_params: dict + + for generating a unique model_id. + """ + # update litellm_params with clientside credentials + for key in clientside_credential_keys: + if key in request_kwargs: + litellm_params[key] = request_kwargs[key] + return litellm_params diff --git a/litellm/router_utils/cooldown_handlers.py b/litellm/router_utils/cooldown_handlers.py index 64a6d56a77..52babc27f2 100644 --- a/litellm/router_utils/cooldown_handlers.py +++ b/litellm/router_utils/cooldown_handlers.py @@ -112,12 +112,19 @@ def _should_run_cooldown_logic( deployment is None or litellm_router_instance.get_model_group(id=deployment) is None ): + verbose_router_logger.debug( + "Should Not Run Cooldown Logic: deployment id is none or model group can't be found." 
+        )
         return False
 
     if litellm_router_instance.disable_cooldowns:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: disable_cooldowns is True"
+        )
         return False
 
     if deployment is None:
+        verbose_router_logger.debug("Should Not Run Cooldown Logic: deployment is None")
         return False
 
     if not _is_cooldown_required(
@@ -126,9 +133,15 @@ def _should_run_cooldown_logic(
         exception_status=exception_status,
         exception_str=str(original_exception),
     ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: _is_cooldown_required returned False"
+        )
         return False
 
     if deployment in litellm_router_instance.provider_default_deployment_ids:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: deployment is in provider_default_deployment_ids"
+        )
         return False
 
     return True
@@ -244,6 +257,8 @@ def _set_cooldown_deployments(
     - True if the deployment should be put in cooldown
     - False if the deployment should not be put in cooldown
     """
+    verbose_router_logger.debug("checks 'should_run_cooldown_logic'")
+
     if (
         _should_run_cooldown_logic(
             litellm_router_instance, deployment, exception_status, original_exception
         )
         is False
         or deployment is None
     ):
+        verbose_router_logger.debug("should_run_cooldown_logic returned False")
         return False
 
     exception_status_int = cast_exception_status_to_int(exception_status)
diff --git a/tests/litellm/proxy/test_route_llm_request.py b/tests/litellm/proxy/test_route_llm_request.py
new file mode 100644
index 0000000000..6e8fedf0cb
--- /dev/null
+++ b/tests/litellm/proxy/test_route_llm_request.py
@@ -0,0 +1,46 @@
+import json
+import os
+import sys
+
+import pytest
+from fastapi.testclient import TestClient
+
+sys.path.insert(
+    0, os.path.abspath("../../..")
+)  # Adds the parent directory to the system path
+
+
+from unittest.mock import MagicMock
+
+from litellm.proxy.route_llm_request import route_request
+
+
+@pytest.mark.parametrize(
+    "route_type",
+    [
+        "atext_completion",
+        "acompletion",
+        "aembedding",
+        "aimage_generation",
+        "aspeech",
+        "atranscription",
+        "amoderation",
+        "arerank",
+    ],
+)
+@pytest.mark.asyncio
+async def test_route_request_dynamic_credentials(route_type):
+    data = {
+        "model": "openai/gpt-4o-mini-2024-07-18",
+        "api_key": "my-bad-key",
+        "api_base": "https://api.openai.com/v1 ",
+    }
+    llm_router = MagicMock()
+    # Ensure that the dynamic method exists on the llm_router mock.
+    getattr(llm_router, route_type).return_value = "fake_response"
+
+    response = await route_request(data, llm_router, None, route_type)
+    # Optionally verify the response if needed:
+    assert response == "fake_response"
+    # Now assert that the dynamic method was called once with the expected kwargs.
+    getattr(llm_router, route_type).assert_called_once_with(**data)
diff --git a/tests/local_testing/test_router.py b/tests/local_testing/test_router.py
index 698f779b78..0f690a94f2 100644
--- a/tests/local_testing/test_router.py
+++ b/tests/local_testing/test_router.py
@@ -2777,3 +2777,46 @@ def test_router_get_model_list_from_model_alias():
         model_name="gpt-3.5-turbo"
     )
     assert len(model_alias_list) == 0
+
+
+def test_router_dynamic_credentials():
+    """
+    Assert model id for dynamic api key 1 != model id for dynamic api key 2
+    """
+    original_model_id = "123"
+    original_api_key = "my-bad-key"
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "openai/gpt-3.5-turbo",
+                    "api_key": original_api_key,
+                    "mock_response": "fake_response",
+                },
+                "model_info": {"id": original_model_id},
+            }
+        ]
+    )
+
+    deployment = router.get_deployment(model_id=original_model_id)
+    assert deployment is not None
+    assert deployment.litellm_params.api_key == original_api_key
+
+    response = router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "hi"}],
+        api_key="my-bad-key-2",
+    )
+
+    response_2 = router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "hi"}],
+        api_key="my-bad-key-3",
+    )
+
+    assert response_2._hidden_params["model_id"] != response._hidden_params["model_id"]
+
+    deployment = router.get_deployment(model_id=original_model_id)
+    assert deployment is not None
+    assert deployment.litellm_params.api_key == original_api_key
diff --git a/tests/local_testing/test_router_cooldowns.py b/tests/local_testing/test_router_cooldowns.py
index 38d4133afd..80ceb33c01 100644
--- a/tests/local_testing/test_router_cooldowns.py
+++ b/tests/local_testing/test_router_cooldowns.py
@@ -692,3 +692,50 @@ def test_router_fallbacks_with_cooldowns_and_model_id():
         model="gpt-3.5-turbo",
         messages=[{"role": "user", "content": "hi"}],
     )
+
+
+@pytest.mark.asyncio()
+async def test_router_fallbacks_with_cooldowns_and_dynamic_credentials():
+    """
+    Ensure cooldown on credential 1 does not affect credential 2
+    """
+    from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
+
+    litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1},
+                "model_info": {
+                    "id": "123",
+                },
+            }
+        ]
+    )
+
+    ## trigger ratelimit
+    try:
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "hi"}],
+            api_key="my-bad-key-1",
+            mock_response="litellm.RateLimitError",
+        )
+        pytest.fail("Expected RateLimitError")
+    except litellm.RateLimitError:
+        pass
+
+    await asyncio.sleep(1)
+
+    cooldown_list = await _async_get_cooldown_deployments(
+        litellm_router_instance=router, parent_otel_span=None
+    )
+    print("cooldown_list: ", cooldown_list)
+    assert len(cooldown_list) == 1
+
+    await router.acompletion(
+        model="gpt-3.5-turbo",
+        api_key=os.getenv("OPENAI_API_KEY"),
+        messages=[{"role": "user", "content": "hi"}],
+    )
diff --git a/tests/local_testing/test_router_utils.py b/tests/local_testing/test_router_utils.py
index 7de9707579..7c2bbdc2a1 100644
--- a/tests/local_testing/test_router_utils.py
+++ b/tests/local_testing/test_router_utils.py
@@ -396,3 +396,25 @@ def test_router_redis_cache():
     router._update_redis_cache(cache=redis_cache)
 
     assert router.cache.redis_cache == redis_cache
+
+
+def test_router_handle_clientside_credential():
+    deployment = {
+        "model_name": "gemini/*",
+        "litellm_params": {"model": "gemini/*"},
{"model": "gemini/*"}, + "model_info": { + "id": "1", + }, + } + router = Router(model_list=[deployment]) + + new_deployment = router._handle_clientside_credential( + deployment=deployment, + kwargs={ + "api_key": "123", + "metadata": {"model_group": "gemini/gemini-1.5-flash"}, + }, + ) + + assert new_deployment.litellm_params.api_key == "123" + assert len(router.get_model_list()) == 2