Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-26 03:04:13 +00:00
fix(route_llm_request.py): move to using common router, even for client-side credentials (#8966)
* fix(route_llm_request.py): move to using common router, even for client-side credentials
  ensures fallbacks / cooldown logic still works
* test(test_route_llm_request.py): add unit test for route request
* feat(router.py): generate unique model id when clientside credential passed in
  Prevents cooldowns for api key 1 from impacting api key 2
* test(test_router.py): update testing to ensure original litellm params not mutated
* fix(router.py): upsert clientside call into llm router model list
  enables cooldown logic to work accurately
* fix: fix linting error
* test(test_router_utils.py): add direct test for new util on router
parent bd2231400f
commit ae6f91a56d
9 changed files with 273 additions and 36 deletions
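In short, when a request carries its own credentials (api_key / api_base / base_url), the router now registers that credential set as its own deployment with a unique model id, so cooling down one credential no longer cools down the others in the same model group. A minimal sketch of the resulting behavior (not part of the diff; it mirrors the new test_router_dynamic_credentials test below, and mock_response avoids real provider calls):

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "openai/gpt-3.5-turbo",
                "api_key": "my-bad-key",           # config-side credential
                "mock_response": "fake_response",  # avoid real provider calls
            },
            "model_info": {"id": "123"},
        }
    ]
)

# Two requests with different client-side keys now resolve to different model ids,
# so a cooldown triggered by one key does not put the other key in cooldown.
response_1 = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    api_key="my-bad-key-2",
)
response_2 = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    api_key="my-bad-key-3",
)
assert response_1._hidden_params["model_id"] != response_2._hidden_params["model_id"]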
@@ -1,34 +1,11 @@
 model_list:
-  - model_name: claude-3.7
-    litellm_params:
-      model: openai/gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-      api_base: http://0.0.0.0:8090
-  - model_name: deepseek-r1
-    litellm_params:
-      model: bedrock/deepseek_r1/arn:aws:bedrock:us-west-2:888602223428:imported-model/bnnr6463ejgf
-  - model_name: deepseek-r1-api
-    litellm_params:
-      model: deepseek/deepseek-reasoner
-  - model_name: cohere.embed-english-v3
-    litellm_params:
-      model: bedrock/cohere.embed-english-v3
-      api_key: os.environ/COHERE_API_KEY
-  - model_name: bedrock-claude-3-7
-    litellm_params:
-      model: bedrock/invoke/us.anthropic.claude-3-7-sonnet-20250219-v1:0
-  - model_name: bedrock-claude-3-5-sonnet
-    litellm_params:
-      model: bedrock/invoke/us.anthropic.claude-3-5-sonnet-20240620-v1:0
-  - model_name: bedrock-nova
-    litellm_params:
-      model: bedrock/us.amazon.nova-pro-v1:0
-  - model_name: gpt-4o
-    litellm_params:
-      model: openai/gpt-4o
-
-litellm_settings:
-  cache: true
-  cache_params: # set cache params for redis
-    type: redis
-    namespace: "litellm.caching"
+  - model_name: openai-gpt-4o-mini-2024-07-18
+    litellm_params:
+      model: openai/gpt-4o-mini-2024-07-18
+      configurable_clientside_auth_params: ["api_key"]
+      api_key: "my-bad-key"
+  # - model_name: openai-fallback-model
+  #   litellm_params:
+  #     model: openai/gpt-3.5-turbo
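For context (not part of the diff): with configurable_clientside_auth_params: ["api_key"] set, a caller may supply its own api_key in the request body, which is exactly what route_request checks for in the next hunk. A rough sketch, assuming the proxy from the config above is running locally on port 4000 behind a placeholder proxy key (both assumptions, not from the commit):

import requests

resp = requests.post(
    "http://localhost:4000/chat/completions",     # assumed proxy address
    headers={"Authorization": "Bearer sk-1234"},  # assumed proxy key
    json={
        "model": "openai-gpt-4o-mini-2024-07-18",
        "messages": [{"role": "user", "content": "hi"}],
        # client-side credential; overrides the config's "my-bad-key" for this request
        "api_key": "sk-real-openai-key",  # placeholder
    },
    timeout=30,
)
print(resp.status_code, resp.json())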
@@ -53,7 +53,7 @@ async def route_request(
     """
     router_model_names = llm_router.model_names if llm_router is not None else []
     if "api_key" in data or "api_base" in data:
-        return getattr(litellm, f"{route_type}")(**data)
+        return getattr(llm_router, f"{route_type}")(**data)
 
     elif "user_config" in data:
         router_config = data.pop("user_config")
@@ -67,6 +67,10 @@ from litellm.router_utils.batch_utils import (
     replace_model_in_jsonl,
 )
 from litellm.router_utils.client_initalization_utils import InitalizeOpenAISDKClient
+from litellm.router_utils.clientside_credential_handler import (
+    get_dynamic_litellm_params,
+    is_clientside_credential,
+)
 from litellm.router_utils.cooldown_cache import CooldownCache
 from litellm.router_utils.cooldown_handlers import (
     DEFAULT_COOLDOWN_TIME_SECONDS,
@@ -1067,20 +1071,61 @@ class Router:
             elif k == "metadata":
                 kwargs[k].update(v)
 
+    def _handle_clientside_credential(
+        self, deployment: dict, kwargs: dict
+    ) -> Deployment:
+        """
+        Handle clientside credential
+        """
+        model_info = deployment.get("model_info", {}).copy()
+        litellm_params = deployment["litellm_params"].copy()
+        dynamic_litellm_params = get_dynamic_litellm_params(
+            litellm_params=litellm_params, request_kwargs=kwargs
+        )
+        metadata = kwargs.get("metadata", {})
+        model_group = cast(str, metadata.get("model_group"))
+        _model_id = self._generate_model_id(
+            model_group=model_group, litellm_params=dynamic_litellm_params
+        )
+        original_model_id = model_info.get("id")
+        model_info["id"] = _model_id
+        model_info["original_model_id"] = original_model_id
+        deployment_pydantic_obj = Deployment(
+            model_name=model_group,
+            litellm_params=LiteLLM_Params(**dynamic_litellm_params),
+            model_info=model_info,
+        )
+        self.upsert_deployment(
+            deployment=deployment_pydantic_obj
+        )  # add new deployment to router
+        return deployment_pydantic_obj
+
     def _update_kwargs_with_deployment(self, deployment: dict, kwargs: dict) -> None:
         """
         2 jobs:
         - Adds selected deployment, model_info and api_base to kwargs["metadata"] (used for logging)
         - Adds default litellm params to kwargs, if set.
         """
+        model_info = deployment.get("model_info", {}).copy()
+        deployment_model_name = deployment["litellm_params"]["model"]
+        deployment_api_base = deployment["litellm_params"].get("api_base")
+        if is_clientside_credential(request_kwargs=kwargs):
+            deployment_pydantic_obj = self._handle_clientside_credential(
+                deployment=deployment, kwargs=kwargs
+            )
+            model_info = deployment_pydantic_obj.model_info.model_dump()
+            deployment_model_name = deployment_pydantic_obj.litellm_params.model
+            deployment_api_base = deployment_pydantic_obj.litellm_params.api_base
+
         kwargs.setdefault("metadata", {}).update(
             {
-                "deployment": deployment["litellm_params"]["model"],
-                "model_info": deployment.get("model_info", {}),
-                "api_base": deployment.get("litellm_params", {}).get("api_base"),
+                "deployment": deployment_model_name,
+                "model_info": model_info,
+                "api_base": deployment_api_base,
             }
         )
-        kwargs["model_info"] = deployment.get("model_info", {})
+        kwargs["model_info"] = model_info
 
         kwargs["timeout"] = self._get_timeout(
             kwargs=kwargs, data=deployment["litellm_params"]
         )
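Why the new deployment gets a different id: _generate_model_id is called with litellm_params that now include the client-supplied credential, so two different keys yield two different ids and therefore separate cooldown entries. The snippet below is only an illustration of that idea under an assumed hashing scheme, not litellm's actual id function:

import hashlib
import json

def illustrative_model_id(model_group: str, litellm_params: dict) -> str:
    # Illustration only: a stable digest over model_group + litellm_params.
    payload = json.dumps({"model_group": model_group, **litellm_params}, sort_keys=True)
    return hashlib.sha256(payload.encode()).hexdigest()

id_for_key_1 = illustrative_model_id(
    "gpt-3.5-turbo", {"model": "openai/gpt-3.5-turbo", "api_key": "key-1"}
)
id_for_key_2 = illustrative_model_id(
    "gpt-3.5-turbo", {"model": "openai/gpt-3.5-turbo", "api_key": "key-2"}
)
assert id_for_key_1 != id_for_key_2  # different credentials -> different deployment ids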
@@ -3601,6 +3646,7 @@ class Router:
         - True if the deployment should be put in cooldown
         - False if the deployment should not be put in cooldown
         """
+        verbose_router_logger.debug("Router: Entering 'deployment_callback_on_failure'")
         try:
             exception = kwargs.get("exception", None)
             exception_status = getattr(exception, "status_code", "")
@@ -3642,6 +3688,9 @@ class Router:
 
                 return result
             else:
+                verbose_router_logger.debug(
+                    "Router: Exiting 'deployment_callback_on_failure' without cooldown. No model_info found."
+                )
                 return False
 
         except Exception as e:
litellm/router_utils/clientside_credential_handler.py (new normal file, 37 lines)

@@ -0,0 +1,37 @@
"""
Utils for handling clientside credentials

Supported clientside credentials:
- api_key
- api_base
- base_url

If given, generate a unique model_id for the deployment.

Ensures cooldowns are applied correctly.
"""

clientside_credential_keys = ["api_key", "api_base", "base_url"]


def is_clientside_credential(request_kwargs: dict) -> bool:
    """
    Check if the credential is a clientside credential.
    """
    return any(key in request_kwargs for key in clientside_credential_keys)


def get_dynamic_litellm_params(litellm_params: dict, request_kwargs: dict) -> dict:
    """
    Generate a unique model_id for the deployment.

    Returns
    - litellm_params: dict

    for generating a unique model_id.
    """
    # update litellm_params with clientside credentials
    for key in clientside_credential_keys:
        if key in request_kwargs:
            litellm_params[key] = request_kwargs[key]
    return litellm_params
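A short usage sketch for the two helpers (not part of the commit). Note that the router passes a copy of the deployment's litellm_params into get_dynamic_litellm_params, which is how the original config params stay unmutated:

from litellm.router_utils.clientside_credential_handler import (
    get_dynamic_litellm_params,
    is_clientside_credential,
)

request_kwargs = {
    "api_key": "sk-user-supplied",  # placeholder credential
    "metadata": {"model_group": "gpt-3.5-turbo"},
}
assert is_clientside_credential(request_kwargs=request_kwargs) is True

config_params = {"model": "openai/gpt-3.5-turbo", "api_key": "os.environ/OPENAI_API_KEY"}
dynamic_params = get_dynamic_litellm_params(
    litellm_params=config_params.copy(), request_kwargs=request_kwargs
)
assert dynamic_params["api_key"] == "sk-user-supplied"
assert config_params["api_key"] == "os.environ/OPENAI_API_KEY"  # original left untouched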
@@ -112,12 +112,19 @@ def _should_run_cooldown_logic(
         deployment is None
         or litellm_router_instance.get_model_group(id=deployment) is None
     ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: deployment id is none or model group can't be found."
+        )
         return False
 
     if litellm_router_instance.disable_cooldowns:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: disable_cooldowns is True"
+        )
         return False
 
     if deployment is None:
+        verbose_router_logger.debug("Should Not Run Cooldown Logic: deployment is None")
         return False
 
     if not _is_cooldown_required(
@@ -126,9 +133,15 @@ def _should_run_cooldown_logic(
         exception_status=exception_status,
         exception_str=str(original_exception),
     ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: _is_cooldown_required returned False"
+        )
         return False
 
     if deployment in litellm_router_instance.provider_default_deployment_ids:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: deployment is in provider_default_deployment_ids"
+        )
         return False
 
     return True
@@ -244,6 +257,8 @@ def _set_cooldown_deployments(
     - True if the deployment should be put in cooldown
     - False if the deployment should not be put in cooldown
     """
+    verbose_router_logger.debug("checks 'should_run_cooldown_logic'")
+
     if (
         _should_run_cooldown_logic(
             litellm_router_instance, deployment, exception_status, original_exception
@@ -251,6 +266,7 @@ def _set_cooldown_deployments(
         is False
         or deployment is None
     ):
+        verbose_router_logger.debug("should_run_cooldown_logic returned False")
         return False
 
     exception_status_int = cast_exception_status_to_int(exception_status)
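These messages are only emitted at debug level. To see them while reproducing a cooldown, enable litellm's debug logging the same way the new cooldown test further down does:

import litellm

litellm._turn_on_debug()  # enables verbose router logs; the new cooldown test in this commit calls this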
tests/litellm/proxy/test_route_llm_request.py (new normal file, 46 lines)

@@ -0,0 +1,46 @@
import json
import os
import sys

import pytest
from fastapi.testclient import TestClient

sys.path.insert(
    0, os.path.abspath("../../..")
)  # Adds the parent directory to the system path


from unittest.mock import MagicMock

from litellm.proxy.route_llm_request import route_request


@pytest.mark.parametrize(
    "route_type",
    [
        "atext_completion",
        "acompletion",
        "aembedding",
        "aimage_generation",
        "aspeech",
        "atranscription",
        "amoderation",
        "arerank",
    ],
)
@pytest.mark.asyncio
async def test_route_request_dynamic_credentials(route_type):
    data = {
        "model": "openai/gpt-4o-mini-2024-07-18",
        "api_key": "my-bad-key",
        "api_base": "https://api.openai.com/v1 ",
    }
    llm_router = MagicMock()
    # Ensure that the dynamic method exists on the llm_router mock.
    getattr(llm_router, route_type).return_value = "fake_response"

    response = await route_request(data, llm_router, None, route_type)
    # Optionally verify the response if needed:
    assert response == "fake_response"
    # Now assert that the dynamic method was called once with the expected kwargs.
    getattr(llm_router, route_type).assert_called_once_with(**data)
@@ -2777,3 +2777,46 @@ def test_router_get_model_list_from_model_alias():
         model_name="gpt-3.5-turbo"
     )
     assert len(model_alias_list) == 0
+
+
+def test_router_dynamic_credentials():
+    """
+    Assert model id for dynamic api key 1 != model id for dynamic api key 2
+    """
+    original_model_id = "123"
+    original_api_key = "my-bad-key"
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "openai/gpt-3.5-turbo",
+                    "api_key": original_api_key,
+                    "mock_response": "fake_response",
+                },
+                "model_info": {"id": original_model_id},
+            }
+        ]
+    )
+
+    deployment = router.get_deployment(model_id=original_model_id)
+    assert deployment is not None
+    assert deployment.litellm_params.api_key == original_api_key
+
+    response = router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "hi"}],
+        api_key="my-bad-key-2",
+    )
+
+    response_2 = router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "hi"}],
+        api_key="my-bad-key-3",
+    )
+
+    assert response_2._hidden_params["model_id"] != response._hidden_params["model_id"]
+
+    deployment = router.get_deployment(model_id=original_model_id)
+    assert deployment is not None
+    assert deployment.litellm_params.api_key == original_api_key
@@ -692,3 +692,50 @@ def test_router_fallbacks_with_cooldowns_and_model_id():
         model="gpt-3.5-turbo",
         messages=[{"role": "user", "content": "hi"}],
     )
+
+
+@pytest.mark.asyncio()
+async def test_router_fallbacks_with_cooldowns_and_dynamic_credentials():
+    """
+    Ensure cooldown on credential 1 does not affect credential 2
+    """
+    from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
+
+    litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1},
+                "model_info": {
+                    "id": "123",
+                },
+            }
+        ]
+    )
+
+    ## trigger ratelimit
+    try:
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "hi"}],
+            api_key="my-bad-key-1",
+            mock_response="litellm.RateLimitError",
+        )
+        pytest.fail("Expected RateLimitError")
+    except litellm.RateLimitError:
+        pass
+
+    await asyncio.sleep(1)
+
+    cooldown_list = await _async_get_cooldown_deployments(
+        litellm_router_instance=router, parent_otel_span=None
+    )
+    print("cooldown_list: ", cooldown_list)
+    assert len(cooldown_list) == 1
+
+    await router.acompletion(
+        model="gpt-3.5-turbo",
+        api_key=os.getenv("OPENAI_API_KEY"),
+        messages=[{"role": "user", "content": "hi"}],
+    )
@@ -396,3 +396,25 @@ def test_router_redis_cache():
     router._update_redis_cache(cache=redis_cache)
 
     assert router.cache.redis_cache == redis_cache
+
+
+def test_router_handle_clientside_credential():
+    deployment = {
+        "model_name": "gemini/*",
+        "litellm_params": {"model": "gemini/*"},
+        "model_info": {
+            "id": "1",
+        },
+    }
+    router = Router(model_list=[deployment])
+
+    new_deployment = router._handle_clientside_credential(
+        deployment=deployment,
+        kwargs={
+            "api_key": "123",
+            "metadata": {"model_group": "gemini/gemini-1.5-flash"},
+        },
+    )
+
+    assert new_deployment.litellm_params.api_key == "123"
+    assert len(router.get_model_list()) == 2