forked from phoenix/litellm-mirror
LiteLLM Minor Fixes & Improvements (11/26/2024) (#6913)
* docs(config_settings.md): document all router_settings * ci(config.yml): add router_settings doc test to ci/cd * test: debug test on ci/cd * test: debug ci/cd test * test: fix test * fix(team_endpoints.py): skip invalid team object. don't fail `/team/list` call Causes downstream errors if ui just fails to load team list * test(base_llm_unit_tests.py): add 'response_format={"type": "text"}' test to base_llm_unit_tests adds complete coverage for all 'response_format' values to ci/cd * feat(router.py): support wildcard routes in `get_router_model_info()` Addresses https://github.com/BerriAI/litellm/issues/6914 * build(model_prices_and_context_window.json): add tpm/rpm limits for all gemini models Allows for ratelimit tracking for gemini models even with wildcard routing enabled Addresses https://github.com/BerriAI/litellm/issues/6914 * feat(router.py): add tpm/rpm tracking on success/failure to global_router Addresses https://github.com/BerriAI/litellm/issues/6914 * feat(router.py): support wildcard routes on router.get_model_group_usage() * fix(router.py): fix linting error * fix(router.py): implement get_remaining_tokens_and_requests Addresses https://github.com/BerriAI/litellm/issues/6914 * fix(router.py): fix linting errors * test: fix test * test: fix tests * docs(config_settings.md): add missing dd env vars to docs * fix(router.py): check if hidden params is dict
This commit is contained in:
parent
5d13302e6b
commit
2d2931a215
22 changed files with 878 additions and 131 deletions
|
@ -174,3 +174,185 @@ async def test_update_kwargs_before_fallbacks(call_type):
|
|||
|
||||
print(mock_client.call_args.kwargs)
|
||||
assert mock_client.call_args.kwargs["litellm_trace_id"] is not None
|
||||
|
||||
|
||||
def test_router_get_model_info_wildcard_routes():
|
||||
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
|
||||
litellm.model_cost = litellm.get_model_cost_map(url="")
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
]
|
||||
)
|
||||
model_info = router.get_router_model_info(
|
||||
deployment=None, received_model_name="gemini/gemini-1.5-flash", id="1"
|
||||
)
|
||||
print(model_info)
|
||||
assert model_info is not None
|
||||
assert model_info["tpm"] is not None
|
||||
assert model_info["rpm"] is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_router_get_model_group_usage_wildcard_routes():
|
||||
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
|
||||
litellm.model_cost = litellm.get_model_cost_map(url="")
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
resp = await router.acompletion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
mock_response="Hello, I'm good.",
|
||||
)
|
||||
print(resp)
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
tpm, rpm = await router.get_model_group_usage(model_group="gemini/gemini-1.5-flash")
|
||||
|
||||
assert tpm is not None, "tpm is None"
|
||||
assert rpm is not None, "rpm is None"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_router_callbacks_on_success():
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
with patch.object(
|
||||
router.cache, "async_increment_cache", new=AsyncMock()
|
||||
) as mock_callback:
|
||||
await router.acompletion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
mock_response="Hello, I'm good.",
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
assert mock_callback.call_count == 2
|
||||
|
||||
assert (
|
||||
mock_callback.call_args_list[0]
|
||||
.kwargs["key"]
|
||||
.startswith("global_router:1:gemini/gemini-1.5-flash:tpm")
|
||||
)
|
||||
assert (
|
||||
mock_callback.call_args_list[1]
|
||||
.kwargs["key"]
|
||||
.startswith("global_router:1:gemini/gemini-1.5-flash:rpm")
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_router_callbacks_on_failure():
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
with patch.object(
|
||||
router.cache, "async_increment_cache", new=AsyncMock()
|
||||
) as mock_callback:
|
||||
with pytest.raises(litellm.RateLimitError):
|
||||
await router.acompletion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
mock_response="litellm.RateLimitError",
|
||||
num_retries=0,
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
print(mock_callback.call_args_list)
|
||||
assert mock_callback.call_count == 1
|
||||
|
||||
assert (
|
||||
mock_callback.call_args_list[0]
|
||||
.kwargs["key"]
|
||||
.startswith("global_router:1:gemini/gemini-1.5-flash:rpm")
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_router_model_group_headers():
|
||||
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
|
||||
litellm.model_cost = litellm.get_model_cost_map(url="")
|
||||
from litellm.types.utils import OPENAI_RESPONSE_HEADERS
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
for _ in range(2):
|
||||
resp = await router.acompletion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
mock_response="Hello, I'm good.",
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
assert (
|
||||
resp._hidden_params["additional_headers"]["x-litellm-model-group"]
|
||||
== "gemini/gemini-1.5-flash"
|
||||
)
|
||||
|
||||
assert "x-ratelimit-remaining-requests" in resp._hidden_params["additional_headers"]
|
||||
assert "x-ratelimit-remaining-tokens" in resp._hidden_params["additional_headers"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_remaining_model_group_usage():
|
||||
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
|
||||
litellm.model_cost = litellm.get_model_cost_map(url="")
|
||||
from litellm.types.utils import OPENAI_RESPONSE_HEADERS
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gemini/*",
|
||||
"litellm_params": {"model": "gemini/*"},
|
||||
"model_info": {"id": 1},
|
||||
}
|
||||
]
|
||||
)
|
||||
for _ in range(2):
|
||||
await router.acompletion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
mock_response="Hello, I'm good.",
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
remaining_usage = await router.get_remaining_model_group_usage(
|
||||
model_group="gemini/gemini-1.5-flash"
|
||||
)
|
||||
assert remaining_usage is not None
|
||||
assert "x-ratelimit-remaining-requests" in remaining_usage
|
||||
assert "x-ratelimit-remaining-tokens" in remaining_usage
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue