mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-27 03:34:10 +00:00
(feat proxy) v2 - model max budgets (#7302)
* clean up unused code * add _PROXY_VirtualKeyModelMaxBudgetLimiter * adjust type imports * working _PROXY_VirtualKeyModelMaxBudgetLimiter * fix user_api_key_model_max_budget * fix user_api_key_model_max_budget * update naming * update naming * fix changes to RouterBudgetLimiting * test_call_with_key_over_model_budget * test_call_with_key_over_model_budget * handle _get_request_model_budget_config * e2e test for test_call_with_key_over_model_budget * clean up test * run ci/cd again * add validate_model_max_budget * docs fix * update doc * add e2e testing for _PROXY_VirtualKeyModelMaxBudgetLimiter * test_unit_test_max_model_budget_limiter.py
This commit is contained in:
parent
1a4910f6c0
commit
6220e17ebf
14 changed files with 628 additions and 261 deletions
|
@ -40,7 +40,11 @@ from litellm.proxy.utils import (
|
|||
handle_exception_on_proxy,
|
||||
)
|
||||
from litellm.secret_managers.main import get_secret
|
||||
from litellm.types.utils import PersonalUIKeyGenerationConfig, TeamUIKeyGenerationConfig
|
||||
from litellm.types.utils import (
|
||||
GenericBudgetInfo,
|
||||
PersonalUIKeyGenerationConfig,
|
||||
TeamUIKeyGenerationConfig,
|
||||
)
|
||||
|
||||
|
||||
def _is_team_key(data: GenerateKeyRequest):
|
||||
|
@ -246,7 +250,7 @@ async def generate_key_fn( # noqa: PLR0915
|
|||
- metadata: Optional[dict] - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||
- guardrails: Optional[List[str]] - List of active guardrails for the key
|
||||
- permissions: Optional[dict] - key-specific permissions. Currently just used for turning off pii masking (if connected). Example - {"pii": false}
|
||||
- model_max_budget: Optional[dict] - key-specific model budget in USD. Example - {"text-davinci-002": 0.5, "gpt-3.5-turbo": 0.5}. IF null or {} then no model specific budget.
|
||||
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}. IF null or {} then no model specific budget.
|
||||
- model_rpm_limit: Optional[dict] - key-specific model rpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific rpm limit.
|
||||
- model_tpm_limit: Optional[dict] - key-specific model tpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific tpm limit.
|
||||
- allowed_cache_controls: Optional[list] - List of allowed cache control values. Example - ["no-cache", "no-store"]. See all values - https://docs.litellm.ai/docs/proxy/caching#turn-on--off-caching-per-request
|
||||
|
@ -515,6 +519,10 @@ def prepare_key_update_data(
|
|||
|
||||
_metadata = existing_key_row.metadata or {}
|
||||
|
||||
# validate model_max_budget
|
||||
if "model_max_budget" in non_default_values:
|
||||
validate_model_max_budget(non_default_values["model_max_budget"])
|
||||
|
||||
non_default_values = prepare_metadata_fields(
|
||||
data=data, non_default_values=non_default_values, existing_metadata=_metadata
|
||||
)
|
||||
|
@ -548,7 +556,7 @@ async def update_key_fn(
|
|||
- enforced_params: Optional[List[str]] - List of enforced params for the key (Enterprise only). [Docs](https://docs.litellm.ai/docs/proxy/enterprise#enforce-required-params-for-llm-requests)
|
||||
- spend: Optional[float] - Amount spent by key
|
||||
- max_budget: Optional[float] - Max budget for key
|
||||
- model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
|
||||
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
|
||||
- budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
|
||||
- soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
|
||||
- max_parallel_requests: Optional[int] - Rate limit for parallel requests
|
||||
|
@ -1035,6 +1043,7 @@ async def generate_key_helper_fn( # noqa: PLR0915
|
|||
metadata["guardrails"] = guardrails
|
||||
|
||||
metadata_json = json.dumps(metadata)
|
||||
validate_model_max_budget(model_max_budget)
|
||||
model_max_budget_json = json.dumps(model_max_budget)
|
||||
user_role = user_role
|
||||
tpm_limit = tpm_limit
|
||||
|
@ -1266,7 +1275,7 @@ async def regenerate_key_fn(
|
|||
- tags: Optional[List[str]] - Tags for organizing keys (Enterprise only)
|
||||
- spend: Optional[float] - Amount spent by key
|
||||
- max_budget: Optional[float] - Max budget for key
|
||||
- model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
|
||||
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
|
||||
- budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
|
||||
- soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
|
||||
- max_parallel_requests: Optional[int] - Rate limit for parallel requests
|
||||
|
@ -1293,8 +1302,7 @@ async def regenerate_key_fn(
|
|||
--data-raw '{
|
||||
"max_budget": 100,
|
||||
"metadata": {"team": "core-infra"},
|
||||
"models": ["gpt-4", "gpt-3.5-turbo"],
|
||||
"model_max_budget": {"gpt-4": 50, "gpt-3.5-turbo": 50}
|
||||
"models": ["gpt-4", "gpt-3.5-turbo"]
|
||||
}'
|
||||
```
|
||||
|
||||
|
@ -1949,3 +1957,29 @@ async def _enforce_unique_key_alias(
|
|||
param="key_alias",
|
||||
code=status.HTTP_400_BAD_REQUEST,
|
||||
)
|
||||
|
||||
|
||||
def validate_model_max_budget(model_max_budget: Optional[Dict]) -> None:
    """
    Validate that `model_max_budget` is a valid GenericBudgetConfigType.

    Expected shape:
        {"<model-name>": {"budget_limit": <float>, "time_period": "<duration>"}}

    Args:
        model_max_budget: Mapping of model name -> budget config dict.
            May be None or empty, in which case nothing is validated.

    Raises:
        ValueError: If model_max_budget is not a valid GenericBudgetConfigType.
    """
    # Nothing to validate for a missing or empty config.
    if not model_max_budget:
        return
    try:
        for _model, _budget_info in model_max_budget.items():
            # Explicit check instead of `assert` so validation still runs
            # when Python is invoked with -O (asserts are stripped).
            if not isinstance(_model, str):
                raise TypeError(
                    f"model name must be a string, got {type(_model).__name__}"
                )

            # /CRUD endpoints can pass budget_limit as a string, so we need to convert it to a float
            if "budget_limit" in _budget_info:
                _budget_info["budget_limit"] = float(_budget_info["budget_limit"])
            GenericBudgetInfo(**_budget_info)
    except Exception as e:
        raise ValueError(
            f"Invalid model_max_budget: {str(e)}. Example of valid model_max_budget: https://docs.litellm.ai/docs/proxy/users"
        )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue