(feat proxy) v2 - model max budgets (#7302)

* clean up unused code

* add _PROXY_VirtualKeyModelMaxBudgetLimiter

* adjust type imports

* working _PROXY_VirtualKeyModelMaxBudgetLimiter

* fix user_api_key_model_max_budget

* fix user_api_key_model_max_budget

* update naming

* update naming

* fix changes to RouterBudgetLimiting

* test_call_with_key_over_model_budget

* test_call_with_key_over_model_budget

* handle _get_request_model_budget_config

* e2e test for test_call_with_key_over_model_budget

* clean up test

* run ci/cd again

* add validate_model_max_budget

* docs fix

* update doc

* add e2e testing for _PROXY_VirtualKeyModelMaxBudgetLimiter

* test_unit_test_max_model_budget_limiter.py
This commit is contained in:
Ishaan Jaff 2024-12-18 19:42:46 -08:00 committed by GitHub
parent 1a4910f6c0
commit 6220e17ebf
14 changed files with 628 additions and 261 deletions

View file

@ -40,7 +40,11 @@ from litellm.proxy.utils import (
handle_exception_on_proxy,
)
from litellm.secret_managers.main import get_secret
from litellm.types.utils import PersonalUIKeyGenerationConfig, TeamUIKeyGenerationConfig
from litellm.types.utils import (
GenericBudgetInfo,
PersonalUIKeyGenerationConfig,
TeamUIKeyGenerationConfig,
)
def _is_team_key(data: GenerateKeyRequest):
@ -246,7 +250,7 @@ async def generate_key_fn( # noqa: PLR0915
- metadata: Optional[dict] - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
- guardrails: Optional[List[str]] - List of active guardrails for the key
- permissions: Optional[dict] - key-specific permissions. Currently just used for turning off pii masking (if connected). Example - {"pii": false}
- model_max_budget: Optional[dict] - key-specific model budget in USD. Example - {"text-davinci-002": 0.5, "gpt-3.5-turbo": 0.5}. IF null or {} then no model specific budget.
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}. IF null or {} then no model specific budget.
- model_rpm_limit: Optional[dict] - key-specific model rpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific rpm limit.
- model_tpm_limit: Optional[dict] - key-specific model tpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific tpm limit.
- allowed_cache_controls: Optional[list] - List of allowed cache control values. Example - ["no-cache", "no-store"]. See all values - https://docs.litellm.ai/docs/proxy/caching#turn-on--off-caching-per-request
@ -515,6 +519,10 @@ def prepare_key_update_data(
_metadata = existing_key_row.metadata or {}
# validate model_max_budget
if "model_max_budget" in non_default_values:
validate_model_max_budget(non_default_values["model_max_budget"])
non_default_values = prepare_metadata_fields(
data=data, non_default_values=non_default_values, existing_metadata=_metadata
)
@ -548,7 +556,7 @@ async def update_key_fn(
- enforced_params: Optional[List[str]] - List of enforced params for the key (Enterprise only). [Docs](https://docs.litellm.ai/docs/proxy/enterprise#enforce-required-params-for-llm-requests)
- spend: Optional[float] - Amount spent by key
- max_budget: Optional[float] - Max budget for key
- model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
- budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
- soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
- max_parallel_requests: Optional[int] - Rate limit for parallel requests
@ -1035,6 +1043,7 @@ async def generate_key_helper_fn( # noqa: PLR0915
metadata["guardrails"] = guardrails
metadata_json = json.dumps(metadata)
validate_model_max_budget(model_max_budget)
model_max_budget_json = json.dumps(model_max_budget)
user_role = user_role
tpm_limit = tpm_limit
@ -1266,7 +1275,7 @@ async def regenerate_key_fn(
- tags: Optional[List[str]] - Tags for organizing keys (Enterprise only)
- spend: Optional[float] - Amount spent by key
- max_budget: Optional[float] - Max budget for key
- model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
- model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
- budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
- soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
- max_parallel_requests: Optional[int] - Rate limit for parallel requests
@ -1293,8 +1302,7 @@ async def regenerate_key_fn(
--data-raw '{
"max_budget": 100,
"metadata": {"team": "core-infra"},
"models": ["gpt-4", "gpt-3.5-turbo"],
"model_max_budget": {"gpt-4": 50, "gpt-3.5-turbo": 50}
"models": ["gpt-4", "gpt-3.5-turbo"]
}'
```
@ -1949,3 +1957,29 @@ async def _enforce_unique_key_alias(
param="key_alias",
code=status.HTTP_400_BAD_REQUEST,
)
def validate_model_max_budget(model_max_budget: Optional[Dict]) -> None:
    """
    Validate that model_max_budget is a valid GenericBudgetConfigType.

    Every key must be a model name (str) and every value a dict of keyword
    arguments accepted by ``GenericBudgetInfo``. ``None`` and an empty dict
    are accepted and mean "no model-specific budget".

    Note: a ``budget_limit`` entry is converted to ``float`` in place, so the
    caller's dict holds the normalized value after a successful call.

    Args:
        model_max_budget: Mapping of model name -> budget info dict, or None.

    Raises:
        ValueError: If model_max_budget is not a valid GenericBudgetConfigType.
    """
    if model_max_budget is None:
        return

    try:
        if len(model_max_budget) == 0:
            return

        for _model, _budget_info in model_max_budget.items():
            # Explicit checks instead of `assert`: asserts are stripped under
            # `python -O` and produce an empty, unhelpful error message.
            if not isinstance(_model, str):
                raise ValueError(
                    f"model name must be a string, got {type(_model).__name__}"
                )
            if not isinstance(_budget_info, dict):
                raise ValueError(
                    f"budget info for model '{_model}' must be a dict, "
                    f"got {type(_budget_info).__name__}"
                )

            # /CRUD endpoints can pass budget_limit as a string, so we need to convert it to a float
            if "budget_limit" in _budget_info:
                _budget_info["budget_limit"] = float(_budget_info["budget_limit"])

            # Constructing the model validates the remaining fields.
            GenericBudgetInfo(**_budget_info)
    except Exception as e:
        # Normalize every failure to ValueError, keeping the root cause chained.
        raise ValueError(
            f"Invalid model_max_budget: {str(e)}. Example of valid model_max_budget: https://docs.litellm.ai/docs/proxy/users"
        ) from e