(feat proxy) v2 - model max budgets (#7302)

* clean up unused code * add _PROXY_VirtualKeyModelMaxBudgetLimiter * adjust type imports * working _PROXY_VirtualKeyModelMaxBudgetLimiter * fix user_api_key_model_max_budget * fix user_api_key_model_max_budget * update naming * update naming * fix changes to RouterBudgetLimiting * test_call_with_key_over_model_budget * test_call_with_key_over_model_budget * handle _get_request_model_budget_config * e2e test for test_call_with_key_over_model_budget * clean up test * run ci/cd again * add validate_model_max_budget * docs fix * update doc * add e2e testing for _PROXY_VirtualKeyModelMaxBudgetLimiter * test_unit_test_max_model_budget_limiter.py
2025-04-27 03:34:10 +00:00 · 2024-12-18 19:42:46 -08:00 · 2024-12-18 19:42:46 -08:00 · 6220e17ebf
commit 6220e17ebf
parent 1a4910f6c0
14 changed files with 628 additions and 261 deletions
--- a/litellm/proxy/management_endpoints/key_management_endpoints.py
+++ b/litellm/proxy/management_endpoints/key_management_endpoints.py
@ -40,7 +40,11 @@ from litellm.proxy.utils import (
    handle_exception_on_proxy,
 )
 from litellm.secret_managers.main import get_secret
-from litellm.types.utils import PersonalUIKeyGenerationConfig, TeamUIKeyGenerationConfig
+from litellm.types.utils import (
+    GenericBudgetInfo,
+    PersonalUIKeyGenerationConfig,
+    TeamUIKeyGenerationConfig,
+)


 def _is_team_key(data: GenerateKeyRequest):
@ -246,7 +250,7 @@ async def generate_key_fn(  # noqa: PLR0915
    - metadata: Optional[dict] - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
    - guardrails: Optional[List[str]] - List of active guardrails for the key
    - permissions: Optional[dict] - key-specific permissions. Currently just used for turning off pii masking (if connected). Example - {"pii": false}
-    - model_max_budget: Optional[dict] - key-specific model budget in USD. Example - {"text-davinci-002": 0.5, "gpt-3.5-turbo": 0.5}. IF null or {} then no model specific budget.
+    - model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}. IF null or {} then no model specific budget.
    - model_rpm_limit: Optional[dict] - key-specific model rpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific rpm limit.
    - model_tpm_limit: Optional[dict] - key-specific model tpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific tpm limit.
    - allowed_cache_controls: Optional[list] - List of allowed cache control values. Example - ["no-cache", "no-store"]. See all values - https://docs.litellm.ai/docs/proxy/caching#turn-on--off-caching-per-request
@ -515,6 +519,10 @@ def prepare_key_update_data(

    _metadata = existing_key_row.metadata or {}

+    # validate model_max_budget
+    if "model_max_budget" in non_default_values:
+        validate_model_max_budget(non_default_values["model_max_budget"])
+
    non_default_values = prepare_metadata_fields(
        data=data, non_default_values=non_default_values, existing_metadata=_metadata
    )
@ -548,7 +556,7 @@ async def update_key_fn(
    - enforced_params: Optional[List[str]] - List of enforced params for the key (Enterprise only). [Docs](https://docs.litellm.ai/docs/proxy/enterprise#enforce-required-params-for-llm-requests)
    - spend: Optional[float] - Amount spent by key
    - max_budget: Optional[float] - Max budget for key
-    - model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
+    - model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
    - budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
    - soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
    - max_parallel_requests: Optional[int] - Rate limit for parallel requests
@ -1035,6 +1043,7 @@ async def generate_key_helper_fn(  # noqa: PLR0915
        metadata["guardrails"] = guardrails

    metadata_json = json.dumps(metadata)
+    validate_model_max_budget(model_max_budget)
    model_max_budget_json = json.dumps(model_max_budget)
    user_role = user_role
    tpm_limit = tpm_limit
@ -1266,7 +1275,7 @@ async def regenerate_key_fn(
        - tags: Optional[List[str]] - Tags for organizing keys (Enterprise only)
        - spend: Optional[float] - Amount spent by key
        - max_budget: Optional[float] - Max budget for key
-        - model_max_budget: Optional[dict] - Model-specific budgets {"gpt-4": 0.5, "claude-v1": 1.0}
+        - model_max_budget: Optional[Dict[str, GenericBudgetInfo]] - Model-specific budgets {"gpt-4": {"budget_limit": 0.0005, "time_period": "30d"}}
        - budget_duration: Optional[str] - Budget reset period ("30d", "1h", etc.)
        - soft_budget: Optional[float] - Soft budget limit (warning vs. hard stop). Will trigger a slack alert when this soft budget is reached.
        - max_parallel_requests: Optional[int] - Rate limit for parallel requests
@ -1293,8 +1302,7 @@ async def regenerate_key_fn(
    --data-raw '{
        "max_budget": 100,
        "metadata": {"team": "core-infra"},
-        "models": ["gpt-4", "gpt-3.5-turbo"],
-        "model_max_budget": {"gpt-4": 50, "gpt-3.5-turbo": 50}
+        "models": ["gpt-4", "gpt-3.5-turbo"]
    }'
    ```

@ -1949,3 +1957,29 @@ async def _enforce_unique_key_alias(
                param="key_alias",
                code=status.HTTP_400_BAD_REQUEST,
            )
+
+
+def validate_model_max_budget(model_max_budget: Optional[Dict]) -> None:
+    """
+    Validate that every entry of model_max_budget can build a GenericBudgetInfo.
+
+    None / {} mean "no model specific budget". Any "budget_limit" is cast to float in place.
+
+    Raises:
+        ValueError: If a model name is not a str or its budget info is rejected
+    """
+    try:
+        if model_max_budget is None or len(model_max_budget) == 0:
+            return
+        for _model, _budget_info in model_max_budget.items():
+            # explicit check: an `assert` here would be stripped under `python -O`
+            if not isinstance(_model, str):
+                raise TypeError(f"model name must be a str, got {type(_model).__name__}")
+            # /CRUD endpoints can pass budget_limit as a string, so we need to convert it to a float
+            if "budget_limit" in _budget_info:
+                _budget_info["budget_limit"] = float(_budget_info["budget_limit"])
+            GenericBudgetInfo(**_budget_info)
+    except Exception as e:
+        raise ValueError(
+            f"Invalid model_max_budget: {str(e)}. Example of valid model_max_budget: https://docs.litellm.ai/docs/proxy/users"
+        ) from e