forked from phoenix/litellm-mirror
Merge pull request #1859 from BerriAI/litellm_allow_using_budgets_without_keys
[Feat] Budgets for 'user' param passed to /chat/completions, /embeddings etc
This commit is contained in:
commit
73c6ce890b
7 changed files with 187 additions and 7 deletions
|
@ -13,6 +13,7 @@ Requirements:
|
||||||
You can set budgets at 3 levels:
|
You can set budgets at 3 levels:
|
||||||
- For the proxy
|
- For the proxy
|
||||||
- For a user
|
- For a user
|
||||||
|
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
|
||||||
- For a key
|
- For a key
|
||||||
|
|
||||||
|
|
||||||
|
@ -117,6 +118,61 @@ curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||||
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
|
||||||
|
|
||||||
|
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
|
||||||
|
|
||||||
|
**Step 1. Modify config.yaml**
|
||||||
|
Define `litellm.max_user_budget`
|
||||||
|
```yaml
|
||||||
|
general_settings:
|
||||||
|
master_key: sk-1234
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
max_budget: 10 # global budget for proxy
|
||||||
|
max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2. Make a /chat/completions call, pass 'user' - First call Works**
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
|
||||||
|
--data ' {
|
||||||
|
"model": "azure-gpt-3.5",
|
||||||
|
"user": "ishaan3",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "what time is it"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3. Make a /chat/completions call, pass 'user' - Call Fails, since 'ishaan3' over budget**
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
|
||||||
|
--data ' {
|
||||||
|
"model": "azure-gpt-3.5",
|
||||||
|
"user": "ishaan3",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "what time is it"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Error
|
||||||
|
```shell
|
||||||
|
{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}%
|
||||||
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
<TabItem value="per-key" label="For Key">
|
<TabItem value="per-key" label="For Key">
|
||||||
|
|
||||||
|
|
|
@ -148,6 +148,7 @@ s3_callback_params: Optional[Dict] = None
|
||||||
default_key_generate_params: Optional[Dict] = None
|
default_key_generate_params: Optional[Dict] = None
|
||||||
upperbound_key_generate_params: Optional[Dict] = None
|
upperbound_key_generate_params: Optional[Dict] = None
|
||||||
default_team_settings: Optional[List] = None
|
default_team_settings: Optional[List] = None
|
||||||
|
max_user_budget: Optional[float] = None
|
||||||
#### RELIABILITY ####
|
#### RELIABILITY ####
|
||||||
request_timeout: Optional[float] = 6000
|
request_timeout: Optional[float] = 6000
|
||||||
num_retries: Optional[int] = None # per model endpoint
|
num_retries: Optional[int] = None # per model endpoint
|
||||||
|
|
|
@ -68,6 +68,7 @@ litellm_settings:
|
||||||
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
|
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
|
||||||
success_callback: ['langfuse']
|
success_callback: ['langfuse']
|
||||||
max_budget: 10 # global budget for proxy
|
max_budget: 10 # global budget for proxy
|
||||||
|
max_user_budget: 0.0001
|
||||||
budget_duration: 30d # global budget duration, will reset after 30d
|
budget_duration: 30d # global budget duration, will reset after 30d
|
||||||
default_key_generate_params:
|
default_key_generate_params:
|
||||||
max_budget: 1.5000
|
max_budget: 1.5000
|
||||||
|
|
|
@ -91,6 +91,7 @@ from litellm.proxy.utils import (
|
||||||
reset_budget,
|
reset_budget,
|
||||||
hash_token,
|
hash_token,
|
||||||
html_form,
|
html_form,
|
||||||
|
_read_request_body,
|
||||||
)
|
)
|
||||||
from litellm.proxy.secret_managers.google_kms import load_google_kms
|
from litellm.proxy.secret_managers.google_kms import load_google_kms
|
||||||
import pydantic
|
import pydantic
|
||||||
|
@ -370,8 +371,9 @@ async def user_api_key_auth(
|
||||||
# Run checks for
|
# Run checks for
|
||||||
# 1. If token can call model
|
# 1. If token can call model
|
||||||
# 2. If user_id for this token is in budget
|
# 2. If user_id for this token is in budget
|
||||||
# 3. If token is expired
|
# 3. If 'user' passed to /chat/completions, /embeddings endpoint is in budget
|
||||||
# 4. If token spend is under Budget for the token
|
# 4. If token is expired
|
||||||
|
# 5. If token spend is under Budget for the token
|
||||||
|
|
||||||
# Check 1. If token can call model
|
# Check 1. If token can call model
|
||||||
litellm.model_alias_map = valid_token.aliases
|
litellm.model_alias_map = valid_token.aliases
|
||||||
|
@ -430,11 +432,24 @@ async def user_api_key_auth(
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check 2. If user_id for this token is in budget
|
# Check 2. If user_id for this token is in budget
|
||||||
## Check 2.5 If global proxy is in budget
|
## Check 2.1 If global proxy is in budget
|
||||||
|
## Check 2.2 [OPTIONAL - checked only if litellm.max_user_budget is not None] If 'user' passed in /chat/completions is in budget
|
||||||
if valid_token.user_id is not None:
|
if valid_token.user_id is not None:
|
||||||
|
user_id_list = [
|
||||||
|
valid_token.user_id,
|
||||||
|
litellm_proxy_budget_name,
|
||||||
|
]
|
||||||
|
if (
|
||||||
|
litellm.max_user_budget is not None
|
||||||
|
): # Check if 'user' passed in /chat/completions is in budget, only checked if litellm.max_user_budget is set
|
||||||
|
request_data = await _read_request_body(request=request)
|
||||||
|
user_passed_to_chat_completions = request_data.get("user", None)
|
||||||
|
if user_passed_to_chat_completions is not None:
|
||||||
|
user_id_list.append(user_passed_to_chat_completions)
|
||||||
|
|
||||||
if prisma_client is not None:
|
if prisma_client is not None:
|
||||||
user_id_information = await prisma_client.get_data(
|
user_id_information = await prisma_client.get_data(
|
||||||
user_id_list=[valid_token.user_id, litellm_proxy_budget_name],
|
user_id_list=user_id_list,
|
||||||
table_name="user",
|
table_name="user",
|
||||||
query_type="find_all",
|
query_type="find_all",
|
||||||
)
|
)
|
||||||
|
@ -459,7 +474,7 @@ async def user_api_key_auth(
|
||||||
user_current_spend = _user.get("spend", None)
|
user_current_spend = _user.get("spend", None)
|
||||||
|
|
||||||
verbose_proxy_logger.debug(
|
verbose_proxy_logger.debug(
|
||||||
f"user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}"
|
f"user_id: {_user.get('user_id', None)}; user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
@ -852,9 +867,13 @@ async def update_database(
|
||||||
f"Updating existing_spend_obj: {existing_spend_obj}"
|
f"Updating existing_spend_obj: {existing_spend_obj}"
|
||||||
)
|
)
|
||||||
if existing_spend_obj is None:
|
if existing_spend_obj is None:
|
||||||
|
# if user does not exist in LiteLLM_UserTable, create a new user
|
||||||
existing_spend = 0
|
existing_spend = 0
|
||||||
|
max_user_budget = None
|
||||||
|
if litellm.max_user_budget is not None:
|
||||||
|
max_user_budget = litellm.max_user_budget
|
||||||
existing_spend_obj = LiteLLM_UserTable(
|
existing_spend_obj = LiteLLM_UserTable(
|
||||||
user_id=id, spend=0, max_budget=None, user_email=None
|
user_id=id, spend=0, max_budget=max_user_budget, user_email=None
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
existing_spend = existing_spend_obj.spend
|
existing_spend = existing_spend_obj.spend
|
||||||
|
|
|
@ -1234,6 +1234,28 @@ async def reset_budget(prisma_client: PrismaClient):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _read_request_body(request) -> dict:
    """
    Asynchronous function to read the request body and parse it as JSON or literal data.

    Parameters:
    - request: The request object to read the body from

    Returns:
    - dict: Parsed request data as a dictionary; an empty dict for an empty body
    """
    import ast, json

    request_data: dict = {}
    body = await request.body()
    body_str = body.decode()
    if not body_str.strip():
        # No payload (e.g. GET / DELETE requests) - nothing to parse
        return request_data
    try:
        # Prefer strict JSON - the common case for API clients
        request_data = json.loads(body_str)
    except json.JSONDecodeError:
        # Fall back to python-literal payloads (e.g. single-quoted dicts).
        # literal_eval only evaluates literals, so it is safe on untrusted input,
        # but it raises ValueError/SyntaxError on anything else - let that propagate.
        request_data = ast.literal_eval(body_str)
    return request_data
|
||||||
|
|
||||||
|
|
||||||
# LiteLLM Admin UI - Non SSO Login
|
# LiteLLM Admin UI - Non SSO Login
|
||||||
html_form = """
|
html_form = """
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
|
|
|
@ -322,6 +322,87 @@ def test_call_with_user_over_budget(prisma_client):
|
||||||
print(vars(e))
|
print(vars(e))
|
||||||
|
|
||||||
|
|
||||||
|
def test_call_with_end_user_over_budget(prisma_client):
    """Verify a 'user' passed to /chat/completions is spend-tracked and rejected once over budget."""
    # Test if a user passed to /chat/completions is tracked & fails when they cross their budget
    # we only check this when litellm.max_user_budget is set
    import random

    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
    setattr(litellm, "max_user_budget", 0.00001)
    try:

        async def test():
            await litellm.proxy.proxy_server.prisma_client.connect()
            request = GenerateKeyRequest()  # create a key with no budget
            key = await new_user(request)
            print(key)

            generated_key = key.key
            bearer_token = "Bearer " + generated_key
            user = f"ishaan {random.randint(0, 10000)}"
            request = Request(scope={"type": "http"})
            request._url = URL(url="/chat/completions")

            async def return_body():
                return_string = f'{{"model": "gemini-pro-vision", "user": "{user}"}}'
                # return string as bytes
                return return_string.encode()

            request.body = return_body

            # update spend using track_cost callback, make 2nd request, it should fail
            from litellm.proxy.proxy_server import (
                _PROXY_track_cost_callback as track_cost_callback,
            )
            from litellm import ModelResponse, Choices, Message, Usage

            resp = ModelResponse(
                id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
                choices=[
                    Choices(
                        finish_reason=None,
                        index=0,
                        message=Message(
                            content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
                            role="assistant",
                        ),
                    )
                ],
                model="gpt-35-turbo",  # azure always has model written like this
                usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
            )
            await track_cost_callback(
                kwargs={
                    "stream": False,
                    "litellm_params": {
                        "metadata": {
                            "user_api_key": generated_key,
                            "user_api_key_user_id": user,
                        },
                        "proxy_server_request": {
                            "user": user,
                        },
                    },
                    "response_cost": 10,
                },
                completion_response=resp,
                start_time=datetime.now(),
                end_time=datetime.now(),
            )
            await asyncio.sleep(5)
            # use generated key to auth in
            result = await user_api_key_auth(request=request, api_key=bearer_token)
            print("result from user auth with new key", result)
            pytest.fail(f"This should have failed!. They key crossed it's budget")

        asyncio.run(test())
    except Exception as e:
        error_detail = e.message
        assert "Authentication Error, ExceededBudget:" in error_detail
        print(vars(e))
|
||||||
|
|
||||||
|
|
||||||
def test_call_with_proxy_over_budget(prisma_client):
|
def test_call_with_proxy_over_budget(prisma_client):
|
||||||
# 5.1 Make a call with a proxy over budget, expect to fail
|
# 5.1 Make a call with a proxy over budget, expect to fail
|
||||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue