Merge pull request #1859 from BerriAI/litellm_allow_using_budgets_without_keys

[Feat] Budgets for 'user' param passed to /chat/completions, /embeddings etc
2024-02-06 16:32:25 -08:00 · 2024-02-06 16:32:25 -08:00 · 73c6ce890b
commit 73c6ce890b
parent 0fd64bc906 6369424629
7 changed files with 187 additions and 7 deletions
--- a/docs/my-website/docs/proxy/users.md
+++ b/docs/my-website/docs/proxy/users.md
@ -13,6 +13,7 @@ Requirements:
 You can set budgets at 3 levels: 
 - For the proxy 
 - For a user 
+- For a `user` passed to `/chat/completions`, `/embeddings`, etc.
 - For a key


@ -117,6 +118,61 @@ curl --location 'http://0.0.0.0:8000/key/generate' \
 --data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
 ```

+</TabItem>
+<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
+
+Use this to set a budget for the `user` passed to `/chat/completions`, **without needing to create a key for every user**
+
+**Step 1. Modify config.yaml**
+
+Set `max_user_budget` under `litellm_settings`
+```yaml
+general_settings:
+  master_key: sk-1234
+
+litellm_settings:
+  max_budget: 10      # global budget for proxy 
+  max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions
+```
+
+**Step 2. Make a `/chat/completions` call, pass `user` - the first call works**
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+        --header 'Content-Type: application/json' \
+        --header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
+        --data ' {
+        "model": "azure-gpt-3.5",
+        "user": "ishaan3",
+        "messages": [
+            {
+            "role": "user",
+            "content": "what time is it"
+            }
+        ]
+        }'
+```
+
+**Step 3. Make another `/chat/completions` call with the same `user` - the call fails, since `ishaan3` is over budget**
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+        --header 'Content-Type: application/json' \
+        --header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
+        --data ' {
+        "model": "azure-gpt-3.5",
+        "user": "ishaan3",
+        "messages": [
+            {
+            "role": "user",
+            "content": "what time is it"
+            }
+        ]
+        }'
+```
+
+Expected error response
+```shell
+{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}
+```
+
 </TabItem>
 <TabItem value="per-key" label="For Key">

--- a/litellm/init.py
+++ b/litellm/init.py
@ -148,6 +148,7 @@ s3_callback_params: Optional[Dict] = None
 default_key_generate_params: Optional[Dict] = None
 upperbound_key_generate_params: Optional[Dict] = None
 default_team_settings: Optional[List] = None
+max_user_budget: Optional[float] = None
 #### RELIABILITY ####
 request_timeout: Optional[float] = 6000
 num_retries: Optional[int] = None  # per model endpoint
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -68,6 +68,7 @@ litellm_settings:
  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
  success_callback: ['langfuse']
  max_budget: 10      # global budget for proxy 
+  max_user_budget: 0.0001
  budget_duration: 30d    # global budget duration, will reset after 30d
  default_key_generate_params:
    max_budget: 1.5000
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -91,6 +91,7 @@ from litellm.proxy.utils import (
    reset_budget,
    hash_token,
    html_form,
+    _read_request_body,
 )
 from litellm.proxy.secret_managers.google_kms import load_google_kms
 import pydantic
@ -370,8 +371,9 @@ async def user_api_key_auth(
            # Run checks for
            # 1. If token can call model
            # 2. If user_id for this token is in budget
-            # 3. If token is expired
-            # 4. If token spend is under Budget for the token
+            # 3. If 'user' passed to /chat/completions, /embeddings endpoint is in budget
+            # 4. If token is expired
+            # 5. If token spend is under Budget for the token

            # Check 1. If token can call model
            litellm.model_alias_map = valid_token.aliases
@ -430,11 +432,24 @@ async def user_api_key_auth(
                )

            # Check 2. If user_id for this token is in budget
-            ## Check 2.5 If global proxy is in budget
+            ## Check 2.1 If global proxy is in budget
+            ## Check 2.2 [OPTIONAL - checked only if litellm.max_user_budget is not None] If 'user' passed in /chat/completions is in budget
            if valid_token.user_id is not None:
+                user_id_list = [
+                    valid_token.user_id,
+                    litellm_proxy_budget_name,
+                ]
+                if (
+                    litellm.max_user_budget is not None
+                ):  # Check if 'user' passed in /chat/completions is in budget, only checked if litellm.max_user_budget is set
+                    request_data = await _read_request_body(request=request)
+                    user_passed_to_chat_completions = request_data.get("user", None)
+                    if user_passed_to_chat_completions is not None:
+                        user_id_list.append(user_passed_to_chat_completions)
+
                if prisma_client is not None:
                    user_id_information = await prisma_client.get_data(
-                        user_id_list=[valid_token.user_id, litellm_proxy_budget_name],
+                        user_id_list=user_id_list,
                        table_name="user",
                        query_type="find_all",
                    )
@ -459,7 +474,7 @@ async def user_api_key_auth(
                            user_current_spend = _user.get("spend", None)

                            verbose_proxy_logger.debug(
-                                f"user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}"
+                                f"user_id: {_user.get('user_id', None)}; user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}"
                            )

                            if (
@ -852,9 +867,13 @@ async def update_database(
                    f"Updating existing_spend_obj: {existing_spend_obj}"
                )
                if existing_spend_obj is None:
+                    # if user does not exist in LiteLLM_UserTable, create a new user
                    existing_spend = 0
+                    max_user_budget = None
+                    if litellm.max_user_budget is not None:
+                        max_user_budget = litellm.max_user_budget
                    existing_spend_obj = LiteLLM_UserTable(
-                        user_id=id, spend=0, max_budget=None, user_email=None
+                        user_id=id, spend=0, max_budget=max_user_budget, user_email=None
                    )
                else:
                    existing_spend = existing_spend_obj.spend
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -1234,6 +1234,28 @@ async def reset_budget(prisma_client: PrismaClient):
            )


+async def _read_request_body(request):
+    """
+    Asynchronous function to read the request body and parse it into a dictionary.
+
+    The body is parsed as JSON first, since JSON escapes (e.g. surrogate pairs)
+    are only decoded correctly by a JSON parser. If the body is not valid JSON,
+    it is parsed as a Python literal (e.g. a single-quoted dict) as a fallback.
+
+    Parameters:
+    - request: The request object to read the body from; it must expose an
+      awaitable `body()` that returns bytes
+
+    Returns:
+    - dict: Parsed request data as a dictionary. An empty dict is returned when
+      the body is empty or whitespace-only (e.g. a request sent without a
+      payload), or when the parsed payload is not a dictionary.
+
+    Raises:
+    - json.JSONDecodeError: if a non-empty body is neither valid JSON nor a
+      valid Python literal
+    """
+    import ast
+    import json
+
+    body = await request.body()
+    body_str = body.decode()
+    if not body_str.strip():
+        # nothing to parse - avoid raising on requests that carry no payload
+        return {}
+
+    try:
+        request_data = json.loads(body_str)
+    except json.JSONDecodeError as json_error:
+        try:
+            request_data = ast.literal_eval(body_str)
+        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
+            # neither format matched - surface the JSON error, the same
+            # exception type that was raised for unparseable bodies before
+            raise json_error
+
+    # the documented contract is a dict; callers use dict methods like .get()
+    if not isinstance(request_data, dict):
+        return {}
+    return request_data
+
+
 # LiteLLM Admin UI - Non SSO Login
 html_form = """
 <!DOCTYPE html>
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -1743,7 +1743,7 @@ def test_azure_cloudflare_api():

 def test_completion_anyscale_2():
    try:
-        # litellm.set_verbose=True
+        # litellm.set_verbose= True
        messages = [
            {"role": "system", "content": "You're a good bot"},
            {
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@ -322,6 +322,87 @@ def test_call_with_user_over_budget(prisma_client):
        print(vars(e))


+def test_call_with_end_user_over_budget(prisma_client):
+    # Test if a user passed to /chat/completions is tracked & fails when they cross their budget
+    # we only check this when litellm.max_user_budget is set
+    #
+    # Flow: create a key with no budget -> record spend for the end user via the
+    # cost-tracking callback -> authenticate a /chat/completions request that
+    # carries the same 'user' -> expect an ExceededBudget authentication error
+    import random
+
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
+    # max_user_budget is module-level state; remember the current value so it
+    # can be restored and does not leak into tests that run after this one
+    previous_max_user_budget = getattr(litellm, "max_user_budget", None)
+    setattr(litellm, "max_user_budget", 0.00001)
+    try:
+
+        async def test():
+            await litellm.proxy.proxy_server.prisma_client.connect()
+            request = GenerateKeyRequest()  # create a key with no budget
+            key = await new_user(request)
+            print(key)
+
+            generated_key = key.key
+            bearer_token = "Bearer " + generated_key
+            user = f"ishaan {random.randint(0, 10000)}"
+            request = Request(scope={"type": "http"})
+            request._url = URL(url="/chat/completions")
+
+            async def return_body():
+                return_string = f'{{"model": "gemini-pro-vision", "user": "{user}"}}'
+                # return string as bytes
+                return return_string.encode()
+
+            # the auth check reads 'user' from the request body, so stub it out
+            request.body = return_body
+
+            # update spend using track_cost callback, make 2nd request, it should fail
+            from litellm.proxy.proxy_server import (
+                _PROXY_track_cost_callback as track_cost_callback,
+            )
+            from litellm import ModelResponse, Choices, Message, Usage
+
+            resp = ModelResponse(
+                id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
+                choices=[
+                    Choices(
+                        finish_reason=None,
+                        index=0,
+                        message=Message(
+                            content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
+                            role="assistant",
+                        ),
+                    )
+                ],
+                model="gpt-35-turbo",  # azure always has model written like this
+                usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
+            )
+            await track_cost_callback(
+                kwargs={
+                    "stream": False,
+                    "litellm_params": {
+                        "metadata": {
+                            "user_api_key": generated_key,
+                            "user_api_key_user_id": user,
+                        },
+                        "proxy_server_request": {
+                            "user": user,
+                        },
+                    },
+                    "response_cost": 10,
+                },
+                completion_response=resp,
+                start_time=datetime.now(),
+                end_time=datetime.now(),
+            )
+            await asyncio.sleep(5)
+            # use generated key to auth in
+            result = await user_api_key_auth(request=request, api_key=bearer_token)
+            print("result from user auth with new key", result)
+            pytest.fail("This should have failed! The end user crossed their budget")
+
+        asyncio.run(test())
+    except Exception as e:
+        # not every exception carries `.message`; fall back to str(e) so an
+        # unexpected error fails the assertion below instead of raising AttributeError
+        error_detail = getattr(e, "message", str(e))
+        assert "Authentication Error, ExceededBudget:" in error_detail
+        print(vars(e))
+    finally:
+        setattr(litellm, "max_user_budget", previous_max_user_budget)
+
+
 def test_call_with_proxy_over_budget(prisma_client):
    # 5.1 Make a call with a proxy over budget, expect to fail
    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)