Merge branch 'main' into litellm_organization_table

2024-03-02 16:09:28 -08:00 · 2024-03-02 16:09:28 -08:00 · eaccbf26b7
commit eaccbf26b7
parent 96f11157c9 b30cbd0d55
10 changed files with 146 additions and 75 deletions
--- a/docs/my-website/docs/proxy/virtual_keys.md
+++ b/docs/my-website/docs/proxy/virtual_keys.md
@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
  "metadata": {"user": "ishaan@berri.ai"},
  "team_id": "core-infra",
  "max_budget": 10,
+  "soft_budget": 5
 }'
 ```

@ -93,6 +94,7 @@ Request Params:
 - `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
 - `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
 - `max_budget`: *Optional[float]* - Specify max budget for a given key.
+- `soft_budget`: *Optional[float]* - Specify a soft budget for a given key. An alert is sent when the key's spend reaches its soft budget.
 - `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
 - `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
 - `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
--- a/litellm/init.py
+++ b/litellm/init.py
@ -79,6 +79,9 @@ max_budget: float = 0.0  # set the max budget across all providers
 budget_duration: Optional[str] = (
    None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 )
+default_soft_budget: float = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 _openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
 _openai_completion_params = [
    "functions",
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -151,6 +151,7 @@ class GenerateRequestBase(LiteLLMBase):
    rpm_limit: Optional[int] = None
    budget_duration: Optional[str] = None
    allowed_cache_controls: Optional[list] = []
+    soft_budget: Optional[float] = None


 class GenerateKeyRequest(GenerateRequestBase):
@ -327,7 +328,7 @@ class TeamRequest(LiteLLMBase):

 class LiteLLM_BudgetTable(LiteLLMBase):
    """Represents user-controllable params for a LiteLLM_BudgetTable record"""
-
+    soft_budget: Optional[float] = None
    max_budget: Optional[float] = None
    max_parallel_requests: Optional[int] = None
    tpm_limit: Optional[int] = None
@ -585,6 +586,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
    request_id: str
    api_key: str
    model: Optional[str] = ""
+    api_base: Optional[str] = ""
    call_type: str
    spend: Optional[float] = 0.0
    total_tokens: Optional[int] = 0
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -791,6 +791,7 @@ async def user_api_key_auth(
                "/global/spend/keys",
                "/global/spend/models",
                "/global/predict/spend/logs",
+                "/health/services",
            ]
            # check if the current route startswith any of the allowed routes
            if (
@ -1814,6 +1815,9 @@ async def generate_key_helper_fn(
    spend: float,
    key_max_budget: Optional[float] = None,  # key_max_budget is used to Budget Per key
    key_budget_duration: Optional[str] = None,
+    key_soft_budget: Optional[
+        float
+    ] = None,  # key_soft_budget is used to Budget Per key
    max_budget: Optional[float] = None,  # max_budget is used to Budget Per user
    budget_duration: Optional[str] = None,  # max_budget is used to Budget Per user
    token: Optional[str] = None,
@ -1873,6 +1877,19 @@ async def generate_key_helper_fn(
    rpm_limit = rpm_limit
    allowed_cache_controls = allowed_cache_controls

+    # TODO: @ishaan-jaff: Migrate all budget tracking to use LiteLLM_BudgetTable
+    # Default to None so the key payload built further down can always read
+    # `_budget_id`, even when no prisma client is configured and therefore no
+    # budget row gets created.
+    _budget_id = None
+    if prisma_client is not None:
+        # create the Budget Row for the LiteLLM Verification Token;
+        # falls back to the proxy-wide default when no soft budget was passed
+        budget_row = LiteLLM_BudgetTable(
+            soft_budget=key_soft_budget or litellm.default_soft_budget,
+            model_max_budget=model_max_budget or {},
+            created_by=user_id,
+            updated_by=user_id,
+        )
+        new_budget = prisma_client.jsonify_object(budget_row.json(exclude_none=True))
+        _budget = await prisma_client.db.litellm_budgettable.create(data={**new_budget})  # type: ignore
+        # the primary key column of LiteLLM_BudgetTable is `budget_id` (there is no `id` column)
+        _budget_id = getattr(_budget, "budget_id", None)
+
    try:
        # Create a new verification token (you may want to enhance this logic based on your needs)
        user_data = {
@ -1910,6 +1927,7 @@ async def generate_key_helper_fn(
            "allowed_cache_controls": allowed_cache_controls,
            "permissions": permissions_json,
            "model_max_budget": model_max_budget_json,
+            "budget_id": _budget_id,
        }
        if (
            general_settings.get("allow_user_auth", False) == True
@ -1982,6 +2000,9 @@ async def generate_key_helper_fn(
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+    # Add budget related info in key_data - this ensures it's returned
+    key_data["soft_budget"] = key_soft_budget
    return key_data


@ -2142,14 +2163,6 @@ async def async_data_generator(response, user_api_key_dict):
            except Exception as e:
                yield f"data: {str(e)}\n\n"

-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
        # Streaming is done, yield the [DONE] chunk
        done_message = "[DONE]"
        yield f"data: {done_message}\n\n"
@ -2497,14 +2510,6 @@ async def completion(
                headers=custom_headers,
            )

-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
        fastapi_response.headers["x-litellm-model-id"] = model_id
        return response
    except Exception as e:
@ -2703,14 +2708,6 @@ async def chat_completion(
                headers=custom_headers,
            )

-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
        fastapi_response.headers["x-litellm-model-id"] = model_id

        ### CALL HOOKS ### - modify outgoing data
@ -2918,12 +2915,6 @@ async def embeddings(

        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )

        return response
    except Exception as e:
@ -3069,12 +3060,6 @@ async def image_generation(

        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )

        return response
    except Exception as e:
@ -3228,12 +3213,6 @@ async def moderations(

        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )

        return response
    except Exception as e:
@ -3378,6 +3357,8 @@ async def generate_key_fn(
        # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
        if "max_budget" in data_json:
            data_json["key_max_budget"] = data_json.pop("max_budget", None)
+        if "soft_budget" in data_json:
+            data_json["key_soft_budget"] = data_json.pop("soft_budget", None)

        if "budget_duration" in data_json:
            data_json["key_budget_duration"] = data_json.pop("budget_duration", None)
@ -6722,6 +6703,50 @@ async def test_endpoint(request: Request):
    return {"route": request.url.path}


+@router.get(
+    "/health/services",
+    tags=["health"],
+    dependencies=[Depends(user_api_key_auth)],
+    include_in_schema=False,
+)
+async def health_services_endpoint(
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+    service: Literal["slack_budget_alerts"] = fastapi.Query(
+        description="Specify the service being hit."
+    ),
+):
+    """
+    Hidden endpoint.
+
+    Used by the UI to let user check if slack alerting is working as expected.
+
+    Sends a sample budget alert through `proxy_logging_obj.alerting_handler`
+    when "slack" is listed under `general_settings["alerting"]`.
+
+    Raises:
+        HTTPException (400): `service` is missing or is not a supported service.
+        HTTPException (422): slack alerting is not configured.
+    """
+    global general_settings, proxy_logging_obj
+
+    if service is None:
+        raise HTTPException(
+            status_code=400, detail={"error": "Service must be specified."}
+        )
+
+    if service not in ["slack_budget_alerts"]:
+        raise HTTPException(
+            status_code=400,
+            detail={
+                "error": f"Service must be in list. Service={service}. List={['slack_budget_alerts']}"
+            },
+        )
+
+    # The `alerting` key may exist with a None value; treat that the same as
+    # "no alerting configured" instead of failing the membership test.
+    alerting = general_settings.get("alerting") or []
+    if "slack" not in alerting:
+        raise HTTPException(
+            status_code=422,
+            detail={"error": "No slack connection setup. Unable to test this."},
+        )
+
+    test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` my-secret-project \n`Expected Day of Error`: 28th March \n`Current Spend`: 100 \n`Projected Spend at end of month`: 1000 \n
+    """
+
+    await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
+
+
@router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
 async def health_endpoint(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@ -11,10 +11,11 @@ generator client {
 model LiteLLM_BudgetTable {
  budget_id String @id @default(uuid())
  max_budget Float?
+  soft_budget Float?
  max_parallel_requests Int?
  tpm_limit     BigInt?
  rpm_limit     BigInt?
-  model_max_budget Json @default("{}")
+  model_max_budget Json?
  budget_duration String? 
  budget_reset_at DateTime?
  created_at    DateTime               @default(now()) @map("created_at")
@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
    allowed_cache_controls String[] @default([])
    model_spend      Json @default("{}")
    model_max_budget Json @default("{}")
+    budget_id String?
 }

 // store proxy config.yaml
@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
  startTime           DateTime // Assuming start_time is a DateTime field
  endTime             DateTime // Assuming end_time is a DateTime field
  model               String   @default("")
+  api_base            String   @default("")
  user                String   @default("")
  metadata            Json     @default("{}")
  cache_hit           String   @default("")
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -64,6 +64,7 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_parallel_request_limiter)
        litellm.callbacks.append(self.max_budget_limiter)
        litellm.callbacks.append(self.cache_control_check)
+        litellm.callbacks.append(self.response_taking_too_long_callback)
        for callback in litellm.callbacks:
            if callback not in litellm.input_callback:
                litellm.input_callback.append(callback)
@ -142,6 +143,30 @@ class ProxyLogging:
                raise e
        return data

+    async def response_taking_too_long_callback(
+        self,
+        kwargs,  # kwargs to completion
+        completion_response,  # response from completion
+        start_time,
+        end_time,  # start/end time
+    ):
+        """
+        Send a "Low" level alert when a call took longer than `self.alerting_threshold` seconds.
+
+        Does nothing when `self.alerting` is None. `end_time - start_time` must
+        yield an object exposing `total_seconds()`.
+        The alert includes the model, api base and messages read from `kwargs`.
+        """
+        if self.alerting is None:
+            return
+        # Convert the timedelta to float (in seconds)
+        time_difference_float = (end_time - start_time).total_seconds()
+        if time_difference_float > self.alerting_threshold:
+            # Build the alert text only when an alert is actually sent.
+            # `litellm_params` may be present with a None value; fall back to
+            # an empty dict so the lookup below cannot fail.
+            litellm_params = kwargs.get("litellm_params") or {}
+            api_base = litellm_params.get("api_base", "")
+            model = kwargs.get("model", "")
+            messages = kwargs.get("messages", "")
+            request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
+            slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
+            await self.alerting_handler(
+                message=slow_message + request_info,
+                level="Low",
+            )
+
    async def response_taking_too_long(
        self,
        start_time: Optional[float] = None,
@ -189,16 +214,6 @@ class ProxyLogging:
                    level="Medium",
                )

-        elif (
-            type == "slow_response" and start_time is not None and end_time is not None
-        ):
-            slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
-            if end_time - start_time > self.alerting_threshold:
-                await self.alerting_handler(
-                    message=slow_message + request_info,
-                    level="Low",
-                )
-
    async def budget_alerts(
        self,
        type: Literal[
@ -1585,6 +1600,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
        "completion_tokens": usage.get("completion_tokens", 0),
        "request_tags": metadata.get("tags", []),
        "end_user": kwargs.get("user", ""),
+        "api_base": litellm_params.get("api_base", ""),
    }

    verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.28.8"
+version = "1.28.9"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.28.8"
+version = "1.28.9"
 version_files = [
    "pyproject.toml:^version"
 ]
--- a/schema.prisma
+++ b/schema.prisma
@ -11,6 +11,7 @@ generator client {
 model LiteLLM_BudgetTable {
  budget_id String @id @default(uuid())
  max_budget Float?
+  soft_budget Float?
  max_parallel_requests Int?
  tpm_limit     BigInt?
  rpm_limit     BigInt?
@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
    allowed_cache_controls String[] @default([])
    model_spend      Json @default("{}")
    model_max_budget Json @default("{}")
+    budget_id String?
 }

 // store proxy config.yaml
@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
  startTime           DateTime // Assuming start_time is a DateTime field
  endTime             DateTime // Assuming end_time is a DateTime field
  model               String   @default("")
+  api_base            String   @default("")
  user                String   @default("")
  metadata            Json     @default("{}")
  cache_hit           String   @default("")
--- a/ui/litellm-dashboard/src/components/create_key_button.tsx
+++ b/ui/litellm-dashboard/src/components/create_key_button.tsx
@ -2,7 +2,7 @@

 import React, { useState, useEffect, useRef } from "react";
 import { Button, TextInput, Grid, Col } from "@tremor/react";
-import { Card, Metric, Text } from "@tremor/react";
+import { Card, Metric, Text, Title, Subtitle } from "@tremor/react";
 import {
  Button as Button2,
  Modal,
@ -38,6 +38,7 @@ const CreateKey: React.FC<CreateKeyProps> = ({
  const [form] = Form.useForm();
  const [isModalVisible, setIsModalVisible] = useState(false);
  const [apiKey, setApiKey] = useState(null);
+  const [softBudget, setSoftBudget] = useState(null);
  const handleOk = () => {
    setIsModalVisible(false);
    form.resetFields();
@ -54,8 +55,11 @@ const CreateKey: React.FC<CreateKeyProps> = ({
      message.info("Making API Call");
      setIsModalVisible(true);
      const response = await keyCreateCall(accessToken, userID, formValues);
+
+      console.log("key create Response:", response);
      setData((prevData) => (prevData ? [...prevData, response] : [response])); // Check if prevData is null
      setApiKey(response["key"]);
+      setSoftBudget(response["soft_budget"]);
      message.success("API Key Created");
      form.resetFields();
      localStorage.removeItem("userData" + userID);
@ -108,6 +112,9 @@ const CreateKey: React.FC<CreateKeyProps> = ({
                  ))}
                </Select>
              </Form.Item>
+              <Form.Item label="Soft Budget (USD) Monthly" name="soft_budget" initialValue={50.00}>
+                <InputNumber step={0.01} precision={2} defaultValue={50.00} width={200} />
+              </Form.Item>
              <Form.Item label="Max Budget (USD)" name="max_budget">
                <InputNumber step={0.01} precision={2} width={200} />
              </Form.Item>
@ -154,13 +161,14 @@ const CreateKey: React.FC<CreateKeyProps> = ({
      </Modal>
      {apiKey && (
        <Modal
-          title="Save your key"
          visible={isModalVisible}
          onOk={handleOk}
          onCancel={handleCancel}
          footer={null}
        >
          <Grid numItems={1} className="gap-2 w-full">
+            <Card>
+              <Title>Save your Key</Title>
              <Col numColSpan={1}>
                <p>
                  Please save this secret key somewhere safe and accessible. For
@ -171,11 +179,20 @@ const CreateKey: React.FC<CreateKeyProps> = ({
              </Col>
              <Col numColSpan={1}>
                {apiKey != null ? (
+                  <div>
                    <Text>API Key: {apiKey}</Text>
+                    <Title className="mt-6">Budgets</Title>
+                      <Text>Soft Limit Budget: ${softBudget}</Text>
+                      <Button className="mt-3">
+                        Test Alert
+                      </Button>
+
+                  </div>
                ) : (
                  <Text>Key being created, this might take 30s</Text>
                )}
              </Col>
+            </Card>
          </Grid>
        </Modal>
      )}
--- a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
+++ b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
@ -105,7 +105,7 @@ const ViewKeySpendReport: React.FC<ViewKeySpendReportProps> = ({

  return (
    <div>
-      <Button size = "xs" onClick={showModal}>
+      <Button size = "xs" onClick={showModal} variant="secondary">
        View Spend Report
      </Button>
      <Modal