diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md
index e350ce9d5..70fd6e6a8 100644
--- a/docs/my-website/docs/proxy/virtual_keys.md
+++ b/docs/my-website/docs/proxy/virtual_keys.md
@@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
     "metadata": {"user": "ishaan@berri.ai"},
     "team_id": "core-infra",
     "max_budget": 10,
+    "soft_budget": 5,
 }'
 ```
 
@@ -93,6 +94,7 @@ Request Params:
 - `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
 - `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
 - `max_budget`: *Optional[float]* - Specify max budget for a given key.
+- `soft_budget`: *Optional[float]* - Specify a soft budget limit for a given key. Get alerts when the key crosses its soft budget.
 - `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
 - `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
 - `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
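The new `soft_budget` param can be exercised end to end against a running proxy. A minimal sketch, assuming a local proxy on port 8000 and a placeholder master key `sk-1234`, with the `requests` package standing in for curl:

```python
import requests  # assumes the `requests` package is installed

# Hypothetical local proxy + master key; replace with your own values.
PROXY_BASE = "http://0.0.0.0:8000"
MASTER_KEY = "sk-1234"

resp = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": f"Bearer {MASTER_KEY}"},
    json={
        "team_id": "core-infra",
        "max_budget": 10,  # hard limit - requests are blocked past this
        "soft_budget": 5,  # soft limit - fires an alert, requests keep working
    },
)
resp.raise_for_status()
key_data = resp.json()

# With this change /key/generate echoes the soft budget back in its response.
# When soft_budget is omitted, the stored budget row falls back to
# litellm.default_soft_budget (50.0).
print(key_data["key"], key_data["soft_budget"])
```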
diff --git a/litellm/__init__.py b/litellm/__init__.py
index cd639ddb9..f218fe036 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -79,6 +79,9 @@ max_budget: float = 0.0  # set the max budget across all providers
 budget_duration: Optional[str] = (
     None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 )
+default_soft_budget: float = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 _openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
 _openai_completion_params = [
     "functions",
diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py
index e981aef6d..ac30977b3 100644
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@@ -151,6 +151,7 @@ class GenerateRequestBase(LiteLLMBase):
     rpm_limit: Optional[int] = None
     budget_duration: Optional[str] = None
     allowed_cache_controls: Optional[list] = []
+    soft_budget: Optional[float] = None
 
 
 class GenerateKeyRequest(GenerateRequestBase):
@@ -327,7 +328,7 @@ class TeamRequest(LiteLLMBase):
 
 class LiteLLM_BudgetTable(LiteLLMBase):
     """Represents user-controllable params for a LiteLLM_BudgetTable record"""
-
+    soft_budget: Optional[float] = None
     max_budget: Optional[float] = None
     max_parallel_requests: Optional[int] = None
     tpm_limit: Optional[int] = None
@@ -366,7 +367,7 @@ class OrganizationRequest(LiteLLMBase):
 
 class BudgetRequest(LiteLLMBase):
     budgets: List[str]
-
+
 
 class KeyManagementSystem(enum.Enum):
     GOOGLE_KMS = "google_kms"
     AZURE_KEY_VAULT = "azure_key_vault"
@@ -585,6 +586,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     request_id: str
     api_key: str
     model: Optional[str] = ""
+    api_base: Optional[str] = ""
     call_type: str
     spend: Optional[float] = 0.0
     total_tokens: Optional[int] = 0
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index bd3d111a7..1e3d83142 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -791,6 +791,7 @@ async def user_api_key_auth(
             "/global/spend/keys",
             "/global/spend/models",
             "/global/predict/spend/logs",
+            "/health/services",
         ]  # check if the current route startswith any of the allowed routes
         if (
@@ -1814,6 +1815,9 @@ async def generate_key_helper_fn(
     spend: float,
     key_max_budget: Optional[float] = None,  # key_max_budget is used to Budget Per key
     key_budget_duration: Optional[str] = None,
+    key_soft_budget: Optional[
+        float
+    ] = None,  # key_soft_budget is used to Budget Per key
     max_budget: Optional[float] = None,  # max_budget is used to Budget Per user
     budget_duration: Optional[str] = None,  # max_budget is used to Budget Per user
     token: Optional[str] = None,
@@ -1873,6 +1877,19 @@ async def generate_key_helper_fn(
         rpm_limit = rpm_limit
         allowed_cache_controls = allowed_cache_controls
 
+    # TODO: @ishaan-jaff: Migrate all budget tracking to use LiteLLM_BudgetTable
+    if prisma_client is not None:
+        # create the Budget Row for the LiteLLM Verification Token
+        budget_row = LiteLLM_BudgetTable(
+            soft_budget=key_soft_budget or litellm.default_soft_budget,
+            model_max_budget=model_max_budget or {},
+            created_by=user_id,
+            updated_by=user_id,
+        )
+        new_budget = prisma_client.jsonify_object(budget_row.json(exclude_none=True))
+        _budget = await prisma_client.db.litellm_budgettable.create(data={**new_budget})  # type: ignore
+        _budget_id = getattr(_budget, "id", None)
+
     try:
         # Create a new verification token (you may want to enhance this logic based on your needs)
         user_data = {
@@ -1910,6 +1927,7 @@ async def generate_key_helper_fn(
             "allowed_cache_controls": allowed_cache_controls,
             "permissions": permissions_json,
             "model_max_budget": model_max_budget_json,
+            "budget_id": _budget_id,
         }
         if (
             general_settings.get("allow_user_auth", False) == True
@@ -1982,6 +2000,9 @@ async def generate_key_helper_fn(
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+    # Add budget related info in key_data - this ensures it's returned
+    key_data["soft_budget"] = key_soft_budget
     return key_data
@@ -2142,14 +2163,6 @@ async def async_data_generator(response, user_api_key_dict):
     except Exception as e:
         yield f"data: {str(e)}\n\n"
 
-    ### ALERTING ###
-    end_time = time.time()
-    asyncio.create_task(
-        proxy_logging_obj.response_taking_too_long(
-            start_time=start_time, end_time=end_time, type="slow_response"
-        )
-    )
-
     # Streaming is done, yield the [DONE] chunk
     done_message = "[DONE]"
     yield f"data: {done_message}\n\n"
@@ -2497,14 +2510,6 @@ async def completion(
                 headers=custom_headers,
             )
 
-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
         fastapi_response.headers["x-litellm-model-id"] = model_id
         return response
     except Exception as e:
@@ -2703,14 +2708,6 @@ async def chat_completion(
                 headers=custom_headers,
             )
 
-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
         fastapi_response.headers["x-litellm-model-id"] = model_id
 
         ### CALL HOOKS ### - modify outgoing data
@@ -2918,12 +2915,6 @@ async def embeddings(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3069,12 +3060,6 @@ async def image_generation(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3228,12 +3213,6 @@ async def moderations(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3378,6 +3357,8 @@ async def generate_key_fn(
     # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
     if "max_budget" in data_json:
         data_json["key_max_budget"] = data_json.pop("max_budget", None)
+    if "soft_budget" in data_json:
+        data_json["key_soft_budget"] = data_json.pop("soft_budget", None)
     if "budget_duration" in data_json:
         data_json["key_budget_duration"] = data_json.pop("budget_duration", None)
@@ -6722,6 +6703,50 @@ async def test_endpoint(request: Request):
     return {"route": request.url.path}
 
 
+@router.get(
+    "/health/services",
+    tags=["health"],
+    dependencies=[Depends(user_api_key_auth)],
+    include_in_schema=False,
+)
+async def health_services_endpoint(
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+    service: Literal["slack_budget_alerts"] = fastapi.Query(
+        description="Specify the service being hit."
+    ),
+):
+    """
+    Hidden endpoint.
+
+    Used by the UI to let user check if slack alerting is working as expected.
+    """
+    global general_settings, proxy_logging_obj
+
+    if service is None:
+        raise HTTPException(
+            status_code=400, detail={"error": "Service must be specified."}
+        )
+
+    if service not in ["slack_budget_alerts"]:
+        raise HTTPException(
+            status_code=400,
+            detail={
+                "error": f"Service must be in list. Service={service}. List={['slack_budget_alerts']}"
+            },
+        )
+
+    test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` my-secret-project \n`Expected Day of Error`: 28th March \n`Current Spend`: 100 \n`Projected Spend at end of month`: 1000 \n
+    """
+
+    if "slack" in general_settings.get("alerting", []):
+        await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
+    else:
+        raise HTTPException(
+            status_code=422,
+            detail={"error": "No slack connection setup. Unable to test this."},
+        )
+
+
 @router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
 async def health_endpoint(
     user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
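The new `/health/services` route gives the UI (or any client) a way to fire a test Slack alert. A minimal sketch of calling it, assuming the same local proxy and placeholder key as above and `alerting: ["slack"]` set under the proxy's `general_settings`:

```python
import requests  # assumes the `requests` package is installed

# "slack_budget_alerts" is currently the only accepted service value.
resp = requests.get(
    "http://0.0.0.0:8000/health/services",
    params={"service": "slack_budget_alerts"},
    headers={"Authorization": "Bearer sk-1234"},  # placeholder admin key
)

# 200 -> a test alert was pushed to Slack; 422 -> no slack alerting configured.
print(resp.status_code, resp.text)
```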
diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma
index 2607cf2b0..93a9b3123 100644
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@@ -11,10 +11,11 @@ generator client {
 model LiteLLM_BudgetTable {
   budget_id String @id @default(uuid())
   max_budget Float?
+  soft_budget Float?
   max_parallel_requests Int?
   tpm_limit     BigInt?
   rpm_limit     BigInt?
-  model_max_budget Json @default("{}")
+  model_max_budget Json?
   budget_duration String?
   budget_reset_at DateTime?
   created_at DateTime @default(now()) @map("created_at")
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
   allowed_cache_controls String[] @default([])
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  budget_id String?
 }
 
 // store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
+  api_base String @default("")
   user String @default("")
   metadata Json @default("{}")
   cache_hit String @default("")
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 1cc52401a..c67448c86 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -64,6 +64,7 @@ class ProxyLogging:
         litellm.callbacks.append(self.max_parallel_request_limiter)
         litellm.callbacks.append(self.max_budget_limiter)
         litellm.callbacks.append(self.cache_control_check)
+        litellm.callbacks.append(self.response_taking_too_long_callback)
         for callback in litellm.callbacks:
             if callback not in litellm.input_callback:
                 litellm.input_callback.append(callback)
@@ -142,6 +143,30 @@ class ProxyLogging:
                 raise e
         return data
 
+    async def response_taking_too_long_callback(
+        self,
+        kwargs,  # kwargs to completion
+        completion_response,  # response from completion
+        start_time,
+        end_time,  # start/end time
+    ):
+        if self.alerting is None:
+            return
+        time_difference = end_time - start_time
+        # Convert the timedelta to float (in seconds)
+        time_difference_float = time_difference.total_seconds()
+        litellm_params = kwargs.get("litellm_params", {})
+        api_base = litellm_params.get("api_base", "")
+        model = kwargs.get("model", "")
+        messages = kwargs.get("messages", "")
+        request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
+        slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
+        if time_difference_float > self.alerting_threshold:
+            await self.alerting_handler(
+                message=slow_message + request_info,
+                level="Low",
+            )
+
     async def response_taking_too_long(
         self,
         start_time: Optional[float] = None,
@@ -189,16 +214,6 @@ class ProxyLogging:
                     level="Medium",
                 )
 
-        elif (
-            type == "slow_response" and start_time is not None and end_time is not None
-        ):
-            slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
-            if end_time - start_time > self.alerting_threshold:
-                await self.alerting_handler(
-                    message=slow_message + request_info,
-                    level="Low",
-                )
-
     async def budget_alerts(
         self,
         type: Literal[
@@ -1585,6 +1600,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
         "completion_tokens": usage.get("completion_tokens", 0),
         "request_tags": metadata.get("tags", []),
         "end_user": kwargs.get("user", ""),
+        "api_base": litellm_params.get("api_base", ""),
     }
 
     verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
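With this change the slow-response check runs as a regular logging callback after every completed call, instead of per-endpoint `asyncio.create_task` calls. A standalone sketch of the same threshold logic outside the `ProxyLogging` class; the 300s threshold is illustrative, the proxy reads it from `alerting_threshold`:

```python
from datetime import datetime, timedelta

ALERTING_THRESHOLD_S = 300.0  # illustrative value, not a confirmed default


def should_alert(start_time: datetime, end_time: datetime) -> tuple[bool, str]:
    """Mirror the callback's check: alert when the call took longer than the threshold."""
    elapsed = (end_time - start_time).total_seconds()
    message = (
        f"`Responses are slow - {round(elapsed, 2)}s response time "
        f"> Alerting threshold: {ALERTING_THRESHOLD_S}s`"
    )
    return elapsed > ALERTING_THRESHOLD_S, message


# Example: a call that took ~6 minutes would trigger a "Low" severity alert.
start = datetime(2024, 2, 27, 12, 0, 0)
end = start + timedelta(minutes=6)
print(should_alert(start, end))
```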
diff --git a/pyproject.toml b/pyproject.toml
index 0dbe465c3..65e8645fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.28.8"
+version = "1.28.9"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.28.8"
+version = "1.28.9"
 version_files = [
     "pyproject.toml:^version"
 ]
diff --git a/schema.prisma b/schema.prisma
index e7932d634..93a9b3123 100644
--- a/schema.prisma
+++ b/schema.prisma
@@ -11,6 +11,7 @@ generator client {
 model LiteLLM_BudgetTable {
   budget_id String @id @default(uuid())
   max_budget Float?
+  soft_budget Float?
   max_parallel_requests Int?
   tpm_limit     BigInt?
   rpm_limit     BigInt?
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
   allowed_cache_controls String[] @default([])
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  budget_id String?
 }
 
 // store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
+  api_base String @default("")
   user String @default("")
   metadata Json @default("{}")
   cache_hit String @default("")
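The `api_base` column added to `LiteLLM_SpendLogs` (mirrored in both schema files and in `get_logging_payload`) records which upstream endpoint served each request. An illustrative row shape, limited to fields visible in this diff and with made-up values:

```python
# Illustrative only - values are invented, and only fields shown in this diff appear here.
spend_log_row = {
    "request_id": "chatcmpl-123",             # hypothetical request id
    "api_key": "hashed-token",                # hypothetical hashed key
    "call_type": "acompletion",
    "model": "gpt-3.5-turbo",
    "api_base": "https://api.openai.com/v1",  # newly logged upstream endpoint
    "spend": 0.000415,
    "total_tokens": 285,
    "completion_tokens": 140,
    "end_user": "ishaan@berri.ai",
    "request_tags": [],
}
print(spend_log_row["api_base"])
```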
diff --git a/ui/litellm-dashboard/src/components/create_key_button.tsx b/ui/litellm-dashboard/src/components/create_key_button.tsx
index 3dddaf8b7..e76e2d0c2 100644
--- a/ui/litellm-dashboard/src/components/create_key_button.tsx
+++ b/ui/litellm-dashboard/src/components/create_key_button.tsx
@@ -2,7 +2,7 @@
 import React, { useState, useEffect, useRef } from "react";
 import { Button, TextInput, Grid, Col } from "@tremor/react";
-import { Card, Metric, Text } from "@tremor/react";
+import { Card, Metric, Text, Title, Subtitle } from "@tremor/react";
 import {
   Button as Button2,
   Modal,
@@ -38,6 +38,7 @@ const CreateKey: React.FC = ({
   const [form] = Form.useForm();
   const [isModalVisible, setIsModalVisible] = useState(false);
   const [apiKey, setApiKey] = useState(null);
+  const [softBudget, setSoftBudget] = useState(null);
   const handleOk = () => {
     setIsModalVisible(false);
     form.resetFields();
@@ -54,8 +55,11 @@
       message.info("Making API Call");
       setIsModalVisible(true);
       const response = await keyCreateCall(accessToken, userID, formValues);
+
+      console.log("key create Response:", response);
       setData((prevData) => (prevData ? [...prevData, response] : [response])); // Check if prevData is null
       setApiKey(response["key"]);
+      setSoftBudget(response["soft_budget"]);
       message.success("API Key Created");
       form.resetFields();
       localStorage.removeItem("userData" + userID);
@@ -108,6 +112,9 @@
           ))}
+
+
+
@@ -154,28 +161,38 @@
       {apiKey && (
-
-
-            Please save this secret key somewhere safe and accessible. For
-            security reasons, you will not be able to view it again{" "}
-            through your LiteLLM account. If you lose this secret key, you
-            will need to generate a new one.
-
-
-            {apiKey != null ? (
-              API Key: {apiKey}
-            ) : (
-              Key being created, this might take 30s
-            )}
-
+
+          Save your Key
+
+            Please save this secret key somewhere safe and accessible. For
+            security reasons, you will not be able to view it again{" "}
+            through your LiteLLM account. If you lose this secret key, you
+            will need to generate a new one.
+
+
+            {apiKey != null ? (
+
+              API Key: {apiKey}
+              Budgets
+              Soft Limit Budget: ${softBudget}
+
+
+            ) : (
+              Key being created, this might take 30s
+            )}
+
+
       )}
 
 )}
diff --git a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
index 0788af209..f0916ec01 100644
--- a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
+++ b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
@@ -105,7 +105,7 @@ const ViewKeySpendReport: React.FC = ({
   return (
-