From a9a14475132192060e07a87c6c8abd88d5816882 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Sat, 25 May 2024 21:01:19 -0700 Subject: [PATCH] feat(ui): allow admin to configure slack alerting thresholds on ui --- litellm/integrations/slack_alerting.py | 59 +++++-- litellm/proxy/_types.py | 5 +- litellm/proxy/proxy_server.py | 153 ++++++++++++++++++ litellm/proxy/utils.py | 2 +- .../components/alerting/alerting_settings.tsx | 123 ++++++++++++++ .../src/components/alerting/dynamic_form.tsx | 96 +++++++++++ .../src/components/general_settings.tsx | 1 - .../src/components/networking.tsx | 68 +++++++- .../src/components/settings.tsx | 9 +- 9 files changed, 492 insertions(+), 24 deletions(-) create mode 100644 ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx create mode 100644 ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 4e89d4111..9e35b4fc3 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -10,7 +10,7 @@ import asyncio, time import aiohttp from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler import datetime -from pydantic import BaseModel +from pydantic import BaseModel, Field from enum import Enum from datetime import datetime as dt, timedelta, timezone from litellm.integrations.custom_logger import CustomLogger @@ -60,18 +60,55 @@ class LiteLLMBase(BaseModel): return self.dict() +class SlackAlertingArgsEnum(Enum): + daily_report_frequency: int = 12 * 60 * 60 + report_check_interval: int = 5 * 60 + budget_alert_ttl: int = 24 * 60 * 60 + outage_alert_ttl: int = 1 * 60 + region_outage_alert_ttl: int = 1 * 60 + minor_outage_alert_threshold: int = 1 * 5 + major_outage_alert_threshold: int = 1 * 10 + max_outage_alert_list_size: int = 1 * 10 + + class SlackAlertingArgs(LiteLLMBase): - default_daily_report_frequency: int = 12 * 60 * 60 # 12 hours - daily_report_frequency: 
int = int( - os.getenv("SLACK_DAILY_REPORT_FREQUENCY", default_daily_report_frequency) + daily_report_frequency: int = Field( + default=int( + os.getenv( + "SLACK_DAILY_REPORT_FREQUENCY", + SlackAlertingArgsEnum.daily_report_frequency.value, + ) + ), + description="Frequency of receiving deployment latency/failure reports. Default is 12 hours. Value is in seconds.", ) - report_check_interval: int = 5 * 60 # 5 minutes - budget_alert_ttl: int = 24 * 60 * 60 # 24 hours - outage_alert_ttl: int = 1 * 60 # 1 minute ttl - region_outage_alert_ttl: int = 1 * 60 # 1 minute ttl - minor_outage_alert_threshold: int = 5 - major_outage_alert_threshold: int = 10 - max_outage_alert_list_size: int = 10 # prevent memory leak + report_check_interval: int = Field( + default=SlackAlertingArgsEnum.report_check_interval.value, + description="Frequency of checking cache if report should be sent. Background process. Default is every 5 minutes. Value is in seconds.", + ) # 5 minutes + budget_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.budget_alert_ttl.value, + description="Cache ttl for budget alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.", + ) # 24 hours + outage_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.outage_alert_ttl.value, + description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.", + ) # 1 minute ttl + region_outage_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.region_outage_alert_ttl.value, + description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.", + ) # 1 minute ttl + minor_outage_alert_threshold: int = Field( + default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value, + description="The number of errors that count as a model/region minor outage. 
('400' error code is not counted).", + ) + major_outage_alert_threshold: int = Field( + default=SlackAlertingArgsEnum.major_outage_alert_threshold.value, + description="The number of errors that count as a model/region major outage. ('400' error code is not counted).", + ) + max_outage_alert_list_size: int = Field( + default=SlackAlertingArgsEnum.max_outage_alert_list_size.value, + description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.", + ) # prevent memory leak class DeploymentMetrics(LiteLLMBase): diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index 55d354e8b..d4632cb89 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -800,6 +800,7 @@ class ConfigList(LiteLLMBase): field_description: str field_value: Any stored_in_db: Optional[bool] + field_default_value: Any class ConfigGeneralSettings(LiteLLMBase): @@ -877,7 +878,9 @@ class ConfigGeneralSettings(LiteLLMBase): None, description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`", ) - + alerting_args: Optional[Dict] = Field( + None, description="Controllable params for slack alerting - e.g. ttl in cache." 
+ ) alerting_threshold: Optional[int] = Field( None, description="sends alerts if requests hang for 5min+", diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 904a414c2..aa434a166 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -125,6 +125,7 @@ from litellm.proxy.auth.auth_checks import ( ) from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.exceptions import RejectedRequestError +from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting try: from litellm._version import version @@ -3049,6 +3050,13 @@ class ProxyConfig: "global_max_parallel_requests" ] + ## ALERTING ARGS ## + if "alerting_args" in _general_settings: + general_settings["alerting_args"] = _general_settings["alerting_args"] + proxy_logging_obj.slack_alerting_instance.update_values( + alerting_args=general_settings["alerting_args"], + ) + async def add_deployment( self, prisma_client: PrismaClient, @@ -8894,6 +8902,7 @@ async def budget_settings( field_description=field_info.description or "", field_value=db_budget_row_dict.get(field_name, None), stored_in_db=_stored_in_db, + field_default_value=field_info.default, ) return_val.append(_response_obj) @@ -9791,6 +9800,149 @@ async def model_settings(): return returned_list +#### ALERTING MANAGEMENT ENDPOINTS #### + + +@router.get( + "/alerting/settings", + description="Return the configurable alerting param, description, and current value", + tags=["alerting"], + dependencies=[Depends(user_api_key_auth)], + include_in_schema=False, +) +async def alerting_settings( + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + global proxy_logging_obj, prisma_client + """ + Used by UI to generate 'alerting settings' page + { + field_name=field_name, + field_type=allowed_args[field_name]["type"], # string/int + field_description=field_info.description or "", # human-friendly description + field_value=general_settings.get(field_name, 
None), # example value + } + """ + if prisma_client is None: + raise HTTPException( + status_code=400, + detail={"error": CommonProxyErrors.db_not_connected_error.value}, + ) + + if user_api_key_dict.user_role != "proxy_admin": + raise HTTPException( + status_code=400, + detail={ + "error": "{}, your role={}".format( + CommonProxyErrors.not_allowed_access.value, + user_api_key_dict.user_role, + ) + }, + ) + + ## get general settings from db + db_general_settings = await prisma_client.db.litellm_config.find_first( + where={"param_name": "general_settings"} + ) + + if db_general_settings is not None and db_general_settings.param_value is not None: + db_general_settings_dict = dict(db_general_settings.param_value) + alerting_args_dict: dict = db_general_settings_dict.get("alerting_args", {}) # type: ignore + else: + alerting_args_dict = {} + + allowed_args = { + "daily_report_frequency": {"type": "Integer"}, + "report_check_interval": {"type": "Integer"}, + "budget_alert_ttl": {"type": "Integer"}, + "outage_alert_ttl": {"type": "Integer"}, + "region_outage_alert_ttl": {"type": "Integer"}, + "minor_outage_alert_threshold": {"type": "Integer"}, + "major_outage_alert_threshold": {"type": "Integer"}, + "max_outage_alert_list_size": {"type": "Integer"}, + } + + _slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance + _slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump() + + return_val = [] + + for field_name, field_info in SlackAlertingArgs.model_fields.items(): + if field_name in allowed_args: + + _stored_in_db: Optional[bool] = None + if field_name in alerting_args_dict: + _stored_in_db = True + else: + _stored_in_db = False + + _response_obj = ConfigList( + field_name=field_name, + field_type=allowed_args[field_name]["type"], + field_description=field_info.description or "", + field_value=_slack_alerting_args_dict.get(field_name, None), + stored_in_db=_stored_in_db, + field_default_value=field_info.default, + ) + 
return_val.append(_response_obj) + return return_val + + +# @router.post( +# "/alerting/update", +# description="Update the slack alerting settings. Persist value in db.", +# tags=["alerting"], +# dependencies=[Depends(user_api_key_auth)], +# include_in_schema=False, +# ) +# async def alerting_update( +# data: SlackAlertingArgs, +# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +# ): +# """Allows updating slack alerting values. Used by UI.""" +# global prisma_client +# if prisma_client is None: +# raise HTTPException( +# status_code=400, +# detail={"error": CommonProxyErrors.db_not_connected_error.value}, +# ) + +# if user_api_key_dict.user_role != "proxy_admin": +# raise HTTPException( +# status_code=400, +# detail={"error": CommonProxyErrors.not_allowed_access.value}, +# ) + +# ## get general settings from db +# db_general_settings = await prisma_client.db.litellm_config.find_first( +# where={"param_name": "general_settings"} +# ) +# ### update value + +# alerting_args_dict = {} +# if db_general_settings is None or db_general_settings.param_value is None: +# general_settings = {} +# alerting_args_dict = {} +# else: +# general_settings = dict(db_general_settings.param_value) +# _alerting_args_dict = general_settings.get("alerting_args", None) +# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict): +# alerting_args_dict = _alerting_args_dict + + +# alerting_args_dict = data.model + +# response = await prisma_client.db.litellm_config.upsert( +# where={"param_name": "general_settings"}, +# data={ +# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore +# "update": {"param_value": json.dumps(general_settings)}, # type: ignore +# }, +# ) + +# return response + + #### EXPERIMENTAL QUEUING #### async def _litellm_chat_completions_worker(data, user_api_key_dict): """ @@ -10934,6 +11086,7 @@ async def get_config_list( field_description=field_info.description or "", 
field_value=general_settings.get(field_name, None), stored_in_db=_stored_in_db, + field_default_value=field_info.default, ) return_val.append(_response_obj) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index b710165cb..a61bf3255 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -92,7 +92,7 @@ class ProxyLogging: "new_model_added", "outage_alerts", ] - self.slack_alerting_instance = SlackAlerting( + self.slack_alerting_instance: SlackAlerting = SlackAlerting( alerting_threshold=self.alerting_threshold, alerting=self.alerting, alert_types=self.alert_types, diff --git a/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx b/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx new file mode 100644 index 000000000..2e3476656 --- /dev/null +++ b/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx @@ -0,0 +1,123 @@ +/** + * UI for controlling slack alerting settings + */ +import React, { useState, useEffect } from "react"; +import { + Table, + TableHead, + TableRow, + TableHeaderCell, + TableCell, + Button, + Icon, + Badge, + TableBody, + Text, +} from "@tremor/react"; +import { InputNumber, message } from "antd"; +import { alertingSettingsCall, updateConfigFieldSetting } from "../networking"; +import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline"; +import DynamicForm from "./dynamic_form"; +interface alertingSettingsItem { + field_name: string; + field_type: string; + field_value: any; + field_default_value: any; + field_description: string; + stored_in_db: boolean | null; +} + +interface AlertingSettingsProps { + accessToken: string | null; +} + +const AlertingSettings: React.FC = ({ accessToken }) => { + const [alertingSettings, setAlertingSettings] = useState< + alertingSettingsItem[] + >([]); + + console.log("INSIDE ALERTING SETTINGS"); + useEffect(() => { + // get values + if (!accessToken) { + return; + } + alertingSettingsCall(accessToken).then((data) => { + 
setAlertingSettings(data); + }); + }, [accessToken]); + + const handleInputChange = (fieldName: string, newValue: any) => { + // Update the value in the state + const updatedSettings = alertingSettings.map((setting) => + setting.field_name === fieldName + ? { ...setting, field_value: newValue } + : setting + ); + setAlertingSettings(updatedSettings); + }; + + const handleSubmit = (formValues: Record) => { + if (!accessToken) { + return; + } + + let fieldValue = formValues; + + if (fieldValue == null || fieldValue == undefined) { + return; + } + + const initialFormValues: Record = {}; + alertingSettings.forEach((setting) => { + initialFormValues[setting.field_name] = setting.field_value; + }); + + // Merge initialFormValues with actual formValues + const mergedFormValues = { ...formValues, ...initialFormValues }; + try { + updateConfigFieldSetting(accessToken, "alerting_args", mergedFormValues); + // update value in state + message.success("Wait 10s for proxy to update."); + } catch (error) { + // do something + } + }; + + const handleResetField = (fieldName: string, idx: number) => { + if (!accessToken) { + return; + } + + try { + // deleteConfigFieldSetting(accessToken, fieldName); + // update value in state + + const updatedSettings = alertingSettings.map((setting) => + setting.field_name === fieldName + ? 
{ + ...setting, + stored_in_db: null, + field_value: setting.field_default_value, + } + : setting + ); + console.log("INSIDE HANDLE RESET FIELD"); + setAlertingSettings(updatedSettings); + } catch (error) { + // do something + console.log("ERROR OCCURRED!"); + } + }; + + return ( + + ); +}; + +export default AlertingSettings; diff --git a/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx b/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx new file mode 100644 index 000000000..c271712c9 --- /dev/null +++ b/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx @@ -0,0 +1,96 @@ +import React from "react"; +import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd"; +import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline"; +import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react"; +import Paragraph from "antd/es/typography/Paragraph"; +interface AlertingSetting { + field_name: string; + field_description: string; + field_type: string; + field_value: any; + stored_in_db: boolean | null; +} + +interface DynamicFormProps { + alertingSettings: AlertingSetting[]; + handleInputChange: (fieldName: string, newValue: any) => void; + handleResetField: (fieldName: string, index: number) => void; + handleSubmit: (formValues: Record) => void; +} + +const DynamicForm: React.FC = ({ + alertingSettings, + handleInputChange, + handleResetField, + handleSubmit, +}) => { + const [form] = Form.useForm(); + + const onFinish = () => { + const formData = form.getFieldsValue(); + handleSubmit(formData); + }; + + return ( +
+ {alertingSettings.map((value, index) => ( + + + {value.field_name} +

+ {value.field_description} +

+
+ + + {value.field_type === "Integer" ? ( + handleInputChange(value.field_name, e)} + /> + ) : ( + handleInputChange(value.field_name, e)} + /> + )} + + + + {value.stored_in_db == true ? ( + + In DB + + ) : value.stored_in_db == false ? ( + In Config + ) : ( + Not Set + )} + + + handleResetField(value.field_name, index)} + > + Reset + + +
+ ))} +
+ Update Settings +
+
+ ); +}; + +export default DynamicForm; diff --git a/ui/litellm-dashboard/src/components/general_settings.tsx b/ui/litellm-dashboard/src/components/general_settings.tsx index d16b434b8..62e37f175 100644 --- a/ui/litellm-dashboard/src/components/general_settings.tsx +++ b/ui/litellm-dashboard/src/components/general_settings.tsx @@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon import AddFallbacks from "./add_fallbacks"; import openai from "openai"; import Paragraph from "antd/es/skeleton/Paragraph"; - interface GeneralSettingsPageProps { accessToken: string | null; userRole: string | null; diff --git a/ui/litellm-dashboard/src/components/networking.tsx b/ui/litellm-dashboard/src/components/networking.tsx index e7678ba10..28964a4a8 100644 --- a/ui/litellm-dashboard/src/components/networking.tsx +++ b/ui/litellm-dashboard/src/components/networking.tsx @@ -207,6 +207,41 @@ export const budgetCreateCall = async ( throw error; } }; + +export const alertingSettingsCall = async (accessToken: String) => { + /** + * Get all configurable params for setting a model + */ + try { + let url = proxyBaseUrl + ? 
`${proxyBaseUrl}/alerting/settings` + : `/alerting/settings`; + + //message.info("Requesting model data"); + const response = await fetch(url, { + method: "GET", + headers: { + Authorization: `Bearer ${accessToken}`, + "Content-Type": "application/json", + }, + }); + + if (!response.ok) { + const errorData = await response.text(); + message.error(errorData, 10); + throw new Error("Network response was not ok"); + } + + const data = await response.json(); + //message.info("Received model data"); + return data; + // Handle success - you might want to update some state or UI based on the created key + } catch (error) { + console.error("Failed to get callbacks:", error); + throw error; + } +}; + export const keyCreateCall = async ( accessToken: string, userID: string, @@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async ( } }; -export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => { +export const adminspendByProvider = async ( + accessToken: String, + keyToken: String | null, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`; + let url = proxyBaseUrl + ? `${proxyBaseUrl}/global/spend/provider` + : `/global/spend/provider`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String } }; -export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { +export const adminGlobalActivity = async ( + accessToken: String, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`; + let url = proxyBaseUrl + ? 
`${proxyBaseUrl}/global/activity` + : `/global/activity`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String } }; - -export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { +export const adminGlobalActivityPerModel = async ( + accessToken: String, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`; + let url = proxyBaseUrl + ? `${proxyBaseUrl}/global/activity/model` + : `/global/activity/model`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime } }; - export const adminTopModelsCall = async (accessToken: String) => { try { let url = proxyBaseUrl diff --git a/ui/litellm-dashboard/src/components/settings.tsx b/ui/litellm-dashboard/src/components/settings.tsx index 86d9cbfd3..db330905a 100644 --- a/ui/litellm-dashboard/src/components/settings.tsx +++ b/ui/litellm-dashboard/src/components/settings.tsx @@ -31,7 +31,7 @@ import { } from "./networking"; import { Modal, Form, Input, Select, Button as Button2, message } from "antd"; import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider"; - +import AlertingSettings from "./alerting/alerting_settings"; interface SettingsPageProps { accessToken: string | null; userRole: string | null; @@ -117,6 +117,7 @@ const Settings: React.FC = ({ db_exceptions: "Database Exceptions (Read/Write)", daily_reports: "Weekly/Monthly Spend Reports", outage_alerts: "Outage Alerts", + region_outage_alerts: "Region Outage Alerts", }; useEffect(() => { @@ -365,7 +366,8 @@ const Settings: React.FC = ({ Logging 
Callbacks - Alerting + Alerting Types + Alerting Settings @@ -496,6 +498,9 @@ const Settings: React.FC = ({ + + +