feat(ui): allow admin to configure slack alerting thresholds on ui

This commit is contained in:
Krrish Dholakia 2024-05-25 21:01:19 -07:00
parent c2f19d631e
commit a9a1447513
9 changed files with 492 additions and 24 deletions

View file

@ -10,7 +10,7 @@ import asyncio, time
import aiohttp import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime import datetime
from pydantic import BaseModel from pydantic import BaseModel, Field
from enum import Enum from enum import Enum
from datetime import datetime as dt, timedelta, timezone from datetime import datetime as dt, timedelta, timezone
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
@ -60,18 +60,55 @@ class LiteLLMBase(BaseModel):
return self.dict() return self.dict()
class SlackAlertingArgsEnum(Enum):
    """Built-in defaults for the tunable Slack alerting parameters.

    The ``*_frequency``, ``*_interval`` and ``*_ttl`` members are durations in
    seconds; the remaining members are plain counts. Read a default through
    ``.value``.

    NOTE: ``Enum`` turns members that share a value into aliases of the first
    member defined with that value. Here ``region_outage_alert_ttl`` is an
    alias of ``outage_alert_ttl`` and ``max_outage_alert_list_size`` is an
    alias of ``major_outage_alert_threshold``. ``.value`` lookups are not
    affected, but iterating the enum (or reading ``.name``) only ever shows
    the canonical member.
    """

    # --- durations (seconds) ---
    daily_report_frequency: int = 12 * 3600  # 12 hours
    report_check_interval: int = 300  # 5 minutes
    budget_alert_ttl: int = 24 * 3600  # 24 hours
    outage_alert_ttl: int = 60  # 1 minute
    region_outage_alert_ttl: int = 60  # 1 minute

    # --- counts ---
    minor_outage_alert_threshold: int = 5
    major_outage_alert_threshold: int = 10
    max_outage_alert_list_size: int = 10
class SlackAlertingArgs(LiteLLMBase):
    """Tunable parameters for Slack alerting.

    Every field defaults to the matching ``SlackAlertingArgsEnum`` member.
    ``daily_report_frequency`` can additionally be overridden with the
    ``SLACK_DAILY_REPORT_FREQUENCY`` environment variable; it is read once,
    when this class body is executed, and passed through ``int()`` (so a
    non-numeric value raises ``ValueError`` at that point).

    Each ``description`` is human-facing text attached to the field's
    metadata, so keep it accurate when a default changes.
    """

    daily_report_frequency: int = Field(
        default=int(
            os.getenv(
                "SLACK_DAILY_REPORT_FREQUENCY",
                SlackAlertingArgsEnum.daily_report_frequency.value,
            )
        ),
        description="Frequency of receiving deployment latency/failure reports. Default is 12 hours. Value is in seconds.",
    )
    report_check_interval: int = Field(
        default=SlackAlertingArgsEnum.report_check_interval.value,
        # The default is 300 seconds; the description previously claimed "once per hour".
        description="Frequency of checking cache if report should be sent. Background process. Default is every 5 minutes. Value is in seconds.",
    )  # 5 minutes
    budget_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.budget_alert_ttl.value,
        description="Cache ttl for budget alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.",
    )  # 24 hours
    outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.outage_alert_ttl.value,
        description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    region_outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.region_outage_alert_ttl.value,
        description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    minor_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value,
        description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).",
    )
    major_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.major_outage_alert_threshold.value,
        description="The number of errors that count as a model/region major outage. ('400' error code is not counted).",
    )
    max_outage_alert_list_size: int = Field(
        default=SlackAlertingArgsEnum.max_outage_alert_list_size.value,
        description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.",
    )  # prevent memory leak
class DeploymentMetrics(LiteLLMBase): class DeploymentMetrics(LiteLLMBase):

View file

@ -800,6 +800,7 @@ class ConfigList(LiteLLMBase):
field_description: str field_description: str
field_value: Any field_value: Any
stored_in_db: Optional[bool] stored_in_db: Optional[bool]
field_default_value: Any
class ConfigGeneralSettings(LiteLLMBase): class ConfigGeneralSettings(LiteLLMBase):
@ -877,7 +878,9 @@ class ConfigGeneralSettings(LiteLLMBase):
None, None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`", description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
) )
alerting_args: Optional[Dict] = Field(
None, description="Controllable params for slack alerting - e.g. ttl in cache."
)
alerting_threshold: Optional[int] = Field( alerting_threshold: Optional[int] = Field(
None, None,
description="sends alerts if requests hang for 5min+", description="sends alerts if requests hang for 5min+",

View file

@ -125,6 +125,7 @@ from litellm.proxy.auth.auth_checks import (
) )
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError from litellm.exceptions import RejectedRequestError
from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting
try: try:
from litellm._version import version from litellm._version import version
@ -3049,6 +3050,13 @@ class ProxyConfig:
"global_max_parallel_requests" "global_max_parallel_requests"
] ]
## ALERTING ARGS ##
if "alerting_args" in _general_settings:
general_settings["alerting_args"] = _general_settings["alerting_args"]
proxy_logging_obj.slack_alerting_instance.update_values(
alerting_args=general_settings["alerting_args"],
)
async def add_deployment( async def add_deployment(
self, self,
prisma_client: PrismaClient, prisma_client: PrismaClient,
@ -8894,6 +8902,7 @@ async def budget_settings(
field_description=field_info.description or "", field_description=field_info.description or "",
field_value=db_budget_row_dict.get(field_name, None), field_value=db_budget_row_dict.get(field_name, None),
stored_in_db=_stored_in_db, stored_in_db=_stored_in_db,
field_default_value=field_info.default,
) )
return_val.append(_response_obj) return_val.append(_response_obj)
@ -9791,6 +9800,149 @@ async def model_settings():
return returned_list return returned_list
#### ALERTING MANAGEMENT ENDPOINTS ####
@router.get(
    "/alerting/settings",
    description="Return the configurable alerting param, description, and current value",
    tags=["alerting"],
    dependencies=[Depends(user_api_key_auth)],
    include_in_schema=False,
)
async def alerting_settings(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Used by UI to generate 'alerting settings' page.

    Returns a list with one `ConfigList` entry per configurable
    `SlackAlertingArgs` field:
    {
        field_name=field_name,
        field_type=allowed_args[field_name]["type"], # string/int
        field_description=field_info.description or "", # human-friendly description
        field_value=<value currently held by the slack alerting instance>,
        stored_in_db=<True if the field is present under `alerting_args` in the db general settings>,
        field_default_value=field_info.default,
    }

    Raises:
        HTTPException (400): no db is connected, or the caller is not a `proxy_admin`.
    """
    global proxy_logging_obj, prisma_client

    if prisma_client is None:
        raise HTTPException(
            status_code=400,
            detail={"error": CommonProxyErrors.db_not_connected_error.value},
        )

    if user_api_key_dict.user_role != "proxy_admin":
        raise HTTPException(
            status_code=400,
            detail={
                "error": "{}, your role={}".format(
                    CommonProxyErrors.not_allowed_access.value,
                    user_api_key_dict.user_role,
                )
            },
        )

    ## get general settings from db
    db_general_settings = await prisma_client.db.litellm_config.find_first(
        where={"param_name": "general_settings"}
    )

    alerting_args_dict: dict = {}
    if db_general_settings is not None and db_general_settings.param_value is not None:
        db_general_settings_dict = dict(db_general_settings.param_value)
        # `alerting_args` can be present but null - treat that as "nothing stored"
        # instead of failing on the membership check below.
        alerting_args_dict = db_general_settings_dict.get("alerting_args") or {}  # type: ignore

    # fields the UI is allowed to show/edit, and how to render them
    allowed_args = {
        "daily_report_frequency": {"type": "Integer"},
        "report_check_interval": {"type": "Integer"},
        "budget_alert_ttl": {"type": "Integer"},
        "outage_alert_ttl": {"type": "Integer"},
        "region_outage_alert_ttl": {"type": "Integer"},
        "minor_outage_alert_threshold": {"type": "Integer"},
        "major_outage_alert_threshold": {"type": "Integer"},
        "max_outage_alert_list_size": {"type": "Integer"},
    }

    # current (in-memory) values come from the live slack alerting instance
    _slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance
    _slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump()

    return_val = []
    for field_name, field_info in SlackAlertingArgs.model_fields.items():
        if field_name not in allowed_args:
            continue

        return_val.append(
            ConfigList(
                field_name=field_name,
                field_type=allowed_args[field_name]["type"],
                field_description=field_info.description or "",
                field_value=_slack_alerting_args_dict.get(field_name, None),
                stored_in_db=field_name in alerting_args_dict,
                field_default_value=field_info.default,
            )
        )
    return return_val
# @router.post(
# "/alerting/update",
# description="Update the slack alerting settings. Persist value in db.",
# tags=["alerting"],
# dependencies=[Depends(user_api_key_auth)],
# include_in_schema=False,
# )
# async def alerting_update(
# data: SlackAlertingArgs,
# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
# ):
# """Allows updating slack alerting values. Used by UI."""
# global prisma_client
# if prisma_client is None:
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.db_not_connected_error.value},
# )
# if user_api_key_dict.user_role != "proxy_admin":
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.not_allowed_access.value},
# )
# ## get general settings from db
# db_general_settings = await prisma_client.db.litellm_config.find_first(
# where={"param_name": "general_settings"}
# )
# ### update value
# alerting_args_dict = {}
# if db_general_settings is None or db_general_settings.param_value is None:
# general_settings = {}
# alerting_args_dict = {}
# else:
# general_settings = dict(db_general_settings.param_value)
# _alerting_args_dict = general_settings.get("alerting_args", None)
# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict):
# alerting_args_dict = _alerting_args_dict
# alerting_args_dict = data.model
# response = await prisma_client.db.litellm_config.upsert(
# where={"param_name": "general_settings"},
# data={
# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore
# "update": {"param_value": json.dumps(general_settings)}, # type: ignore
# },
# )
# return response
#### EXPERIMENTAL QUEUING #### #### EXPERIMENTAL QUEUING ####
async def _litellm_chat_completions_worker(data, user_api_key_dict): async def _litellm_chat_completions_worker(data, user_api_key_dict):
""" """
@ -10934,6 +11086,7 @@ async def get_config_list(
field_description=field_info.description or "", field_description=field_info.description or "",
field_value=general_settings.get(field_name, None), field_value=general_settings.get(field_name, None),
stored_in_db=_stored_in_db, stored_in_db=_stored_in_db,
field_default_value=field_info.default,
) )
return_val.append(_response_obj) return_val.append(_response_obj)

View file

@ -92,7 +92,7 @@ class ProxyLogging:
"new_model_added", "new_model_added",
"outage_alerts", "outage_alerts",
] ]
self.slack_alerting_instance = SlackAlerting( self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold, alerting_threshold=self.alerting_threshold,
alerting=self.alerting, alerting=self.alerting,
alert_types=self.alert_types, alert_types=self.alert_types,

View file

@ -0,0 +1,123 @@
/**
* UI for controlling slack alerting settings
*/
import React, { useState, useEffect } from "react";
import {
Table,
TableHead,
TableRow,
TableHeaderCell,
TableCell,
Button,
Icon,
Badge,
TableBody,
Text,
} from "@tremor/react";
import { InputNumber, message } from "antd";
import { alertingSettingsCall, updateConfigFieldSetting } from "../networking";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import DynamicForm from "./dynamic_form";
/**
 * One configurable alerting parameter, as held in this component's state.
 */
interface alertingSettingsItem {
  /** Parameter name; used as the key when values are submitted. */
  field_name: string;
  /** Human-readable explanation shown next to the field. */
  field_description: string;
  /** Type label for the field. */
  field_type: string;
  /** Value restored when the field is reset. */
  field_default_value: any;
  /** Value currently shown / edited in the form. */
  field_value: any;
  /** Set to `null` locally once the field has been reset. */
  stored_in_db: boolean | null;
}

/**
 * Props for the alerting settings panel.
 */
interface AlertingSettingsProps {
  /** Bearer token for proxy calls; nothing is fetched or saved while it is null. */
  accessToken: string | null;
}
/**
 * Settings panel that loads the configurable slack alerting parameters,
 * keeps the edited values in local state, and submits them as the
 * `alerting_args` config field.
 */
const AlertingSettings: React.FC<AlertingSettingsProps> = ({ accessToken }) => {
  const [alertingSettings, setAlertingSettings] = useState<
    alertingSettingsItem[]
  >([]);

  useEffect(() => {
    // Nothing to fetch until an access token is available.
    if (!accessToken) {
      return;
    }
    alertingSettingsCall(accessToken)
      .then((data) => {
        setAlertingSettings(data);
      })
      .catch((error) => {
        // Handle the rejection here so a failed fetch does not surface as an
        // unhandled promise rejection.
        console.error("Failed to load alerting settings:", error);
      });
  }, [accessToken]);

  /** Record an edited value for `fieldName` in local state. */
  const handleInputChange = (fieldName: string, newValue: any) => {
    // Functional update: always derive from the latest state, not from the
    // snapshot captured when this handler was created.
    setAlertingSettings((previous) =>
      previous.map((setting) =>
        setting.field_name === fieldName
          ? { ...setting, field_value: newValue }
          : setting
      )
    );
  };

  /** Submit every field's current value as the `alerting_args` setting. */
  const handleSubmit = async (formValues: Record<string, any>) => {
    if (!accessToken || formValues == null) {
      return;
    }

    const stateValues: Record<string, any> = {};
    alertingSettings.forEach((setting) => {
      stateValues[setting.field_name] = setting.field_value;
    });

    // Values tracked in state win over whatever the form reported.
    const mergedFormValues = { ...formValues, ...stateValues };

    try {
      // Await the call so the success message is only shown once it has
      // actually completed, and so a failure reaches the catch below.
      await updateConfigFieldSetting(
        accessToken,
        "alerting_args",
        mergedFormValues
      );
      message.success("Wait 10s for proxy to update.");
    } catch (error) {
      console.error("Failed to update alerting settings:", error);
    }
  };

  /**
   * Put `fieldName` back to its default value in local state.
   * Only local state changes here; the second argument (row index) is unused.
   */
  const handleResetField = (fieldName: string, _idx: number) => {
    if (!accessToken) {
      return;
    }
    setAlertingSettings((previous) =>
      previous.map((setting) =>
        setting.field_name === fieldName
          ? {
              ...setting,
              stored_in_db: null,
              field_value: setting.field_default_value,
            }
          : setting
      )
    );
  };

  return (
    <DynamicForm
      alertingSettings={alertingSettings}
      handleInputChange={handleInputChange}
      handleResetField={handleResetField}
      handleSubmit={handleSubmit}
    />
  );
};
export default AlertingSettings;

View file

@ -0,0 +1,96 @@
import React from "react";
import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import {
  Button,
  Badge,
  Icon,
  Text,
  Table,
  TableBody,
  TableRow,
  TableCell,
} from "@tremor/react";
import Paragraph from "antd/es/typography/Paragraph";
/**
 * One row of the settings form.
 */
interface AlertingSetting {
  /** Parameter name, displayed as the row title. */
  field_name: string;
  /** `"Integer"` renders a numeric input; anything else renders a text input. */
  field_type: string;
  /** Small explanatory text shown under the name. */
  field_description: string;
  /** Value currently displayed in the input. */
  field_value: any;
  /** true -> "In DB" badge, false -> "In Config" badge, otherwise "Not Set". */
  stored_in_db: boolean | null;
}

/**
 * Props for the form; all state lives in the parent.
 */
interface DynamicFormProps {
  /** Rows to render. */
  alertingSettings: AlertingSetting[];
  /** Called when the "Update Settings" button submits the form. */
  handleSubmit: (formValues: Record<string, any>) => void;
  /** Called whenever an input's value changes. */
  handleInputChange: (fieldName: string, newValue: any) => void;
  /** Called when a row's reset icon is clicked. */
  handleResetField: (fieldName: string, index: number) => void;
}
/**
 * Renders one table row per setting (name + description, input, storage
 * badge, reset icon) followed by a submit button. The inputs are controlled
 * by the `alertingSettings` prop: edits are reported via `handleInputChange`
 * and the parent is expected to pass the updated values back down.
 */
const DynamicForm: React.FC<DynamicFormProps> = ({
  alertingSettings,
  handleInputChange,
  handleResetField,
  handleSubmit,
}) => {
  const [form] = Form.useForm();

  const onFinish = () => {
    // The inputs are driven by props rather than by the antd form store, so
    // build the submitted values from what is actually displayed.
    const formData = alertingSettings.reduce<Record<string, any>>(
      (values, setting) => {
        values[setting.field_name] = setting.field_value;
        return values;
      },
      {}
    );
    handleSubmit(formData);
  };

  /** Badge describing where the value comes from. */
  const renderStorageBadge = (storedInDb: boolean | null) => {
    if (storedInDb === true) {
      return (
        <Badge icon={CheckCircleIcon} className="text-white">
          In DB
        </Badge>
      );
    }
    if (storedInDb === false) {
      return <Badge className="text-gray bg-white outline">In Config</Badge>;
    }
    return <Badge className="text-gray bg-white outline">Not Set</Badge>;
  };

  return (
    <Form form={form} onFinish={onFinish} labelAlign="left">
      {/* Rows/cells need a table + tbody ancestor to be valid DOM. */}
      <Table>
        <TableBody>
          {alertingSettings.map((value, index) => (
            // field_name is stable across re-renders; the array index is not.
            <TableRow key={value.field_name}>
              <TableCell>
                <Text>{value.field_name}</Text>
                <p
                  style={{
                    fontSize: "0.65rem",
                    color: "#808080",
                    fontStyle: "italic",
                  }}
                  className="mt-1"
                >
                  {value.field_description}
                </p>
              </TableCell>
              <TableCell>
                {value.field_type === "Integer" ? (
                  <InputNumber
                    step={1}
                    value={value.field_value}
                    onChange={(newValue) =>
                      handleInputChange(value.field_name, newValue)
                    }
                  />
                ) : (
                  <Input
                    value={value.field_value}
                    // Input reports a change event; forward the text, not the event.
                    onChange={(e) =>
                      handleInputChange(value.field_name, e.target.value)
                    }
                  />
                )}
              </TableCell>
              <TableCell>{renderStorageBadge(value.stored_in_db)}</TableCell>
              <TableCell>
                <Icon
                  icon={TrashIcon}
                  color="red"
                  onClick={() => handleResetField(value.field_name, index)}
                >
                  Reset
                </Icon>
              </TableCell>
            </TableRow>
          ))}
        </TableBody>
      </Table>
      <div>
        <Button2 htmlType="submit">Update Settings</Button2>
      </div>
    </Form>
  );
};
export default DynamicForm;

View file

@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon
import AddFallbacks from "./add_fallbacks"; import AddFallbacks from "./add_fallbacks";
import openai from "openai"; import openai from "openai";
import Paragraph from "antd/es/skeleton/Paragraph"; import Paragraph from "antd/es/skeleton/Paragraph";
interface GeneralSettingsPageProps { interface GeneralSettingsPageProps {
accessToken: string | null; accessToken: string | null;
userRole: string | null; userRole: string | null;

View file

@ -207,6 +207,41 @@ export const budgetCreateCall = async (
throw error; throw error;
} }
}; };
export const alertingSettingsCall = async (accessToken: String) => {
  /**
   * Get all configurable slack alerting params from `/alerting/settings`.
   *
   * Resolves with the parsed JSON response body. On a non-OK response the
   * response text is shown as an error message and an Error is thrown; any
   * failure is logged and re-thrown to the caller.
   */
  try {
    const url = proxyBaseUrl
      ? `${proxyBaseUrl}/alerting/settings`
      : `/alerting/settings`;

    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });

    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData, 10);
      throw new Error("Network response was not ok");
    }

    const data = await response.json();
    return data;
  } catch (error) {
    console.error("Failed to get alerting settings:", error);
    throw error;
  }
};
export const keyCreateCall = async ( export const keyCreateCall = async (
accessToken: string, accessToken: string,
userID: string, userID: string,
@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async (
} }
}; };
export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => { export const adminspendByProvider = async (
accessToken: String,
keyToken: String | null,
startTime: String | undefined,
endTime: String | undefined
) => {
try { try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`; let url = proxyBaseUrl
? `${proxyBaseUrl}/global/spend/provider`
: `/global/spend/provider`;
if (startTime && endTime) { if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`; url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String
} }
}; };
export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { export const adminGlobalActivity = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try { try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`; let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity`
: `/global/activity`;
if (startTime && endTime) { if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`; url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String
} }
}; };
export const adminGlobalActivityPerModel = async (
export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try { try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`; let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity/model`
: `/global/activity/model`;
if (startTime && endTime) { if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`; url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime
} }
}; };
export const adminTopModelsCall = async (accessToken: String) => { export const adminTopModelsCall = async (accessToken: String) => {
try { try {
let url = proxyBaseUrl let url = proxyBaseUrl

View file

@ -31,7 +31,7 @@ import {
} from "./networking"; } from "./networking";
import { Modal, Form, Input, Select, Button as Button2, message } from "antd"; import { Modal, Form, Input, Select, Button as Button2, message } from "antd";
import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider"; import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
import AlertingSettings from "./alerting/alerting_settings";
interface SettingsPageProps { interface SettingsPageProps {
accessToken: string | null; accessToken: string | null;
userRole: string | null; userRole: string | null;
@ -117,6 +117,7 @@ const Settings: React.FC<SettingsPageProps> = ({
db_exceptions: "Database Exceptions (Read/Write)", db_exceptions: "Database Exceptions (Read/Write)",
daily_reports: "Weekly/Monthly Spend Reports", daily_reports: "Weekly/Monthly Spend Reports",
outage_alerts: "Outage Alerts", outage_alerts: "Outage Alerts",
region_outage_alerts: "Region Outage Alerts",
}; };
useEffect(() => { useEffect(() => {
@ -365,7 +366,8 @@ const Settings: React.FC<SettingsPageProps> = ({
<TabGroup> <TabGroup>
<TabList variant="line" defaultValue="1"> <TabList variant="line" defaultValue="1">
<Tab value="1">Logging Callbacks</Tab> <Tab value="1">Logging Callbacks</Tab>
<Tab value="2">Alerting</Tab> <Tab value="2">Alerting Types</Tab>
<Tab value="2">Alerting Settings</Tab>
</TabList> </TabList>
<TabPanels> <TabPanels>
<TabPanel> <TabPanel>
@ -496,6 +498,9 @@ const Settings: React.FC<SettingsPageProps> = ({
</Button> </Button>
</Card> </Card>
</TabPanel> </TabPanel>
<TabPanel>
<AlertingSettings accessToken={accessToken} />
</TabPanel>
</TabPanels> </TabPanels>
</TabGroup> </TabGroup>
</Grid> </Grid>