feat(ui): allow admin to configure slack alerting thresholds on ui

This commit is contained in:
Krrish Dholakia 2024-05-25 21:01:19 -07:00
parent c2f19d631e
commit a9a1447513
9 changed files with 492 additions and 24 deletions

View file

@ -10,7 +10,7 @@ import asyncio, time
import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
from pydantic import BaseModel
from pydantic import BaseModel, Field
from enum import Enum
from datetime import datetime as dt, timedelta, timezone
from litellm.integrations.custom_logger import CustomLogger
@ -60,18 +60,55 @@ class LiteLLMBase(BaseModel):
return self.dict()
class SlackAlertingArgsEnum(Enum):
    """Built-in default values for the Slack alerting settings.

    Time-based members are expressed in seconds; threshold/size members are
    plain counts.

    NOTE: members that share a value are Enum aliases of the first member
    defined with that value (e.g. ``region_outage_alert_ttl`` is an alias of
    ``outage_alert_ttl``). Reading ``.value`` is unaffected, but iterating the
    enum or reading ``.name`` only surfaces the canonical member.
    """

    daily_report_frequency: int = 12 * 60 * 60  # 12 hours
    report_check_interval: int = 5 * 60  # 5 minutes
    budget_alert_ttl: int = 24 * 60 * 60  # 24 hours
    outage_alert_ttl: int = 1 * 60  # 1 minute
    region_outage_alert_ttl: int = 1 * 60  # 1 minute
    minor_outage_alert_threshold: int = 1 * 5
    major_outage_alert_threshold: int = 1 * 10
    max_outage_alert_list_size: int = 1 * 10
class SlackAlertingArgs(LiteLLMBase):
    """Tunable parameters for Slack alerting.

    Every field defaults to the matching ``SlackAlertingArgsEnum`` member.
    ``daily_report_frequency`` can additionally be overridden through the
    ``SLACK_DAILY_REPORT_FREQUENCY`` environment variable, which is read once
    when this class is defined.
    """

    daily_report_frequency: int = Field(
        default=int(
            os.getenv(
                "SLACK_DAILY_REPORT_FREQUENCY",
                SlackAlertingArgsEnum.daily_report_frequency.value,
            )
        ),
        description="Frequency of receiving deployment latency/failure reports. Default is 12hours. Value is in seconds.",
    )
    report_check_interval: int = Field(
        default=SlackAlertingArgsEnum.report_check_interval.value,
        # The default is 5 * 60 seconds, so the description must say 5 minutes.
        description="Frequency of checking cache if report should be sent. Background process. Default is once every 5 minutes. Value is in seconds.",
    )  # 5 minutes
    budget_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.budget_alert_ttl.value,
        description="Cache ttl for budgets alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.",
    )  # 24 hours
    outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.outage_alert_ttl.value,
        description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    region_outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.region_outage_alert_ttl.value,
        description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    minor_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value,
        description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).",
    )
    major_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.major_outage_alert_threshold.value,
        description="The number of errors that count as a model/region major outage. ('400' error code is not counted).",
    )
    max_outage_alert_list_size: int = Field(
        default=SlackAlertingArgsEnum.max_outage_alert_list_size.value,
        description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.",
    )  # prevent memory leak
class DeploymentMetrics(LiteLLMBase):

View file

@ -800,6 +800,7 @@ class ConfigList(LiteLLMBase):
field_description: str
field_value: Any
stored_in_db: Optional[bool]
field_default_value: Any
class ConfigGeneralSettings(LiteLLMBase):
@ -877,7 +878,9 @@ class ConfigGeneralSettings(LiteLLMBase):
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_args: Optional[Dict] = Field(
None, description="Controllable params for slack alerting - e.g. ttl in cache."
)
alerting_threshold: Optional[int] = Field(
None,
description="sends alerts if requests hang for 5min+",

View file

@ -125,6 +125,7 @@ from litellm.proxy.auth.auth_checks import (
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting
try:
from litellm._version import version
@ -3049,6 +3050,13 @@ class ProxyConfig:
"global_max_parallel_requests"
]
## ALERTING ARGS ##
if "alerting_args" in _general_settings:
general_settings["alerting_args"] = _general_settings["alerting_args"]
proxy_logging_obj.slack_alerting_instance.update_values(
alerting_args=general_settings["alerting_args"],
)
async def add_deployment(
self,
prisma_client: PrismaClient,
@ -8894,6 +8902,7 @@ async def budget_settings(
field_description=field_info.description or "",
field_value=db_budget_row_dict.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)
@ -9791,6 +9800,149 @@ async def model_settings():
return returned_list
#### ALERTING MANAGEMENT ENDPOINTS ####
@router.get(
    "/alerting/settings",
    description="Return the configurable alerting param, description, and current value",
    tags=["alerting"],
    dependencies=[Depends(user_api_key_auth)],
    include_in_schema=False,
)
async def alerting_settings(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Used by UI to generate 'alerting settings' page.

    Returns a list of `ConfigList` items, one per configurable
    `SlackAlertingArgs` field:

    {
        field_name=field_name,
        field_type=allowed_args[field_name]["type"], # "Integer"
        field_description=field_info.description or "", # human-friendly description
        field_value=<current value on the running slack alerting instance>,
        stored_in_db=<True if the field is present in the db's `alerting_args`>,
        field_default_value=field_info.default,
    }

    Raises:
        HTTPException (400): if no db is connected, or the caller is not a
            `proxy_admin`.
    """
    global proxy_logging_obj, prisma_client

    if prisma_client is None:
        raise HTTPException(
            status_code=400,
            detail={"error": CommonProxyErrors.db_not_connected_error.value},
        )

    if user_api_key_dict.user_role != "proxy_admin":
        raise HTTPException(
            status_code=400,
            detail={
                "error": "{}, your role={}".format(
                    CommonProxyErrors.not_allowed_access.value,
                    user_api_key_dict.user_role,
                )
            },
        )

    ## get general settings from db
    db_general_settings = await prisma_client.db.litellm_config.find_first(
        where={"param_name": "general_settings"}
    )

    alerting_args_dict: dict = {}
    if db_general_settings is not None and db_general_settings.param_value is not None:
        db_general_settings_dict = dict(db_general_settings.param_value)
        _db_alerting_args = db_general_settings_dict.get("alerting_args")
        # `alerting_args` may be stored as null / a non-dict value; treat that
        # as "nothing stored" instead of failing on the membership test below.
        if isinstance(_db_alerting_args, dict):
            alerting_args_dict = _db_alerting_args

    # Fields the UI is allowed to display/edit, with the type it should render.
    allowed_args = {
        "daily_report_frequency": {"type": "Integer"},
        "report_check_interval": {"type": "Integer"},
        "budget_alert_ttl": {"type": "Integer"},
        "outage_alert_ttl": {"type": "Integer"},
        "region_outage_alert_ttl": {"type": "Integer"},
        "minor_outage_alert_threshold": {"type": "Integer"},
        "major_outage_alert_threshold": {"type": "Integer"},
        "max_outage_alert_list_size": {"type": "Integer"},
    }

    # Current values are read from the live alerting instance, not from the db.
    _slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance
    _slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump()

    return_val = []
    for field_name, field_info in SlackAlertingArgs.model_fields.items():
        if field_name not in allowed_args:
            continue

        _stored_in_db: Optional[bool] = field_name in alerting_args_dict

        _response_obj = ConfigList(
            field_name=field_name,
            field_type=allowed_args[field_name]["type"],
            field_description=field_info.description or "",
            field_value=_slack_alerting_args_dict.get(field_name, None),
            stored_in_db=_stored_in_db,
            field_default_value=field_info.default,
        )
        return_val.append(_response_obj)

    return return_val
# @router.post(
# "/alerting/update",
# description="Update the slack alerting settings. Persist value in db.",
# tags=["alerting"],
# dependencies=[Depends(user_api_key_auth)],
# include_in_schema=False,
# )
# async def alerting_update(
# data: SlackAlertingArgs,
# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
# ):
# """Allows updating slack alerting values. Used by UI."""
# global prisma_client
# if prisma_client is None:
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.db_not_connected_error.value},
# )
# if user_api_key_dict.user_role != "proxy_admin":
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.not_allowed_access.value},
# )
# ## get general settings from db
# db_general_settings = await prisma_client.db.litellm_config.find_first(
# where={"param_name": "general_settings"}
# )
# ### update value
# alerting_args_dict = {}
# if db_general_settings is None or db_general_settings.param_value is None:
# general_settings = {}
# alerting_args_dict = {}
# else:
# general_settings = dict(db_general_settings.param_value)
# _alerting_args_dict = general_settings.get("alerting_args", None)
# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict):
# alerting_args_dict = _alerting_args_dict
# alerting_args_dict = data.model
# response = await prisma_client.db.litellm_config.upsert(
# where={"param_name": "general_settings"},
# data={
# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore
# "update": {"param_value": json.dumps(general_settings)}, # type: ignore
# },
# )
# return response
#### EXPERIMENTAL QUEUING ####
async def _litellm_chat_completions_worker(data, user_api_key_dict):
"""
@ -10934,6 +11086,7 @@ async def get_config_list(
field_description=field_info.description or "",
field_value=general_settings.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)

View file

@ -92,7 +92,7 @@ class ProxyLogging:
"new_model_added",
"outage_alerts",
]
self.slack_alerting_instance = SlackAlerting(
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold,
alerting=self.alerting,
alert_types=self.alert_types,

View file

@ -0,0 +1,123 @@
/**
* UI for controlling slack alerting settings
*/
import React, { useState, useEffect } from "react";
import {
Table,
TableHead,
TableRow,
TableHeaderCell,
TableCell,
Button,
Icon,
Badge,
TableBody,
Text,
} from "@tremor/react";
import { InputNumber, message } from "antd";
import { alertingSettingsCall, updateConfigFieldSetting } from "../networking";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import DynamicForm from "./dynamic_form";
/** One configurable alerting field, as returned by `alertingSettingsCall`. */
interface alertingSettingsItem {
  field_name: string;
  field_type: string;
  field_value: any;
  field_default_value: any;
  field_description: string;
  stored_in_db: boolean | null;
}

interface AlertingSettingsProps {
  accessToken: string | null;
}

/**
 * Loads the alerting settings for the given access token, keeps the edited
 * values in local state, and submits them as the `alerting_args` config field.
 */
const AlertingSettings: React.FC<AlertingSettingsProps> = ({ accessToken }) => {
  const [alertingSettings, setAlertingSettings] = useState<
    alertingSettingsItem[]
  >([]);

  useEffect(() => {
    // get values
    if (!accessToken) {
      return;
    }
    alertingSettingsCall(accessToken)
      .then((data) => {
        setAlertingSettings(data);
      })
      .catch((error) => {
        // alertingSettingsCall re-throws after reporting; handle the rejection
        // here so it does not surface as an unhandled promise rejection.
        console.error("Failed to load alerting settings:", error);
      });
  }, [accessToken]);

  /** Store the new value for `fieldName` in local state. */
  const handleInputChange = (fieldName: string, newValue: any) => {
    // Update the value in the state
    const updatedSettings = alertingSettings.map((setting) =>
      setting.field_name === fieldName
        ? { ...setting, field_value: newValue }
        : setting
    );

    setAlertingSettings(updatedSettings);
  };

  /** Submit the current settings as the `alerting_args` config field. */
  const handleSubmit = async (formValues: Record<string, any>) => {
    if (!accessToken) {
      return;
    }

    if (formValues == null) {
      return;
    }

    const initialFormValues: Record<string, any> = {};
    alertingSettings.forEach((setting) => {
      initialFormValues[setting.field_name] = setting.field_value;
    });

    // Merge form values with the values held in state. State values are
    // spread last, so they take precedence over `formValues`.
    const mergedFormValues = { ...formValues, ...initialFormValues };
    try {
      // Await the call so a failure is caught below and the success message
      // is only shown once the update actually went through.
      await updateConfigFieldSetting(
        accessToken,
        "alerting_args",
        mergedFormValues
      );
      message.success("Wait 10s for proxy to update.");
    } catch (error) {
      console.error("Failed to update alerting settings:", error);
    }
  };

  /**
   * Reset `fieldName` to its default value in local state only
   * (nothing is sent to the proxy here).
   */
  const handleResetField = (fieldName: string, idx: number) => {
    if (!accessToken) {
      return;
    }

    // deleteConfigFieldSetting(accessToken, fieldName);
    // update value in state
    const updatedSettings = alertingSettings.map((setting) =>
      setting.field_name === fieldName
        ? {
            ...setting,
            stored_in_db: null,
            field_value: setting.field_default_value,
          }
        : setting
    );
    setAlertingSettings(updatedSettings);
  };

  return (
    <DynamicForm
      alertingSettings={alertingSettings}
      handleInputChange={handleInputChange}
      handleResetField={handleResetField}
      handleSubmit={handleSubmit}
    />
  );
};
export default AlertingSettings;

View file

@ -0,0 +1,96 @@
import React from "react";
import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react";
import Paragraph from "antd/es/typography/Paragraph";
/** One row of the alerting settings form. */
interface AlertingSetting {
  field_name: string;
  field_description: string;
  field_type: string;
  field_value: any;
  stored_in_db: boolean | null;
}

interface DynamicFormProps {
  alertingSettings: AlertingSetting[];
  handleInputChange: (fieldName: string, newValue: any) => void;
  handleResetField: (fieldName: string, index: number) => void;
  handleSubmit: (formValues: Record<string, any>) => void;
}

/**
 * Renders one editable row per alerting setting plus an "Update Settings"
 * submit button. Input values are taken from the `alertingSettings` prop and
 * every change is reported through `handleInputChange`.
 */
const DynamicForm: React.FC<DynamicFormProps> = ({
  alertingSettings,
  handleInputChange,
  handleResetField,
  handleSubmit,
}) => {
  const [form] = Form.useForm();

  const onFinish = () => {
    const formData = form.getFieldsValue();
    handleSubmit(formData);
  };

  return (
    <Form form={form} onFinish={onFinish} labelAlign="left">
      {alertingSettings.map((value, index) => (
        <TableRow key={value.field_name}>
          <TableCell>
            <Text>{value.field_name}</Text>
            <p
              style={{
                fontSize: "0.65rem",
                color: "#808080",
                fontStyle: "italic",
              }}
              className="mt-1"
            >
              {value.field_description}
            </p>
          </TableCell>
          <Form.Item name={value.field_name}>
            <TableCell>
              {value.field_type === "Integer" ? (
                <InputNumber
                  step={1}
                  value={value.field_value}
                  onChange={(e) => handleInputChange(value.field_name, e)}
                />
              ) : (
                <Input
                  value={value.field_value}
                  // Input's onChange receives a change event, not the value;
                  // forward the text so state never holds the event object.
                  onChange={(e) =>
                    handleInputChange(value.field_name, e.target.value)
                  }
                />
              )}
            </TableCell>
          </Form.Item>
          <TableCell>
            {value.stored_in_db === true ? (
              <Badge icon={CheckCircleIcon} className="text-white">
                In DB
              </Badge>
            ) : value.stored_in_db === false ? (
              <Badge className="text-gray bg-white outline">In Config</Badge>
            ) : (
              <Badge className="text-gray bg-white outline">Not Set</Badge>
            )}
          </TableCell>
          <TableCell>
            <Icon
              icon={TrashIcon}
              color="red"
              onClick={() => handleResetField(value.field_name, index)}
            >
              Reset
            </Icon>
          </TableCell>
        </TableRow>
      ))}
      <div>
        <Button2 htmlType="submit">Update Settings</Button2>
      </div>
    </Form>
  );
};
export default DynamicForm;

View file

@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon
import AddFallbacks from "./add_fallbacks";
import openai from "openai";
import Paragraph from "antd/es/skeleton/Paragraph";
interface GeneralSettingsPageProps {
accessToken: string | null;
userRole: string | null;

View file

@ -207,6 +207,41 @@ export const budgetCreateCall = async (
throw error;
}
};
export const alertingSettingsCall = async (accessToken: String) => {
  /**
   * Get the configurable alerting settings from `GET /alerting/settings`.
   *
   * Resolves with the parsed JSON response body. On a non-OK response the
   * response text is shown via `message.error` and an Error is thrown; any
   * failure is logged and re-thrown to the caller.
   */
  try {
    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/alerting/settings`
      : `/alerting/settings`;

    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });

    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData, 10);
      throw new Error("Network response was not ok");
    }

    const data = await response.json();
    return data;
  } catch (error) {
    console.error("Failed to get alerting settings:", error);
    throw error;
  }
};
export const keyCreateCall = async (
accessToken: string,
userID: string,
@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async (
}
};
export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => {
export const adminspendByProvider = async (
accessToken: String,
keyToken: String | null,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/spend/provider`
: `/global/spend/provider`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String
}
};
export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivity = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity`
: `/global/activity`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String
}
};
export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivityPerModel = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity/model`
: `/global/activity/model`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime
}
};
export const adminTopModelsCall = async (accessToken: String) => {
try {
let url = proxyBaseUrl

View file

@ -31,7 +31,7 @@ import {
} from "./networking";
import { Modal, Form, Input, Select, Button as Button2, message } from "antd";
import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
import AlertingSettings from "./alerting/alerting_settings";
interface SettingsPageProps {
accessToken: string | null;
userRole: string | null;
@ -117,6 +117,7 @@ const Settings: React.FC<SettingsPageProps> = ({
db_exceptions: "Database Exceptions (Read/Write)",
daily_reports: "Weekly/Monthly Spend Reports",
outage_alerts: "Outage Alerts",
region_outage_alerts: "Region Outage Alerts",
};
useEffect(() => {
@ -365,7 +366,8 @@ const Settings: React.FC<SettingsPageProps> = ({
<TabGroup>
<TabList variant="line" defaultValue="1">
<Tab value="1">Logging Callbacks</Tab>
<Tab value="2">Alerting</Tab>
<Tab value="2">Alerting Types</Tab>
<Tab value="2">Alerting Settings</Tab>
</TabList>
<TabPanels>
<TabPanel>
@ -496,6 +498,9 @@ const Settings: React.FC<SettingsPageProps> = ({
</Button>
</Card>
</TabPanel>
<TabPanel>
<AlertingSettings accessToken={accessToken} />
</TabPanel>
</TabPanels>
</TabGroup>
</Grid>