Merge pull request #3844 from BerriAI/litellm_region_based_alerts

feat(slack_alerting.py): enable provider-region based alerting
This commit is contained in:
Krish Dholakia 2024-05-25 21:03:16 -07:00 committed by GitHub
commit 960fa8b326
10 changed files with 809 additions and 100 deletions
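For context, this PR's own tests opt into the new alert type as shown below - a minimal sketch mirroring the test setup in test_region_outage_alerting_called (the "webhook" alerting mode is taken from the test, not a recommended production config):

import litellm
from litellm.integrations.slack_alerting import SlackAlerting

# opt in to the provider-region based alerting added by this PR
slack_alerting = SlackAlerting(
    alerting=["webhook"],
    alert_types=["region_outage_alerts"],
)
litellm.callbacks = [slack_alerting]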

View file

@ -4,13 +4,13 @@ import dotenv, os, traceback
from litellm.proxy._types import UserAPIKeyAuth, CallInfo, AlertType
from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm, threading
from typing import List, Literal, Any, Union, Optional, Dict
from typing import List, Literal, Any, Union, Optional, Dict, Set
from litellm.caching import DualCache
import asyncio, time
import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
from pydantic import BaseModel
from pydantic import BaseModel, Field
from enum import Enum
from datetime import datetime as dt, timedelta, timezone
from litellm.integrations.custom_logger import CustomLogger
@ -20,17 +20,25 @@ from typing import TypedDict
from openai import APIError
import litellm.types
import litellm.types.router
from litellm.types.router import LiteLLM_Params
class OutageModel(TypedDict):
model_id: str
class BaseOutageModel(TypedDict):
alerts: List[int]
deployment_ids: List[str]
minor_alert_sent: bool
major_alert_sent: bool
last_updated_at: float
class OutageModel(BaseOutageModel):
model_id: str
class ProviderRegionOutageModel(BaseOutageModel):
provider_region_id: str
deployment_ids: Set[str]
# we use this for the email header; if you change this, send a test email and verify it looks good
LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
EMAIL_LOGO_URL = os.getenv(
@ -52,17 +60,55 @@ class LiteLLMBase(BaseModel):
return self.dict()
class SlackAlertingArgsEnum(Enum):
daily_report_frequency: int = 12 * 60 * 60
report_check_interval: int = 5 * 60
budget_alert_ttl: int = 24 * 60 * 60
outage_alert_ttl: int = 1 * 60
region_outage_alert_ttl: int = 1 * 60
minor_outage_alert_threshold: int = 1 * 5
major_outage_alert_threshold: int = 1 * 10
max_outage_alert_list_size: int = 1 * 10
class SlackAlertingArgs(LiteLLMBase):
default_daily_report_frequency: int = 12 * 60 * 60 # 12 hours
daily_report_frequency: int = int(
os.getenv("SLACK_DAILY_REPORT_FREQUENCY", default_daily_report_frequency)
daily_report_frequency: int = Field(
default=int(
os.getenv(
"SLACK_DAILY_REPORT_FREQUENCY",
SlackAlertingArgsEnum.daily_report_frequency.value,
)
),
description="Frequency of receiving deployment latency/failure reports. Default is 12hours. Value is in seconds.",
)
report_check_interval: int = 5 * 60 # 5 minutes
budget_alert_ttl: int = 24 * 60 * 60 # 24 hours
outage_alert_ttl: int = 1 * 60 # 1 minute ttl
minor_outage_alert_threshold: int = 5
major_outage_alert_threshold: int = 10
max_outage_alert_list_size: int = 10 # prevent memory leak
report_check_interval: int = Field(
default=SlackAlertingArgsEnum.report_check_interval.value,
description="Frequency of checking cache if report should be sent. Background process. Default is 5 minutes. Value is in seconds.",
) # 5 minutes
budget_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.budget_alert_ttl.value,
description="Cache ttl for budgets alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.",
) # 24 hours
outage_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.outage_alert_ttl.value,
description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
) # 1 minute ttl
region_outage_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.region_outage_alert_ttl.value,
description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
) # 1 minute ttl
minor_outage_alert_threshold: int = Field(
default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value,
description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).",
)
major_outage_alert_threshold: int = Field(
default=SlackAlertingArgsEnum.major_outage_alert_threshold.value,
description="The number of errors that countas a model/region major outage. ('400' error code is not counted).",
)
max_outage_alert_list_size: int = Field(
default=SlackAlertingArgsEnum.max_outage_alert_list_size.value,
description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.",
) # prevent memory leak
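Because the args are now pydantic Fields, they can also be overridden programmatically. A minimal sketch, assuming SlackAlerting's constructor defaults and the SlackAlertingArgsEnum values above (the override values here are illustrative only):

from litellm.integrations.slack_alerting import SlackAlerting, SlackAlertingArgs

args = SlackAlertingArgs(
    region_outage_alert_ttl=120,  # widen the error time-window to 2 minutes
    minor_outage_alert_threshold=3,  # fire the minor alert after 3 non-400 errors
)
slack_alerting = SlackAlerting()
slack_alerting.update_values(alerting_args=args.model_dump())  # same path the proxy config uses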
class DeploymentMetrics(LiteLLMBase):
@ -736,6 +782,163 @@ class SlackAlerting(CustomLogger):
return error_msg
def _outage_alert_msg_factory(
self,
alert_type: Literal["Major", "Minor"],
key: Literal["Model", "Region"],
key_val: str,
provider: str,
api_base: Optional[str],
outage_value: BaseOutageModel,
) -> str:
"""Format an alert message for slack"""
headers = {f"{key} Name": key_val, "Provider": provider}
if api_base is not None:
headers["API Base"] = api_base # type: ignore
headers_str = "\n"
for k, v in headers.items():
headers_str += f"*{k}:* `{v}`\n"
return f"""\n\n
* {alert_type} Service Outage*
{headers_str}
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
async def region_outage_alerts(
self,
exception: APIError,
deployment_id: str,
) -> None:
"""
Send slack alert if specific provider region is having an outage.
Track for 408 (Timeout) and >=500 Error codes
"""
## CREATE (PROVIDER+REGION) ID ##
if self.llm_router is None:
return
deployment = self.llm_router.get_deployment(model_id=deployment_id)
if deployment is None:
return
model = deployment.litellm_params.model
### GET PROVIDER ###
provider = deployment.litellm_params.custom_llm_provider
if provider is None:
model, provider, _, _ = litellm.get_llm_provider(model=model)
### GET REGION ###
region_name = deployment.litellm_params.region_name
if region_name is None:
region_name = litellm.utils._get_model_region(
custom_llm_provider=provider, litellm_params=deployment.litellm_params
)
if region_name is None:
return
### UNIQUE CACHE KEY ###
cache_key = provider + region_name
outage_value: Optional[ProviderRegionOutageModel] = (
await self.internal_usage_cache.async_get_cache(key=cache_key)
)
if (
getattr(exception, "status_code", None) is None
or (
exception.status_code != 408 # type: ignore
and exception.status_code < 500 # type: ignore
)
or self.llm_router is None
):
return
if outage_value is None:
_deployment_set = set()
_deployment_set.add(deployment_id)
outage_value = ProviderRegionOutageModel(
provider_region_id=cache_key,
alerts=[exception.status_code], # type: ignore
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
deployment_ids=_deployment_set,
)
## add to cache ##
await self.internal_usage_cache.async_set_cache(
key=cache_key,
value=outage_value,
ttl=self.alerting_args.region_outage_alert_ttl,
)
return
if len(outage_value["alerts"]) < self.alerting_args.max_outage_alert_list_size:
outage_value["alerts"].append(exception.status_code) # type: ignore
else: # prevent memory leaks
pass
_deployment_set = outage_value["deployment_ids"]
_deployment_set.add(deployment_id)
outage_value["deployment_ids"] = _deployment_set
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
if (
outage_value["minor_alert_sent"] == False
and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold
and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
):
msg = self._outage_alert_msg_factory(
alert_type="Minor",
key="Region",
key_val=region_name,
api_base=None,
outage_value=outage_value,
provider=provider,
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
)
# set to true
outage_value["minor_alert_sent"] = True
## MAJOR OUTAGE ALERT SENT ##
elif (
outage_value["major_alert_sent"] == False
and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold
and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
):
msg = self._outage_alert_msg_factory(
alert_type="Major",
key="Region",
key_val=region_name,
api_base=None,
outage_value=outage_value,
provider=provider,
)
# send major alert
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
# set to true
outage_value["major_alert_sent"] = True
## update cache ##
await self.internal_usage_cache.async_set_cache(
key=cache_key, value=outage_value
)
async def outage_alerts(
self,
exception: APIError,
@ -787,7 +990,6 @@ class SlackAlerting(CustomLogger):
outage_value = OutageModel(
model_id=deployment_id,
alerts=[exception.status_code], # type: ignore
deployment_ids=[deployment_id],
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
@ -801,8 +1003,14 @@ class SlackAlerting(CustomLogger):
)
return
outage_value["alerts"].append(exception.status_code) # type: ignore
outage_value["deployment_ids"].append(deployment_id)
if (
len(outage_value["alerts"])
< self.alerting_args.max_outage_alert_list_size
):
outage_value["alerts"].append(exception.status_code) # type: ignore
else: # prevent memory leaks
pass
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
@ -811,25 +1019,18 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold
):
msg = f"""\n\n
* Minor Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
msg = self._outage_alert_msg_factory(
alert_type="Minor",
key="Model",
key_val=model,
api_base=api_base,
outage_value=outage_value,
provider=provider,
)
# send minor alert
_result_val = self.send_alert(
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
)
if _result_val is not None:
await _result_val
# set to true
outage_value["minor_alert_sent"] = True
elif (
@ -837,19 +1038,14 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold
):
msg = f"""\n\n
* Major Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
msg = self._outage_alert_msg_factory(
alert_type="Major",
key="Model",
key_val=model,
api_base=api_base,
outage_value=outage_value,
provider=provider,
)
# send major alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
@ -1103,18 +1299,7 @@ Model Info:
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"daily_reports",
"spend_reports",
"new_model_added",
"cooldown_deployment",
"outage_alerts",
],
alert_type: AlertType,
user_info: Optional[WebhookEvent] = None,
**kwargs,
):
@ -1254,34 +1439,17 @@ Model Info:
except Exception as e:
verbose_logger.debug(f"Exception raised - {str(e)}")
if "outage_alerts" in self.alert_types and isinstance(
kwargs.get("exception", ""), APIError
):
_litellm_params = litellm.types.router.LiteLLM_Params(
model=kwargs.get("model", ""),
**kwargs.get("litellm_params", {}),
**kwargs.get("optional_params", {}),
)
_region_name = litellm.utils._get_model_region(
custom_llm_provider=kwargs.get("custom_llm_provider", ""),
litellm_params=_litellm_params,
)
# if region name not known, default to api base #
if _region_name is None:
_region_name = litellm.get_api_base(
model=kwargs.get("model", ""),
optional_params={
**kwargs.get("litellm_params", {}),
**kwargs.get("optional_params", {}),
},
if isinstance(kwargs.get("exception", ""), APIError):
if "outage_alerts" in self.alert_types:
await self.outage_alerts(
exception=kwargs["exception"],
deployment_id=model_id,
)
if _region_name is None:
_region_name = ""
await self.outage_alerts(
exception=kwargs["exception"],
deployment_id=model_id,
)
if "region_outage_alerts" in self.alert_types:
await self.region_outage_alerts(
exception=kwargs["exception"], deployment_id=model_id
)
except Exception as e:
pass

View file

@ -18,6 +18,7 @@ AlertType = Literal[
"cooldown_deployment",
"new_model_added",
"outage_alerts",
"region_outage_alerts",
]
@ -835,6 +836,7 @@ class ConfigList(LiteLLMBase):
field_description: str
field_value: Any
stored_in_db: Optional[bool]
field_default_value: Any
class ConfigGeneralSettings(LiteLLMBase):
@ -912,7 +914,9 @@ class ConfigGeneralSettings(LiteLLMBase):
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_args: Optional[Dict] = Field(
None, description="Controllable params for slack alerting - e.g. ttl in cache."
)
alerting_threshold: Optional[int] = Field(
None,
description="sends alerts if requests hang for 5min+",

View file

@ -130,6 +130,7 @@ from litellm.proxy.auth.auth_checks import (
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting
try:
from litellm._version import version
@ -3050,6 +3051,13 @@ class ProxyConfig:
"global_max_parallel_requests"
]
## ALERTING ARGS ##
if "alerting_args" in _general_settings:
general_settings["alerting_args"] = _general_settings["alerting_args"]
proxy_logging_obj.slack_alerting_instance.update_values(
alerting_args=general_settings["alerting_args"],
)
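The block above is what lets alerting_args be set from the proxy config file. A sketch of the wiring, with illustrative values (the alerting/alert_types keys shown are assumptions about the surrounding config, not part of this diff):

general_settings:
  alerting: ["slack"]
  alert_types: ["outage_alerts", "region_outage_alerts"]
  alerting_args:
    region_outage_alert_ttl: 60
    minor_outage_alert_threshold: 5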
async def add_deployment(
self,
prisma_client: PrismaClient,
@ -8910,6 +8918,7 @@ async def budget_settings(
field_description=field_info.description or "",
field_value=db_budget_row_dict.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)
@ -9826,6 +9835,149 @@ async def model_settings():
return returned_list
#### ALERTING MANAGEMENT ENDPOINTS ####
@router.get(
"/alerting/settings",
description="Return the configurable alerting param, description, and current value",
tags=["alerting"],
dependencies=[Depends(user_api_key_auth)],
include_in_schema=False,
)
async def alerting_settings(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
global proxy_logging_obj, prisma_client
"""
Used by UI to generate 'alerting settings' page
{
field_name=field_name,
field_type=allowed_args[field_name]["type"], # string/int
field_description=field_info.description or "", # human-friendly description
field_value=general_settings.get(field_name, None), # example value
}
"""
if prisma_client is None:
raise HTTPException(
status_code=400,
detail={"error": CommonProxyErrors.db_not_connected_error.value},
)
if user_api_key_dict.user_role != "proxy_admin":
raise HTTPException(
status_code=400,
detail={
"error": "{}, your role={}".format(
CommonProxyErrors.not_allowed_access.value,
user_api_key_dict.user_role,
)
},
)
## get general settings from db
db_general_settings = await prisma_client.db.litellm_config.find_first(
where={"param_name": "general_settings"}
)
if db_general_settings is not None and db_general_settings.param_value is not None:
db_general_settings_dict = dict(db_general_settings.param_value)
alerting_args_dict: dict = db_general_settings_dict.get("alerting_args", {}) # type: ignore
else:
alerting_args_dict = {}
allowed_args = {
"daily_report_frequency": {"type": "Integer"},
"report_check_interval": {"type": "Integer"},
"budget_alert_ttl": {"type": "Integer"},
"outage_alert_ttl": {"type": "Integer"},
"region_outage_alert_ttl": {"type": "Integer"},
"minor_outage_alert_threshold": {"type": "Integer"},
"major_outage_alert_threshold": {"type": "Integer"},
"max_outage_alert_list_size": {"type": "Integer"},
}
_slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance
_slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump()
return_val = []
for field_name, field_info in SlackAlertingArgs.model_fields.items():
if field_name in allowed_args:
_stored_in_db: Optional[bool] = None
if field_name in alerting_args_dict:
_stored_in_db = True
else:
_stored_in_db = False
_response_obj = ConfigList(
field_name=field_name,
field_type=allowed_args[field_name]["type"],
field_description=field_info.description or "",
field_value=_slack_alerting_args_dict.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)
return return_val
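A quick way to exercise the new endpoint - a sketch assuming the proxy runs at http://0.0.0.0:4000 and sk-1234 is a hypothetical proxy_admin key:

import requests

resp = requests.get(
    "http://0.0.0.0:4000/alerting/settings",
    headers={"Authorization": "Bearer sk-1234"},
)
# each item carries: field_name, field_type, field_description,
# field_value, stored_in_db, field_default_value
print(resp.json())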
# @router.post(
# "/alerting/update",
# description="Update the slack alerting settings. Persist value in db.",
# tags=["alerting"],
# dependencies=[Depends(user_api_key_auth)],
# include_in_schema=False,
# )
# async def alerting_update(
# data: SlackAlertingArgs,
# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
# ):
# """Allows updating slack alerting values. Used by UI."""
# global prisma_client
# if prisma_client is None:
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.db_not_connected_error.value},
# )
# if user_api_key_dict.user_role != "proxy_admin":
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.not_allowed_access.value},
# )
# ## get general settings from db
# db_general_settings = await prisma_client.db.litellm_config.find_first(
# where={"param_name": "general_settings"}
# )
# ### update value
# alerting_args_dict = {}
# if db_general_settings is None or db_general_settings.param_value is None:
# general_settings = {}
# alerting_args_dict = {}
# else:
# general_settings = dict(db_general_settings.param_value)
# _alerting_args_dict = general_settings.get("alerting_args", None)
# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict):
# alerting_args_dict = _alerting_args_dict
# alerting_args_dict = data.model
# response = await prisma_client.db.litellm_config.upsert(
# where={"param_name": "general_settings"},
# data={
# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore
# "update": {"param_value": json.dumps(general_settings)}, # type: ignore
# },
# )
# return response
#### EXPERIMENTAL QUEUING ####
async def _litellm_chat_completions_worker(data, user_api_key_dict):
"""
@ -10969,6 +11121,7 @@ async def get_config_list(
field_description=field_info.description or "",
field_value=general_settings.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)

View file

@ -100,7 +100,7 @@ class ProxyLogging:
"new_model_added",
"outage_alerts",
]
self.slack_alerting_instance = SlackAlerting(
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold,
alerting=self.alerting,
alert_types=self.alert_types,

View file

@ -576,7 +576,9 @@ async def test_outage_alerting_called(
slack_alerting.update_values(llm_router=router)
with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert:
) as mock_outage_alert, patch.object(
slack_alerting, "region_outage_alerts", new=AsyncMock()
) as mock_region_alert:
try:
await router.acompletion(
model=model,
@ -586,7 +588,8 @@ async def test_outage_alerting_called(
except Exception as e:
pass
mock_send_alert.assert_called_once()
mock_outage_alert.assert_called_once()
mock_region_alert.assert_called_once()
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for _ in range(6):
@ -600,6 +603,112 @@ async def test_outage_alerting_called(
pass
await asyncio.sleep(3)
if error_code == 500 or error_code == 408:
mock_send_alert.assert_called_once() # only model alert; region alert should only trigger for 2+ models in same region
else:
mock_send_alert.assert_not_called()
@pytest.mark.parametrize(
"model, api_base, llm_provider, vertex_project, vertex_location",
[
("gpt-3.5-turbo", None, "openai", None, None),
(
"azure/gpt-3.5-turbo",
"https://openai-gpt-4-test-v-1.openai.azure.com",
"azure",
None,
None,
),
("gemini-pro", None, "vertex_ai", "hardy-device-38811", "us-central1"),
],
)
@pytest.mark.parametrize("error_code", [500, 408, 400])
@pytest.mark.asyncio
async def test_region_outage_alerting_called(
model, api_base, llm_provider, vertex_project, vertex_location, error_code
):
"""
If a call fails, the region outage tracker is updated
If enough calls fail across 2+ deployments in the same provider region, a region outage alert is sent
"""
slack_alerting = SlackAlerting(
alerting=["webhook"], alert_types=["region_outage_alerts"]
)
litellm.callbacks = [slack_alerting]
error_to_raise: Optional[APIError] = None
if error_code == 400:
print("RAISING 400 ERROR CODE")
error_to_raise = litellm.BadRequestError(
message="this is a bad request",
model=model,
llm_provider=llm_provider,
)
elif error_code == 408:
print("RAISING 408 ERROR CODE")
error_to_raise = litellm.Timeout(
message="A timeout occurred", model=model, llm_provider=llm_provider
)
elif error_code == 500:
print("RAISING 500 ERROR CODE")
error_to_raise = litellm.ServiceUnavailableError(
message="API is unavailable",
model=model,
llm_provider=llm_provider,
response=httpx.Response(
status_code=503,
request=httpx.Request(
method="completion",
url="https://github.com/BerriAI/litellm",
),
),
)
router = Router(
model_list=[
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
},
"model_info": {"id": "1"},
},
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": "vertex_project-2",
},
"model_info": {"id": "2"},
},
],
num_retries=0,
allowed_fails=100,
)
slack_alerting.update_values(llm_router=router)
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for idx in range(6):
if idx % 2 == 0:
deployment_id = "1"
else:
deployment_id = "2"
await slack_alerting.region_outage_alerts(
exception=error_to_raise, deployment_id=deployment_id # type: ignore
)
if model == "gemini-pro" and (error_code == 500 or error_code == 408):
mock_send_alert.assert_called_once()
else:
mock_send_alert.assert_not_called()

View file

@ -0,0 +1,123 @@
/**
* UI for controlling slack alerting settings
*/
import React, { useState, useEffect } from "react";
import {
Table,
TableHead,
TableRow,
TableHeaderCell,
TableCell,
Button,
Icon,
Badge,
TableBody,
Text,
} from "@tremor/react";
import { InputNumber, message } from "antd";
import { alertingSettingsCall, updateConfigFieldSetting } from "../networking";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import DynamicForm from "./dynamic_form";
interface alertingSettingsItem {
field_name: string;
field_type: string;
field_value: any;
field_default_value: any;
field_description: string;
stored_in_db: boolean | null;
}
interface AlertingSettingsProps {
accessToken: string | null;
}
const AlertingSettings: React.FC<AlertingSettingsProps> = ({ accessToken }) => {
const [alertingSettings, setAlertingSettings] = useState<
alertingSettingsItem[]
>([]);
console.log("INSIDE ALERTING SETTINGS");
useEffect(() => {
// get values
if (!accessToken) {
return;
}
alertingSettingsCall(accessToken).then((data) => {
setAlertingSettings(data);
});
}, [accessToken]);
const handleInputChange = (fieldName: string, newValue: any) => {
// Update the value in the state
const updatedSettings = alertingSettings.map((setting) =>
setting.field_name === fieldName
? { ...setting, field_value: newValue }
: setting
);
setAlertingSettings(updatedSettings);
};
const handleSubmit = (formValues: Record<string, any>) => {
if (!accessToken) {
return;
}
let fieldValue = formValues;
if (fieldValue == null || fieldValue == undefined) {
return;
}
const initialFormValues: Record<string, any> = {};
alertingSettings.forEach((setting) => {
initialFormValues[setting.field_name] = setting.field_value;
});
// Merge initialFormValues with actual formValues
const mergedFormValues = { ...formValues, ...initialFormValues };
try {
updateConfigFieldSetting(accessToken, "alerting_args", mergedFormValues);
// update value in state
message.success("Wait 10s for proxy to update.");
} catch (error) {
// do something
}
};
const handleResetField = (fieldName: string, idx: number) => {
if (!accessToken) {
return;
}
try {
// deleteConfigFieldSetting(accessToken, fieldName);
// update value in state
const updatedSettings = alertingSettings.map((setting) =>
setting.field_name === fieldName
? {
...setting,
stored_in_db: null,
field_value: setting.field_default_value,
}
: setting
);
console.log("INSIDE HANDLE RESET FIELD");
setAlertingSettings(updatedSettings);
} catch (error) {
// do something
console.log("ERROR OCCURRED!");
}
};
return (
<DynamicForm
alertingSettings={alertingSettings}
handleInputChange={handleInputChange}
handleResetField={handleResetField}
handleSubmit={handleSubmit}
/>
);
};
export default AlertingSettings;

View file

@ -0,0 +1,96 @@
import React from "react";
import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react";
import Paragraph from "antd/es/typography/Paragraph";
interface AlertingSetting {
field_name: string;
field_description: string;
field_type: string;
field_value: any;
stored_in_db: boolean | null;
}
interface DynamicFormProps {
alertingSettings: AlertingSetting[];
handleInputChange: (fieldName: string, newValue: any) => void;
handleResetField: (fieldName: string, index: number) => void;
handleSubmit: (formValues: Record<string, any>) => void;
}
const DynamicForm: React.FC<DynamicFormProps> = ({
alertingSettings,
handleInputChange,
handleResetField,
handleSubmit,
}) => {
const [form] = Form.useForm();
const onFinish = () => {
const formData = form.getFieldsValue();
handleSubmit(formData);
};
return (
<Form form={form} onFinish={onFinish} labelAlign="left">
{alertingSettings.map((value, index) => (
<TableRow key={index}>
<TableCell>
<Text>{value.field_name}</Text>
<p
style={{
fontSize: "0.65rem",
color: "#808080",
fontStyle: "italic",
}}
className="mt-1"
>
{value.field_description}
</p>
</TableCell>
<Form.Item name={value.field_name}>
<TableCell>
{value.field_type === "Integer" ? (
<InputNumber
step={1}
value={value.field_value}
onChange={(e) => handleInputChange(value.field_name, e)}
/>
) : (
<Input
value={value.field_value}
onChange={(e) => handleInputChange(value.field_name, e)}
/>
)}
</TableCell>
</Form.Item>
<TableCell>
{value.stored_in_db == true ? (
<Badge icon={CheckCircleIcon} className="text-white">
In DB
</Badge>
) : value.stored_in_db == false ? (
<Badge className="text-gray bg-white outline">In Config</Badge>
) : (
<Badge className="text-gray bg-white outline">Not Set</Badge>
)}
</TableCell>
<TableCell>
<Icon
icon={TrashIcon}
color="red"
onClick={() => handleResetField(value.field_name, index)}
>
Reset
</Icon>
</TableCell>
</TableRow>
))}
<div>
<Button2 htmlType="submit">Update Settings</Button2>
</div>
</Form>
);
};
export default DynamicForm;

View file

@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon
import AddFallbacks from "./add_fallbacks";
import openai from "openai";
import Paragraph from "antd/es/skeleton/Paragraph";
interface GeneralSettingsPageProps {
accessToken: string | null;
userRole: string | null;

View file

@ -207,6 +207,41 @@ export const budgetCreateCall = async (
throw error;
}
};
export const alertingSettingsCall = async (accessToken: String) => {
/**
* Get all configurable alerting params and their current values
*/
try {
let url = proxyBaseUrl
? `${proxyBaseUrl}/alerting/settings`
: `/alerting/settings`;
//message.info("Requesting model data");
const response = await fetch(url, {
method: "GET",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
});
if (!response.ok) {
const errorData = await response.text();
message.error(errorData, 10);
throw new Error("Network response was not ok");
}
const data = await response.json();
//message.info("Received model data");
return data;
// Handle success - you might want to update some state or UI based on the returned settings
} catch (error) {
console.error("Failed to get callbacks:", error);
throw error;
}
};
export const keyCreateCall = async (
accessToken: string,
userID: string,
@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async (
}
};
export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => {
export const adminspendByProvider = async (
accessToken: String,
keyToken: String | null,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/spend/provider`
: `/global/spend/provider`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String
}
};
export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivity = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity`
: `/global/activity`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String
}
};
export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivityPerModel = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity/model`
: `/global/activity/model`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime
}
};
export const adminTopModelsCall = async (accessToken: String) => {
try {
let url = proxyBaseUrl

View file

@ -31,7 +31,7 @@ import {
} from "./networking";
import { Modal, Form, Input, Select, Button as Button2, message } from "antd";
import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
import AlertingSettings from "./alerting/alerting_settings";
interface SettingsPageProps {
accessToken: string | null;
userRole: string | null;
@ -117,6 +117,7 @@ const Settings: React.FC<SettingsPageProps> = ({
db_exceptions: "Database Exceptions (Read/Write)",
daily_reports: "Weekly/Monthly Spend Reports",
outage_alerts: "Outage Alerts",
region_outage_alerts: "Region Outage Alerts",
};
useEffect(() => {
@ -365,7 +366,8 @@ const Settings: React.FC<SettingsPageProps> = ({
<TabGroup>
<TabList variant="line" defaultValue="1">
<Tab value="1">Logging Callbacks</Tab>
<Tab value="2">Alerting</Tab>
<Tab value="2">Alerting Types</Tab>
<Tab value="2">Alerting Settings</Tab>
</TabList>
<TabPanels>
<TabPanel>
@ -496,6 +498,9 @@ const Settings: React.FC<SettingsPageProps> = ({
</Button>
</Card>
</TabPanel>
<TabPanel>
<AlertingSettings accessToken={accessToken} />
</TabPanel>
</TabPanels>
</TabGroup>
</Grid>