diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 003597eaa..351112474 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -10,13 +10,16 @@ model_list:
 #     api_key: my-fake-key
 #     api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
-# litellm_settings:
-#   cache: true
-#   max_budget: 600020
-#   budget_duration: 30d
+litellm_settings:
+  drop_params: True
+  max_budget: 800021
+  budget_duration: 30d
+  # cache: true
+
 
 general_settings: 
   master_key: sk-1234
+  alerting: ["slack"]
   # proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
   # enable_jwt_auth: True
   # alerting: ["slack"]
diff --git a/litellm/proxy/hooks/tpm_rpm_limiter.py b/litellm/proxy/hooks/tpm_rpm_limiter.py
index a46337491..8951991d2 100644
--- a/litellm/proxy/hooks/tpm_rpm_limiter.py
+++ b/litellm/proxy/hooks/tpm_rpm_limiter.py
@@ -22,13 +22,11 @@ class _PROXY_MaxTPMRPMLimiter(CustomLogger):
     user_api_key_cache = None
 
     # Class variables or attributes
-    def __init__(self, redis_usage_cache: Optional[RedisCache]):
-        self.redis_usage_cache = redis_usage_cache
-        self.internal_cache = DualCache(
-            redis_cache=redis_usage_cache,
-            default_in_memory_ttl=10,
-            default_redis_ttl=60,
-        )
+    def __init__(self, internal_cache: Optional[DualCache]):
+        if internal_cache is None:
+            self.internal_cache = DualCache()
+        else:
+            self.internal_cache = internal_cache
 
     def print_verbose(self, print_statement):
         try:
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 8b6fae40f..0f2738dfb 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -302,9 +302,7 @@ disable_spend_logs = False
 jwt_handler = JWTHandler()
 prompt_injection_detection_obj: Optional[_OPTIONAL_PromptInjectionDetection] = None
 ### INITIALIZE GLOBAL LOGGING OBJECT ###
-proxy_logging_obj = ProxyLogging(
-    user_api_key_cache=user_api_key_cache, redis_usage_cache=redis_usage_cache
-)
+proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
 ### REDIS QUEUE ###
 async_result = None
 celery_app_conn = None
@@ -2251,6 +2249,7 @@ class ProxyConfig:
             proxy_logging_obj.update_values(
                 alerting=general_settings.get("alerting", None),
                 alerting_threshold=general_settings.get("alerting_threshold", 600),
+                redis_cache=redis_usage_cache,
             )
             ### CONNECT TO DATABASE ###
             database_url = general_settings.get("database_url", None)
@@ -4976,31 +4975,13 @@ async def global_spend():
     if prisma_client is None:
         raise HTTPException(status_code=500, detail={"error": "No db connected"})
 
-    sql_query = f"""
-    SELECT SUM(spend) AS total_spend
-    FROM "LiteLLM_VerificationToken";
-    ;
-    """
+    sql_query = """SELECT SUM(spend) as total_spend FROM "MonthlyGlobalSpend";"""
     response = await prisma_client.db.query_raw(query=sql_query)
     if response is not None:
         if isinstance(response, list) and len(response) > 0:
             total_spend = response[0].get("total_spend", 0.0)
 
-    sql_query = f"""
-    SELECT 
-        *
-    FROM 
-        "LiteLLM_UserTable"
-    WHERE 
-        user_id = 'litellm-proxy-budget';
-    """
-    user_response = await prisma_client.db.query_raw(query=sql_query)
-
-    if user_response is not None:
-        if isinstance(user_response, list) and len(user_response) > 0:
-            total_proxy_budget = user_response[0].get("max_budget", 0.0)
-
-    return {"spend": total_spend, "max_budget": total_proxy_budget}
+    return {"spend": total_spend, "max_budget": litellm.max_budget}
 
 
 @router.get(
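Net effect of the files above: `ProxyLogging` is constructed at module import, before any config is read, and the Redis client is attached afterwards via `update_values` once `general_settings` is parsed. A minimal sketch of that two-phase wiring, assuming the import paths shown in the diff headers; the `RedisCache` connection parameters are illustrative, not from the patch:

```python
# Sketch only - not the proxy's actual startup code.
from litellm.caching import DualCache, RedisCache
from litellm.proxy.utils import ProxyLogging

# Phase 1 - module import: Redis settings aren't known yet, so ProxyLogging
# starts with a purely in-memory internal usage cache.
user_api_key_cache = DualCache()
proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)

# Phase 2 - config load: hand the Redis client to the existing object. The
# TPM/RPM limiter already holds a reference to the same DualCache, so it
# picks up Redis without being rebuilt.
redis_usage_cache = RedisCache(host="localhost", port=6379)  # illustrative params
proxy_logging_obj.update_values(
    alerting=["slack"],
    alerting_threshold=600,
    redis_cache=redis_usage_cache,
)
```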
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 04fec29ba..708f77aa8 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -50,28 +50,33 @@ class ProxyLogging:
     def __init__(
         self,
         user_api_key_cache: DualCache,
-        redis_usage_cache: Optional[RedisCache] = None,
     ):
         ## INITIALIZE LITELLM CALLBACKS ##
         self.call_details: dict = {}
         self.call_details["user_api_key_cache"] = user_api_key_cache
+        self.internal_usage_cache = DualCache()
         self.max_parallel_request_limiter = _PROXY_MaxParallelRequestsHandler()
         self.max_tpm_rpm_limiter = _PROXY_MaxTPMRPMLimiter(
-            redis_usage_cache=redis_usage_cache
+            internal_cache=self.internal_usage_cache
         )
         self.max_budget_limiter = _PROXY_MaxBudgetLimiter()
         self.cache_control_check = _PROXY_CacheControlCheck()
         self.alerting: Optional[List] = None
         self.alerting_threshold: float = 300  # default to 5 min. threshold
-        self.internal_usage_cache = DualCache(redis_cache=redis_usage_cache)
 
     def update_values(
-        self, alerting: Optional[List], alerting_threshold: Optional[float]
+        self,
+        alerting: Optional[List],
+        alerting_threshold: Optional[float],
+        redis_cache: Optional[RedisCache],
     ):
         self.alerting = alerting
         if alerting_threshold is not None:
             self.alerting_threshold = alerting_threshold
 
+        if redis_cache is not None:
+            self.internal_usage_cache.redis_cache = redis_cache
+
     def _init_litellm_callbacks(self):
         print_verbose(f"INITIALIZING LITELLM CALLBACKS!")
         litellm.callbacks.append(self.max_parallel_request_limiter)
@@ -265,10 +270,11 @@ class ProxyLogging:
         if self.alerting is None:
             # do nothing if alerting is not switched on
             return
-
+        _id: str = "default_id"  # used for caching
         if type == "user_and_proxy_budget":
             user_info = dict(user_info)
             user_id = user_info["user_id"]
+            _id = user_id
             max_budget = user_info["max_budget"]
             spend = user_info["spend"]
             user_email = user_info["user_email"]
@@ -276,12 +282,14 @@ class ProxyLogging:
         elif type == "token_budget":
             token_info = dict(user_info)
             token = token_info["token"]
+            _id = token
             spend = token_info["spend"]
             max_budget = token_info["max_budget"]
             user_id = token_info["user_id"]
             user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
         elif type == "failed_tracking":
             user_id = str(user_info)
+            _id = user_id
             user_info = f"\nUser ID: {user_id}\n Error {error_message}"
             message = "Failed Tracking Cost for" + user_info
             await self.alerting_handler(
@@ -337,13 +345,15 @@ class ProxyLogging:
         # check if 5% of max budget is left
         if percent_left <= 0.05:
             message = "5% budget left for" + user_info
-            result = await _cache.async_get_cache(key=message)
+            cache_key = "alerting:{}".format(_id)
+            result = await _cache.async_get_cache(key=cache_key)
             if result is None:
                 await self.alerting_handler(
                     message=message,
                     level="Medium",
                 )
-                await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
+
+                await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
 
 
         return
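Besides the Redis re-plumbing, the hunks above switch budget-alert deduplication from the human-readable message text to a per-entity cache key (`"alerting:{user_id-or-token}"`), so rewording the alert message no longer resets the dedup window. A condensed sketch of that pattern; `maybe_send_low_budget_alert` is a hypothetical helper name, not a method in the patch:

```python
# Condensed sketch of the dedup logic above. `_cache` is the proxy's
# internal DualCache; `_id` is the user id or token taken from the event.
async def maybe_send_low_budget_alert(_cache, _id, user_info, alerting_handler):
    message = "5% budget left for" + user_info
    cache_key = "alerting:{}".format(_id)  # keyed per entity, not per message
    if await _cache.async_get_cache(key=cache_key) is None:
        await alerting_handler(message=message, level="Medium")
        # remember the send for 28 days (2419200s) so this entity
        # isn't alerted again within the window
        await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
```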
diff --git a/litellm/tests/test_max_tpm_rpm_limiter.py b/litellm/tests/test_max_tpm_rpm_limiter.py
index 40a978c62..a906e2f8a 100644
--- a/litellm/tests/test_max_tpm_rpm_limiter.py
+++ b/litellm/tests/test_max_tpm_rpm_limiter.py
@@ -38,7 +38,7 @@ async def test_pre_call_hook_rpm_limits():
         key=_api_key, value={"api_key": _api_key, "tpm_limit": 9, "rpm_limit": 1}
     )
 
-    tpm_rpm_limiter = _PROXY_MaxTPMRPMLimiter(redis_usage_cache=None)
+    tpm_rpm_limiter = _PROXY_MaxTPMRPMLimiter(internal_cache=DualCache())
 
     await tpm_rpm_limiter.async_pre_call_hook(
         user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
@@ -89,8 +89,8 @@ async def test_pre_call_hook_team_rpm_limits(
     user_api_key_dict = UserAPIKeyAuth(**_user_api_key_dict)  # type: ignore
     local_cache = DualCache()
     local_cache.set_cache(key=_api_key, value=_user_api_key_dict)
-    tpm_rpm_limiter = _PROXY_MaxTPMRPMLimiter(redis_usage_cache=_redis_usage_cache)
-
+    internal_cache = DualCache(redis_cache=_redis_usage_cache)
+    tpm_rpm_limiter = _PROXY_MaxTPMRPMLimiter(internal_cache=internal_cache)
     await tpm_rpm_limiter.async_pre_call_hook(
        user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
     )
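The updated tests exercise both construction paths of the limiter: a bare in-memory `DualCache` and one wrapping a Redis client. A sketch of the two call shapes, with import paths assumed from the diff headers and illustrative Redis parameters:

```python
# Sketch of the two construction modes shown in the tests above.
from litellm.caching import DualCache, RedisCache
from litellm.proxy.hooks.tpm_rpm_limiter import _PROXY_MaxTPMRPMLimiter

# In-memory only - what test_pre_call_hook_rpm_limits now passes:
limiter = _PROXY_MaxTPMRPMLimiter(internal_cache=DualCache())

# Redis-backed - shares TPM/RPM counters across proxy instances, as in
# test_pre_call_hook_team_rpm_limits:
_redis_usage_cache = RedisCache(host="localhost", port=6379)  # illustrative params
limiter_multi = _PROXY_MaxTPMRPMLimiter(
    internal_cache=DualCache(redis_cache=_redis_usage_cache)
)
```

Per the new `__init__`, passing `internal_cache=None` is also valid and falls back to a fresh in-memory `DualCache()`.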