diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 869de6dde..17db8c3ab 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -2138,14 +2138,6 @@ async def async_data_generator(response, user_api_key_dict): except Exception as e: yield f"data: {str(e)}\n\n" - ### ALERTING ### - end_time = time.time() - asyncio.create_task( - proxy_logging_obj.response_taking_too_long( - start_time=start_time, end_time=end_time, type="slow_response" - ) - ) - # Streaming is done, yield the [DONE] chunk done_message = "[DONE]" yield f"data: {done_message}\n\n" @@ -2494,14 +2486,6 @@ async def completion( headers=custom_headers, ) - ### ALERTING ### - end_time = time.time() - asyncio.create_task( - proxy_logging_obj.response_taking_too_long( - start_time=start_time, end_time=end_time, type="slow_response" - ) - ) - fastapi_response.headers["x-litellm-model-id"] = model_id return response except Exception as e: @@ -2700,14 +2684,6 @@ async def chat_completion( headers=custom_headers, ) - ### ALERTING ### - end_time = time.time() - asyncio.create_task( - proxy_logging_obj.response_taking_too_long( - start_time=start_time, end_time=end_time, type="slow_response" - ) - ) - fastapi_response.headers["x-litellm-model-id"] = model_id ### CALL HOOKS ### - modify outgoing data @@ -2915,12 +2891,6 @@ async def embeddings( ### ALERTING ### data["litellm_status"] = "success" # used for alerting - end_time = time.time() - asyncio.create_task( - proxy_logging_obj.response_taking_too_long( - start_time=start_time, end_time=end_time, type="slow_response" - ) - ) return response except Exception as e: @@ -3066,12 +3036,6 @@ async def image_generation( ### ALERTING ### data["litellm_status"] = "success" # used for alerting - end_time = time.time() - asyncio.create_task( - proxy_logging_obj.response_taking_too_long( - start_time=start_time, end_time=end_time, type="slow_response" - ) - ) return response except Exception as e: @@ -3225,12 +3189,6 
async def response_taking_too_long_callback(
    self,
    kwargs,  # kwargs to completion
    completion_response,  # response from completion
    start_time,
    end_time,  # start/end time
):
    """Send a "Low" level alert when a finished call exceeded the threshold.

    Does nothing when ``self.alerting`` is ``None`` or when the elapsed time
    is not strictly greater than ``self.alerting_threshold``.

    Args:
        kwargs: Keyword arguments of the completion call. ``model``,
            ``messages`` and ``litellm_params["api_base"]`` are read to
            describe the request in the alert; missing entries are rendered
            as empty strings.
        completion_response: Response of the completion call (unused here).
        start_time: When the call started, either a ``datetime`` or a
            numeric timestamp in seconds.
        end_time: When the call finished, same kind as ``start_time``.
    """
    if self.alerting is None:
        return

    elapsed = end_time - start_time
    try:
        # datetime inputs subtract to a timedelta; convert it to seconds.
        elapsed_seconds = elapsed.total_seconds()
    except AttributeError:
        # Numeric timestamps (e.g. time.time()) already subtract to seconds.
        elapsed_seconds = float(elapsed)

    # Build the alert text only when an alert is actually sent, so fast
    # responses never pay for formatting the full message payload.
    if elapsed_seconds > self.alerting_threshold:
        # `or {}` also covers an explicit `litellm_params=None` entry.
        litellm_params = kwargs.get("litellm_params") or {}
        api_base = litellm_params.get("api_base", "")
        model = kwargs.get("model", "")
        messages = kwargs.get("messages", "")
        request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
        slow_message = f"`Responses are slow - {round(elapsed_seconds, 2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
        await self.alerting_handler(
            message=slow_message + request_info,
            level="Low",
        )
and start_time is not None and end_time is not None - ): - slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`" - if end_time - start_time > self.alerting_threshold: - await self.alerting_handler( - message=slow_message + request_info, - level="Low", - ) - async def budget_alerts( self, type: Literal[