Merge pull request #2298 from BerriAI/litellm_show_api_base_in_response_taking_too_long

[FEAT] Slack Alerts show api base in response taking too long
2024-03-02 11:48:21 -08:00 · 2024-03-02 11:48:21 -08:00 · 0c8258c522
commit 0c8258c522
parent 468995b288 868a415aa0
2 changed files with 25 additions and 52 deletions
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -2138,14 +2138,6 @@ async def async_data_generator(response, user_api_key_dict):
            except Exception as e:
                yield f"data: {str(e)}\n\n"
        ### ALERTING ###
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        # Streaming is done, yield the [DONE] chunk
        done_message = "[DONE]"
        yield f"data: {done_message}\n\n"
@ -2494,14 +2486,6 @@ async def completion(
                headers=custom_headers,
            )
        ### ALERTING ###
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        fastapi_response.headers["x-litellm-model-id"] = model_id
        return response
    except Exception as e:
@ -2700,14 +2684,6 @@ async def chat_completion(
                headers=custom_headers,
            )
        ### ALERTING ###
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        fastapi_response.headers["x-litellm-model-id"] = model_id
        ### CALL HOOKS ### - modify outgoing data
@ -2915,12 +2891,6 @@ async def embeddings(
        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        return response
    except Exception as e:
@ -3066,12 +3036,6 @@ async def image_generation(
        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        return response
    except Exception as e:
@ -3225,12 +3189,6 @@ async def moderations(
        ### ALERTING ###
        data["litellm_status"] = "success"  # used for alerting
        end_time = time.time()
        asyncio.create_task(
            proxy_logging_obj.response_taking_too_long(
                start_time=start_time, end_time=end_time, type="slow_response"
            )
        )
        return response
    except Exception as e:
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -64,6 +64,7 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_parallel_request_limiter)
        litellm.callbacks.append(self.max_budget_limiter)
        litellm.callbacks.append(self.cache_control_check)
        litellm.callbacks.append(self.response_taking_too_long_callback)
        for callback in litellm.callbacks:
            if callback not in litellm.input_callback:
                litellm.input_callback.append(callback)
@ -142,6 +143,30 @@ class ProxyLogging:
                raise e
        return data
    async def response_taking_too_long_callback(
        self,
        kwargs,  # kwargs to completion
        completion_response,  # response from completion
        start_time,
        end_time,  # start/end time
    ):
        """Send a low-severity alert when a completed call exceeded the threshold.

        Does nothing when ``self.alerting`` is ``None``. Otherwise the elapsed
        time between ``start_time`` and ``end_time`` is compared against
        ``self.alerting_threshold`` (seconds) and, if it is larger, a message
        containing the model, API base and messages of the request is passed
        to ``self.alerting_handler`` with ``level="Low"``.

        Args:
            kwargs: Keyword arguments of the completion call. The keys
                ``"litellm_params"`` (mapping with an optional ``"api_base"``),
                ``"model"`` and ``"messages"`` are read; all are optional.
            completion_response: Response of the completion call (unused).
            start_time: When the call started. Either a ``datetime`` or a
                numeric timestamp in seconds; must be the same kind as
                ``end_time``.
            end_time: When the call finished.
        """
        if self.alerting is None:
            return

        elapsed = end_time - start_time
        # datetime inputs subtract to a timedelta; plain numeric timestamps
        # subtract to a number of seconds that can be used directly.
        if isinstance(elapsed, (int, float)):
            time_difference_float = float(elapsed)
        else:
            time_difference_float = elapsed.total_seconds()

        if time_difference_float > self.alerting_threshold:
            # `or {}` also covers the key being present with a None value,
            # which a plain .get(..., {}) default would not.
            litellm_params = kwargs.get("litellm_params") or {}
            api_base = litellm_params.get("api_base", "")
            model = kwargs.get("model", "")
            messages = kwargs.get("messages", "")
            request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
            slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
            await self.alerting_handler(
                message=slow_message + request_info,
                level="Low",
            )
    async def response_taking_too_long(
        self,
        start_time: Optional[float] = None,
@ -189,16 +214,6 @@ class ProxyLogging:
                    level="Medium",
                )
        elif (
            type == "slow_response" and start_time is not None and end_time is not None
        ):
            slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
            if end_time - start_time > self.alerting_threshold:
                await self.alerting_handler(
                    message=slow_message + request_info,
                    level="Low",
                )
    async def budget_alerts(
        self,
        type: Literal[