fix(llm_guard.py): add streaming hook for moderation calls

2024-02-20 20:31:32 -08:00 · 2024-02-20 20:31:32 -08:00 · 49847347d0
commit 49847347d0
parent 0a5b8f0e4e
4 changed files with 36 additions and 25 deletions
--- a/enterprise/enterprise_hooks/llm_guard.py
+++ b/enterprise/enterprise_hooks/llm_guard.py
@@ -101,19 +101,16 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
        - Use the sanitized prompt returned
            - LLM Guard can handle things like PII Masking, etc.
        """
-        if "messages" in data:
-            safety_check_messages = data["messages"][
-                -1
-            ]  # get the last response - llama guard has a 4k token limit
-            if (
-                isinstance(safety_check_messages, dict)
-                and "content" in safety_check_messages
-                and isinstance(safety_check_messages["content"], str)
-            ):
-                await self.moderation_check(safety_check_messages["content"])
-
        return data

+    async def async_post_call_streaming_hook(
+        self, user_api_key_dict: UserAPIKeyAuth, response: str
+    ):
+        if response is not None:  # skip moderation when there is no response text
+            await self.moderation_check(text=response)  # presumably raises on a moderation failure - confirm in moderation_check
+
+        return response  # returned as received; this hook never rewrites the text
+

 # llm_guard = _ENTERPRISE_LLMGuard()

--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@@ -75,6 +75,13 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
    async def async_moderation_hook(self, data: dict):
        pass

+    async def async_post_call_streaming_hook(
+        self,
+        user_api_key_dict: UserAPIKeyAuth,
+        response: str,
+    ):
+        pass  # no-op default; subclasses such as _ENTERPRISE_LLMGuard override this hook
+
    #### SINGLE-USE #### - https://docs.litellm.ai/docs/observability/custom_callback#using-your-custom-callback-function

    def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -401,6 +401,27 @@ class ProxyLogging:
                raise e
        return new_response

+    async def post_call_streaming_hook(
+        self,
+        response: str,
+        user_api_key_dict: UserAPIKeyAuth,
+    ):
+        """
+        - Check the outgoing streaming response up to that point
+        - Run through moderation check
+        - Reject request if it fails moderation check
+        """
+        new_response = copy.deepcopy(response)  # NOTE(review): response is annotated str; deepcopy of an immutable str returns the same object
+        for callback in litellm.callbacks:
+            try:
+                if isinstance(callback, CustomLogger):
+                    await callback.async_post_call_streaming_hook(
+                        user_api_key_dict=user_api_key_dict, response=new_response
+                    )  # NOTE(review): the hook's return value is discarded here
+            except Exception as e:
+                raise e  # re-raised unchanged, so the first failing hook stops the loop
+        return new_response
+

 ### DB CONNECTOR ###
 # Define the retry decorator with backoff strategy
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -909,20 +909,6 @@ class Logging:
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                    )

-            if litellm.max_budget and self.stream:
-                start_time = self.start_time
-                end_time = (
-                    self.start_time
-                )  # no time has passed as the call hasn't been made yet
-                time_diff = (end_time - start_time).total_seconds()
-                float_diff = float(time_diff)
-                litellm._current_cost += litellm.completion_cost(
-                    model=self.model,
-                    prompt="".join(message["content"] for message in self.messages),
-                    completion="",
-                    total_time=float_diff,
-                )
-
            # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
            callbacks = litellm.input_callback + self.dynamic_input_callbacks
            for callback in callbacks: