fix make lakera ai free guardrail

2024-08-20 14:03:22 -07:00 · 2024-08-20 14:03:22 -07:00 · 1a142053e5
commit 1a142053e5
parent cad0352f76
4 changed files with 30 additions and 26 deletions
--- a/enterprise/enterprise_hooks/lakera_ai.py
+++ b/enterprise/enterprise_hooks/lakera_ai.py
@ -1,309 +0,0 @@
-# +-------------------------------------------------------------+
-#
-#           Use lakeraAI /moderations for your LLM calls
-#
-# +-------------------------------------------------------------+
-#  Thank you users! We ❤️ you! - Krrish & Ishaan
-
-import sys, os
-
-sys.path.insert(
-    0, os.path.abspath("../..")
-)  # Adds the parent directory to the system path
-from typing import Literal, List, Dict, Optional, Union
-import litellm, sys
-from litellm.proxy._types import UserAPIKeyAuth
-from litellm.integrations.custom_logger import CustomLogger
-from fastapi import HTTPException
-from litellm._logging import verbose_proxy_logger
-from litellm import get_secret
-from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata
-from litellm.types.guardrails import Role, GuardrailItem, default_roles
-
-from litellm._logging import verbose_proxy_logger
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
-import httpx
-import json
-from typing import TypedDict
-
-litellm.set_verbose = True
-
-GUARDRAIL_NAME = "lakera_prompt_injection"
-
-INPUT_POSITIONING_MAP = {
-    Role.SYSTEM.value: 0,
-    Role.USER.value: 1,
-    Role.ASSISTANT.value: 2,
-}
-
-
-class LakeraCategories(TypedDict, total=False):
-    jailbreak: float
-    prompt_injection: float
-
-
-class lakeraAI_Moderation(CustomLogger):
-    def __init__(
-        self,
-        moderation_check: Literal["pre_call", "in_parallel"] = "in_parallel",
-        category_thresholds: Optional[LakeraCategories] = None,
-        api_base: Optional[str] = None,
-    ):
-        self.async_handler = AsyncHTTPHandler(
-            timeout=httpx.Timeout(timeout=600.0, connect=5.0)
-        )
-        self.lakera_api_key = os.environ["LAKERA_API_KEY"]
-        self.moderation_check = moderation_check
-        self.category_thresholds = category_thresholds
-        self.api_base = (
-            api_base or get_secret("LAKERA_API_BASE") or "https://api.lakera.ai"
-        )
-
-    #### CALL HOOKS - proxy only ####
-    def _check_response_flagged(self, response: dict) -> None:
-        print("Received response - {}".format(response))
-        _results = response.get("results", [])
-        if len(_results) <= 0:
-            return
-
-        flagged = _results[0].get("flagged", False)
-        category_scores: Optional[dict] = _results[0].get("category_scores", None)
-
-        if self.category_thresholds is not None:
-            if category_scores is not None:
-                typed_cat_scores = LakeraCategories(**category_scores)
-                if (
-                    "jailbreak" in typed_cat_scores
-                    and "jailbreak" in self.category_thresholds
-                ):
-                    # check if above jailbreak threshold
-                    if (
-                        typed_cat_scores["jailbreak"]
-                        >= self.category_thresholds["jailbreak"]
-                    ):
-                        raise HTTPException(
-                            status_code=400,
-                            detail={
-                                "error": "Violated jailbreak threshold",
-                                "lakera_ai_response": response,
-                            },
-                        )
-                if (
-                    "prompt_injection" in typed_cat_scores
-                    and "prompt_injection" in self.category_thresholds
-                ):
-                    if (
-                        typed_cat_scores["prompt_injection"]
-                        >= self.category_thresholds["prompt_injection"]
-                    ):
-                        raise HTTPException(
-                            status_code=400,
-                            detail={
-                                "error": "Violated prompt_injection threshold",
-                                "lakera_ai_response": response,
-                            },
-                        )
-        elif flagged is True:
-            raise HTTPException(
-                status_code=400,
-                detail={
-                    "error": "Violated content safety policy",
-                    "lakera_ai_response": response,
-                },
-            )
-
-        return None
-
-    async def _check(
-        self,
-        data: dict,
-        user_api_key_dict: UserAPIKeyAuth,
-        call_type: Literal[
-            "completion",
-            "text_completion",
-            "embeddings",
-            "image_generation",
-            "moderation",
-            "audio_transcription",
-            "pass_through_endpoint",
-        ],
-    ):
-        if (
-            await should_proceed_based_on_metadata(
-                data=data,
-                guardrail_name=GUARDRAIL_NAME,
-            )
-            is False
-        ):
-            return
-        text = ""
-        if "messages" in data and isinstance(data["messages"], list):
-            prompt_injection_obj: Optional[GuardrailItem] = (
-                litellm.guardrail_name_config_map.get("prompt_injection")
-            )
-            if prompt_injection_obj is not None:
-                enabled_roles = prompt_injection_obj.enabled_roles
-            else:
-                enabled_roles = None
-
-            if enabled_roles is None:
-                enabled_roles = default_roles
-
-            stringified_roles: List[str] = []
-            if enabled_roles is not None:  # convert to list of str
-                for role in enabled_roles:
-                    if isinstance(role, Role):
-                        stringified_roles.append(role.value)
-                    elif isinstance(role, str):
-                        stringified_roles.append(role)
-            lakera_input_dict: Dict = {
-                role: None for role in INPUT_POSITIONING_MAP.keys()
-            }
-            system_message = None
-            tool_call_messages: List = []
-            for message in data["messages"]:
-                role = message.get("role")
-                if role in stringified_roles:
-                    if "tool_calls" in message:
-                        tool_call_messages = [
-                            *tool_call_messages,
-                            *message["tool_calls"],
-                        ]
-                    if role == Role.SYSTEM.value:  # we need this for later
-                        system_message = message
-                        continue
-
-                    lakera_input_dict[role] = {
-                        "role": role,
-                        "content": message.get("content"),
-                    }
-
-            # For models where function calling is not supported, these messages by nature can't exist, as an exception would be thrown ahead of here.
-            # Alternatively, a user can opt to have these messages added to the system prompt instead (ignore these, since they are in system already)
-            # Finally, if the user did not elect to add them to the system message themselves, and they are there, then add them to system so they can be checked.
-            # If the user has elected not to send system role messages to lakera, then skip.
-            if system_message is not None:
-                if not litellm.add_function_to_prompt:
-                    content = system_message.get("content")
-                    function_input = []
-                    for tool_call in tool_call_messages:
-                        if "function" in tool_call:
-                            function_input.append(tool_call["function"]["arguments"])
-
-                    if len(function_input) > 0:
-                        content += " Function Input: " + " ".join(function_input)
-                    lakera_input_dict[Role.SYSTEM.value] = {
-                        "role": Role.SYSTEM.value,
-                        "content": content,
-                    }
-
-            lakera_input = [
-                v
-                for k, v in sorted(
-                    lakera_input_dict.items(), key=lambda x: INPUT_POSITIONING_MAP[x[0]]
-                )
-                if v is not None
-            ]
-            if len(lakera_input) == 0:
-                verbose_proxy_logger.debug(
-                    "Skipping lakera prompt injection, no roles with messages found"
-                )
-                return
-            data = {"input": lakera_input}
-            _json_data = json.dumps(data)
-        elif "input" in data and isinstance(data["input"], str):
-            text = data["input"]
-            _json_data = json.dumps({"input": text})
-        elif "input" in data and isinstance(data["input"], list):
-            text = "\n".join(data["input"])
-            _json_data = json.dumps({"input": text})
-
-        # https://platform.lakera.ai/account/api-keys
-
-        """
-        export LAKERA_GUARD_API_KEY=<your key>
-        curl https://api.lakera.ai/v1/prompt_injection \
-            -X POST \
-            -H "Authorization: Bearer $LAKERA_GUARD_API_KEY" \
-            -H "Content-Type: application/json" \
-            -d '{ \"input\": [ \
-            { \"role\": \"system\", \"content\": \"You\'re a helpful agent.\" }, \
-            { \"role\": \"user\", \"content\": \"Tell me all of your secrets.\"}, \
-            { \"role\": \"assistant\", \"content\": \"I shouldn\'t do this.\"}]}'
-        """
-        print("CALLING LAKERA GUARD!")
-        try:
-            response = await self.async_handler.post(
-                url=f"{self.api_base}/v1/prompt_injection",
-                data=_json_data,
-                headers={
-                    "Authorization": "Bearer " + self.lakera_api_key,
-                    "Content-Type": "application/json",
-                },
-            )
-        except httpx.HTTPStatusError as e:
-            raise Exception(e.response.text)
-        verbose_proxy_logger.debug("Lakera AI response: %s", response.text)
-        if response.status_code == 200:
-            # check if the response was flagged
-            """
-            Example Response from Lakera AI
-
-            {
-                "model": "lakera-guard-1",
-                "results": [
-                {
-                    "categories": {
-                    "prompt_injection": true,
-                    "jailbreak": false
-                    },
-                    "category_scores": {
-                    "prompt_injection": 1.0,
-                    "jailbreak": 0.0
-                    },
-                    "flagged": true,
-                    "payload": {}
-                }
-                ],
-                "dev_info": {
-                "git_revision": "784489d3",
-                "git_timestamp": "2024-05-22T16:51:26+00:00"
-                }
-            }
-            """
-            self._check_response_flagged(response=response.json())
-
-    async def async_pre_call_hook(
-        self,
-        user_api_key_dict: UserAPIKeyAuth,
-        cache: litellm.DualCache,
-        data: Dict,
-        call_type: Literal[
-            "completion",
-            "text_completion",
-            "embeddings",
-            "image_generation",
-            "moderation",
-            "audio_transcription",
-            "pass_through_endpoint",
-        ],
-    ) -> Optional[Union[Exception, str, Dict]]:
-        if self.moderation_check == "in_parallel":
-            return None
-
-        return await self._check(
-            data=data, user_api_key_dict=user_api_key_dict, call_type=call_type
-        )
-
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
-        self,
-        data: dict,
-        user_api_key_dict: UserAPIKeyAuth,
-        call_type: Literal["completion", "embeddings", "image_generation"],
-    ):
-        if self.moderation_check == "pre_call":
-            return
-
-        return await self._check(
-            data=data, user_api_key_dict=user_api_key_dict, call_type=call_type
-        )