Merge pull request #2408 from BerriAI/litellm_no_store_reqs

[FEAT-liteLLM Proxy] Incognito Requests - Don't log anything

Commit: 8036b48f14
6 changed files with 89 additions and 20 deletions
````diff
@@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
 :::
 
 Features:
-- [ ] Content Moderation with LlamaGuard
-- [ ] Content Moderation with Google Text Moderations
-- [ ] Content Moderation with LLM Guard
-- [ ] Reject calls from Blocked User list
-- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
-- [ ] Tracking Spend for Custom Tags
+- ✅ Content Moderation with LlamaGuard
+- ✅ Content Moderation with Google Text Moderations
+- ✅ Content Moderation with LLM Guard
+- ✅ Reject calls from Blocked User list
+- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
+- ✅ Don't log/store specific requests (eg confidential LLM requests)
+- ✅ Tracking Spend for Custom Tags
 
-## Content Moderation with LlamaGuard
+## Content Moderation
+### Content Moderation with LlamaGuard
 
 Currently works with Sagemaker's LlamaGuard endpoint.
 
````
````diff
@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
 os.environ["AWS_REGION_NAME"] = ""
 ```
 
-### Customize LlamaGuard prompt
+#### Customize LlamaGuard prompt
 
 To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
 
````
````diff
@@ -51,7 +53,7 @@ callbacks: ["llamaguard_moderations"]
 llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
 ```
 
-## Content Moderation with LLM Guard
+### Content Moderation with LLM Guard
 
 Set the LLM Guard API Base in your environment
 
````
````diff
@@ -78,7 +80,7 @@ Expected results:
 LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
 ```
 
-## Content Moderation with Google Text Moderation
+### Content Moderation with Google Text Moderation
 
 Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
 
````
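The `LLM Guard: Received response` context line above shows the response shape LLM Guard returns. As a rough illustration of how a moderation hook could act on that shape (not litellm's actual handler; the reject rule here is an assumption), consider:

```python
# Illustration only: interpreting an LLM Guard response of the shape shown in
# the hunk above. Field names come from that expected output; the reject rule
# is an assumed example, not litellm's exact implementation.
llm_guard_response = {
    "sanitized_prompt": "hello world",
    "is_valid": True,
    "scanners": {"Regex": 0.0},
}

def check_prompt(response: dict) -> str:
    if not response.get("is_valid", False):
        # An invalid prompt would be rejected before it reaches the model.
        raise ValueError(f"Prompt rejected by LLM Guard: {response['scanners']}")
    return response["sanitized_prompt"]

print(check_prompt(llm_guard_response))  # -> "hello world"
```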
````diff
@@ -89,7 +91,7 @@ litellm_settings:
 callbacks: ["google_text_moderation"]
 ```
 
-### Set custom confidence thresholds
+#### Set custom confidence thresholds
 
 Google Moderations checks the test against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
 
````
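The hunk above demotes the custom confidence thresholds heading; the next hunk keeps a table row (`legal_threshold: 0.1`) as context, showing a category-specific override. A generic sketch of how such thresholds are applied, assuming a request is flagged when any category's confidence exceeds its threshold (not litellm's actual `google_text_moderation` handler):

```python
# Generic threshold check: a request is flagged when any category's confidence
# exceeds its configured threshold. "legal": 0.1 mirrors the table row kept as
# context below; the default value and function name are illustrative assumptions.
DEFAULT_THRESHOLD = 0.8
category_thresholds = {"legal": 0.1}

def is_flagged(scores: dict) -> bool:
    for category, confidence in scores.items():
        threshold = category_thresholds.get(category, DEFAULT_THRESHOLD)
        if confidence > threshold:
            return True
    return False

print(is_flagged({"legal": 0.25, "toxic": 0.05}))  # True: 0.25 > 0.1 for "legal"
```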
````diff
@@ -133,6 +135,33 @@ Here are the category specific values:
 | "legal" | legal_threshold: 0.1 |
 
 
+## Incognito Requests - Don't log anything
+
+When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm**
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",             # proxy api-key
+    base_url="http://0.0.0.0:8000"  # litellm proxy
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ],
+    extra_body={
+        "no-log": True
+    }
+)
+
+print(response)
+```
+
 ## Enable Blocked User Lists
 If any call is made to proxy with this user id, it'll be rejected - use this if you want to let users opt-out of ai features
 
````
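The docs example above goes through the proxy via `extra_body`. Since the `completion()` hunks later in this diff read the flag straight from kwargs (`kwargs.get("no-log", False)`), the same request can be sketched against the SDK directly; the model and API key setup here are placeholder assumptions:

```python
# Sketch of passing "no-log" to the SDK directly, based on the
# kwargs.get("no-log", False) line added to completion() in this PR.
# The hyphenated key can't be a normal keyword argument, so it is unpacked
# from a dict; OPENAI_API_KEY is a placeholder.
import os
import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
    **{"no-log": True},
)
print(response)
```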
````diff
@@ -31,6 +31,18 @@ def _turn_on_debug():
     verbose_proxy_logger.setLevel(level=logging.DEBUG)  # set proxy logs to debug
 
 
+def _disable_debugging():
+    verbose_logger.disabled = True
+    verbose_router_logger.disabled = True
+    verbose_proxy_logger.disabled = True
+
+
+def _enable_debugging():
+    verbose_logger.disabled = False
+    verbose_router_logger.disabled = False
+    verbose_proxy_logger.disabled = False
+
+
 def print_verbose(print_statement):
     try:
         if set_verbose:
````
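The two helpers added above flip the `disabled` attribute on litellm's module-level loggers, which is what makes "no server logs" possible for no-log requests. A standalone sketch of the same pattern with stdlib `logging` (the logger names are assumptions used only for illustration, not imports from litellm):

```python
import logging

# Stand-ins for litellm's module-level loggers; the names are assumed for
# this illustration and are not imported from litellm.
verbose_logger = logging.getLogger("litellm")
verbose_router_logger = logging.getLogger("litellm.router")
verbose_proxy_logger = logging.getLogger("litellm.proxy")

def _disable_debugging():
    # .disabled silences a logger without touching its handlers or level.
    verbose_logger.disabled = True
    verbose_router_logger.disabled = True
    verbose_proxy_logger.disabled = True

def _enable_debugging():
    verbose_logger.disabled = False
    verbose_router_logger.disabled = False
    verbose_proxy_logger.disabled = False

_disable_debugging()
verbose_proxy_logger.warning("suppressed")     # emits nothing
_enable_debugging()
verbose_proxy_logger.warning("visible again")  # printed via the default handler
```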
````diff
@@ -487,6 +487,8 @@ def completion(
     ### ASYNC CALLS ###
     acompletion = kwargs.get("acompletion", False)
     client = kwargs.get("client", None)
+    ### Admin Controls ###
+    no_log = kwargs.get("no-log", False)
     ######## end of unpacking kwargs ###########
     openai_params = [
         "functions",
````
````diff
@@ -563,6 +565,7 @@ def completion(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
````
````diff
@@ -726,6 +729,7 @@ def completion(
             model_info=model_info,
             proxy_server_request=proxy_server_request,
             preset_cache_key=preset_cache_key,
+            no_log=no_log,
         )
         logging.update_environment_variables(
             model=model,
````
````diff
@@ -2417,6 +2421,7 @@ def embedding(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
````
````diff
@@ -5,12 +5,9 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-  - model_name: azure-gpt-3.5
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["public"]
+litellm_settings:
+  set_verbose: True
+  success_callback: ["langfuse"]
 router_settings:
   set_verbose: True
   debug_level: "DEBUG"
````
````diff
@@ -1675,9 +1675,9 @@ class ProxyConfig:
                             # these are litellm callbacks - "langfuse", "sentry", "wandb"
                             else:
                                 litellm.success_callback.append(callback)
-                        verbose_proxy_logger.debug(
+                        print(  # noqa
                             f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
-                        )
+                        )  # noqa
                     elif key == "failure_callback":
                         litellm.failure_callback = []
````
````diff
@@ -1279,6 +1279,15 @@ class Logging:
 
         for callback in callbacks:
             try:
+                litellm_params = self.model_call_details.get("litellm_params", {})
+                if litellm_params.get("no-log", False) == True:
+                    # proxy cost tracking cal backs should run
+                    if not (
+                        isinstance(callback, CustomLogger)
+                        and "_PROXY_" in callback.__class__.__name__
+                    ):
+                        print_verbose("no-log request, skipping logging")
+                        continue
                 if callback == "lite_debugger":
                     print_verbose("reaches lite_debugger for logging!")
                     print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
````
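The block added above is the core of the feature: on a no-log request every callback is skipped except litellm's internal proxy cost-tracking hooks, identified as `CustomLogger` subclasses with `_PROXY_` in the class name. A self-contained sketch of that rule with stand-in classes (not litellm's real loggers):

```python
# Stand-in classes to illustrate the skip rule added above; they mimic the
# isinstance + "_PROXY_" class-name check, nothing more.
class CustomLogger:
    def log_success(self, payload): ...

class _PROXY_CostTracker(CustomLogger):      # proxy-internal hook: must still run
    def log_success(self, payload):
        print("cost tracked")

class LangfuseLogger(CustomLogger):          # external logging: skipped on no-log
    def log_success(self, payload):
        print("sent to langfuse")

def run_callbacks(callbacks, litellm_params, payload):
    for callback in callbacks:
        if litellm_params.get("no-log", False) is True:
            is_proxy_internal = (
                isinstance(callback, CustomLogger)
                and "_PROXY_" in callback.__class__.__name__
            )
            if not is_proxy_internal:
                continue  # no-log request: skip external logging
        callback.log_success(payload)

run_callbacks(
    [_PROXY_CostTracker(), LangfuseLogger()],
    litellm_params={"no-log": True},
    payload={"model": "gpt-3.5-turbo"},
)  # prints "cost tracked" only
```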
````diff
@@ -1707,7 +1716,20 @@ class Logging:
         callbacks = litellm._async_success_callback
         verbose_logger.debug(f"Async success callbacks: {callbacks}")
         for callback in callbacks:
+            # check if callback can run for this request
+            litellm_params = self.model_call_details.get("litellm_params", {})
+            if litellm_params.get("no-log", False) == True:
+                # proxy cost tracking cal backs should run
+                if not (
+                    isinstance(callback, CustomLogger)
+                    and "_PROXY_" in callback.__class__.__name__
+                ):
+                    print_verbose("no-log request, skipping logging")
+                    continue
             try:
+                if kwargs.get("no-log", False) == True:
+                    print_verbose("no-log request, skipping logging")
+                    continue
                 if callback == "cache" and litellm.cache is not None:
                     # set_cache once complete streaming response is built
                     print_verbose("async success_callback: reaches cache for logging!")
````
````diff
@@ -2985,11 +3007,13 @@ def client(original_function):
             print_verbose(
                 f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
             )
+            # check if user does not want this to be logged
             asyncio.create_task(
                 logging_obj.async_success_handler(result, start_time, end_time)
             )
             threading.Thread(
-                target=logging_obj.success_handler, args=(result, start_time, end_time)
+                target=logging_obj.success_handler,
+                args=(result, start_time, end_time),
             ).start()
 
             # RETURN RESULT
````
````diff
@@ -3892,6 +3916,7 @@ def get_litellm_params(
     proxy_server_request=None,
     acompletion=None,
     preset_cache_key=None,
+    no_log=None,
 ):
     litellm_params = {
         "acompletion": acompletion,
````
````diff
@@ -3908,6 +3933,7 @@ def get_litellm_params(
         "model_info": model_info,
         "proxy_server_request": proxy_server_request,
         "preset_cache_key": preset_cache_key,
+        "no-log": no_log,
         "stream_response": {},  # litellm_call_id: ModelResponse Dict
     }
 
````
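Taken together with the `completion()` hunks earlier, the last two hunks close the loop: the hyphenated request key becomes the `no_log` argument and is stored back under `"no-log"`, where the `Logging` handlers above look it up. A simplified sketch of that plumbing (the real `get_litellm_params` carries many more parameters and fields than shown):

```python
# Simplified plumbing sketch; the real get_litellm_params has many more
# parameters and fields than shown here.
def get_litellm_params(acompletion=None, no_log=None):
    return {
        "acompletion": acompletion,
        "no-log": no_log,
    }

kwargs = {"no-log": True}                    # e.g. forwarded from extra_body by the proxy
no_log = kwargs.get("no-log", False)         # unpacked in completion()
litellm_params = get_litellm_params(acompletion=False, no_log=no_log)
assert litellm_params.get("no-log", False) is True   # the check run by the Logging callbacks
```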