(feat) log proxy auth errors on datadog (#6931)

* add new dd type for auth errors

* add async_log_proxy_authentication_errors

* fix comment

* use async_log_proxy_authentication_errors

* test_datadog_post_call_failure_hook

* test_async_log_proxy_authentication_errors
Ishaan Jaff 2024-11-26 20:26:57 -08:00 committed by GitHub
parent aea68cbeb6
commit 4bc06392db
7 changed files with 241 additions and 9 deletions


@@ -32,10 +32,11 @@ from litellm.llms.custom_httpx.http_handler import (
     get_async_httpx_client,
     httpxSpecialProvider,
 )
+from litellm.proxy._types import UserAPIKeyAuth
+from litellm.types.integrations.datadog import *
 from litellm.types.services import ServiceLoggerPayload
 from litellm.types.utils import StandardLoggingPayload
-from .types import DD_ERRORS, DatadogPayload, DataDogStatus
 from .utils import make_json_serializable
 DD_MAX_BATCH_SIZE = 1000  # max number of logs DD API can accept
@@ -364,6 +365,38 @@ class DataDogLogger(CustomBatchLogger):
"""
return
async def async_post_call_failure_hook(
self,
request_data: dict,
original_exception: Exception,
user_api_key_dict: UserAPIKeyAuth,
):
"""
Handles Proxy Errors (not related to the LLM API), e.g. Authentication Errors
"""
import json
_exception_payload = DatadogProxyFailureHookJsonMessage(
exception=str(original_exception),
error_class=str(original_exception.__class__.__name__),
status_code=getattr(original_exception, "status_code", None),
traceback=traceback.format_exc(),
user_api_key_dict=user_api_key_dict.model_dump(),
)
json_payload = json.dumps(_exception_payload)
verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
dd_payload = DatadogPayload(
ddsource=os.getenv("DD_SOURCE", "litellm"),
ddtags="",
hostname="",
message=json_payload,
service="litellm-server",
status=DataDogStatus.ERROR,
)
self.log_queue.append(dd_payload)
def _create_v0_logging_payload(
self,
kwargs: Union[dict, Any],

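For reference, a minimal sketch of exercising the new DataDogLogger hook in isolation, modeled on the unit test added later in this commit (the DD_SITE / DD_API_KEY values are fakes; nothing is actually sent, the serialized error only lands in the logger's in-memory batch queue):

import asyncio
import os

os.environ["DD_SITE"] = "https://fake.datadoghq.com"  # fake credentials, as in the test below
os.environ["DD_API_KEY"] = "anything"

from litellm.integrations.datadog.datadog import DataDogLogger
from litellm.proxy._types import UserAPIKeyAuth


async def main():
    dd_logger = DataDogLogger()
    try:
        raise Exception("Invalid API key")
    except Exception as e:
        # new in this commit: proxy-side failures (not LLM API failures) can be logged too
        await dd_logger.async_post_call_failure_hook(
            request_data={"model": "gpt-4"},
            original_exception=e,
            user_api_key_dict=UserAPIKeyAuth(api_key="fake_key"),
        )
    # the DatadogPayload now waits in the batch queue until the periodic flush sends it
    print(dd_logger.log_queue[-1]["message"])


asyncio.run(main())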

@@ -2032,7 +2032,6 @@
         "tool_use_system_prompt_tokens": 264,
         "supports_assistant_prefill": true,
         "supports_prompt_caching": true,
-        "supports_pdf_input": true,
         "supports_response_schema": true
     },
     "claude-3-opus-20240229": {
@@ -2098,6 +2097,7 @@
         "supports_vision": true,
         "tool_use_system_prompt_tokens": 159,
         "supports_assistant_prefill": true,
+        "supports_pdf_input": true,
         "supports_prompt_caching": true,
         "supports_response_schema": true
     },


@@ -1197,12 +1197,14 @@ async def user_api_key_auth(  # noqa: PLR0915
             extra={"requester_ip": requester_ip},
         )
-        # Log this exception to OTEL
-        if open_telemetry_logger is not None:
-            await open_telemetry_logger.async_post_call_failure_hook(  # type: ignore
+        # Log this exception to OTEL, Datadog etc
+        asyncio.create_task(
+            proxy_logging_obj.async_log_proxy_authentication_errors(
                 original_exception=e,
-                request_data={},
-                user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
+                request=request,
+                parent_otel_span=parent_otel_span,
+                api_key=api_key,
             )
+        )
         if isinstance(e, litellm.BudgetExceededError):

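One design note on the hunk above: the auth-error logging is scheduled with asyncio.create_task rather than awaited, so a slow or failing logging backend cannot delay the error response returned to the client. A generic, self-contained sketch of that fire-and-forget pattern (illustrative only, not litellm code):

import asyncio


async def log_auth_error(exc: Exception) -> None:
    # stand-in for proxy_logging_obj.async_log_proxy_authentication_errors
    await asyncio.sleep(1)  # pretend this is a slow call to DataDog / OTEL
    print(f"logged in background: {exc}")


async def handle_request() -> None:
    try:
        raise PermissionError("Invalid API key")
    except PermissionError as e:
        asyncio.create_task(log_auth_error(e))  # fire-and-forget
        print("returning 401 immediately")  # the caller is not blocked by logging
        await asyncio.sleep(1.5)  # keep the loop alive so the background task can finish


asyncio.run(handle_request())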

@@ -854,6 +854,20 @@ class ProxyLogging:
),
).start()
await self._run_post_call_failure_hook_custom_loggers(
original_exception=original_exception,
request_data=request_data,
user_api_key_dict=user_api_key_dict,
)
return
async def _run_post_call_failure_hook_custom_loggers(
self,
original_exception: Exception,
request_data: dict,
user_api_key_dict: UserAPIKeyAuth,
):
for callback in litellm.callbacks:
try:
_callback: Optional[CustomLogger] = None
@@ -872,7 +886,34 @@
except Exception as e:
raise e
return
async def async_log_proxy_authentication_errors(
self,
original_exception: Exception,
request: Request,
parent_otel_span: Optional[Any],
api_key: str,
):
"""
Handler for logging Authentication Errors on the LiteLLM Proxy.
Why not use post_call_failure_hook?
- `post_call_failure_hook` calls `litellm_logging_obj.async_failure_handler`, which led to the exception being logged twice.
What does this handler do?
- Logs Authentication Errors (e.g. an invalid API key) to CustomLogger-compatible classes (OTEL, Datadog, etc.)
- Calls CustomLogger.async_post_call_failure_hook
"""
user_api_key_dict = UserAPIKeyAuth(
parent_otel_span=parent_otel_span,
token=_hash_token_if_needed(token=api_key),
)
request_data = await request.json()
await self._run_post_call_failure_hook_custom_loggers(
original_exception=original_exception,
request_data=request_data,
user_api_key_dict=user_api_key_dict,
)
pass
async def post_call_success_hook(
self,

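Because async_log_proxy_authentication_errors fans out to every registered callback, any CustomLogger-compatible integration now sees proxy authentication failures, not just DataDog. A hypothetical callback illustrating that extension point (registration via litellm.callbacks mirrors the unit test at the end of this commit):

import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth


class AuthErrorPrinter(CustomLogger):
    """Hypothetical logger: print proxy auth failures instead of shipping them anywhere."""

    async def async_post_call_failure_hook(
        self,
        request_data: dict,
        original_exception: Exception,
        user_api_key_dict: UserAPIKeyAuth,
    ):
        # invoked by ProxyLogging._run_post_call_failure_hook_custom_loggers
        print(f"proxy error: {original_exception} (hashed token: {user_api_key_dict.token})")


litellm.callbacks = [AuthErrorPrinter()]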

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import TypedDict
+from typing import Optional, TypedDict
 class DataDogStatus(str, Enum):
@@ -19,3 +19,11 @@ class DatadogPayload(TypedDict, total=False):
class DD_ERRORS(Enum):
DATADOG_413_ERROR = "Datadog API Error - Payload too large (batch is above 5MB uncompressed). If you want this logged either disable request/response logging or set `DD_BATCH_SIZE=50`"
class DatadogProxyFailureHookJsonMessage(TypedDict, total=False):
exception: str
error_class: str
status_code: Optional[int]
traceback: str
user_api_key_dict: dict

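DatadogProxyFailureHookJsonMessage is declared with total=False, so every field is optional. A small construction sketch (assuming the TypedDict is importable from litellm.types.integrations.datadog, as the star import in datadog.py above suggests):

import json
import traceback

from litellm.types.integrations.datadog import DatadogProxyFailureHookJsonMessage

try:
    raise ValueError("Invalid API key")
except ValueError as e:
    msg = DatadogProxyFailureHookJsonMessage(
        exception=str(e),
        error_class=e.__class__.__name__,
        status_code=getattr(e, "status_code", None),  # None here; 401 for a real auth error
        traceback=traceback.format_exc(),
        user_api_key_dict={"user_id": "test_user"},
    )
    # a TypedDict is a plain dict at runtime, so it serializes directly
    print(json.dumps(msg, indent=2))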

@@ -344,3 +344,81 @@ async def test_datadog_logging():
await asyncio.sleep(5)
except Exception as e:
print(e)
@pytest.mark.asyncio
async def test_datadog_post_call_failure_hook():
"""Test logging proxy failures (e.g., authentication errors) to DataDog"""
try:
from litellm.integrations.datadog.datadog import DataDogLogger
os.environ["DD_SITE"] = "https://fake.datadoghq.com"
os.environ["DD_API_KEY"] = "anything"
dd_logger = DataDogLogger()
# Create a mock for the async_client's post method
mock_post = AsyncMock()
mock_post.return_value.status_code = 202
mock_post.return_value.text = "Accepted"
dd_logger.async_client.post = mock_post
# Create a test exception
class AuthenticationError(Exception):
def __init__(self):
self.status_code = 401
super().__init__("Invalid API key")
test_exception = AuthenticationError()
# Create test request data and user API key dict
request_data = {
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hello"}],
}
user_api_key_dict = UserAPIKeyAuth(
api_key="fake_key", user_id="test_user", team_id="test_team"
)
# Call the failure hook
await dd_logger.async_post_call_failure_hook(
request_data=request_data,
original_exception=test_exception,
user_api_key_dict=user_api_key_dict,
)
# Wait for the periodic flush
await asyncio.sleep(6)
# Assert that the mock was called
assert mock_post.called, "HTTP request was not made"
# Get the arguments of the last call
args, kwargs = mock_post.call_args
# Verify endpoint
assert kwargs["url"].endswith("/api/v2/logs"), "Incorrect DataDog endpoint"
# Decode and verify payload
body = kwargs["data"]
with gzip.open(io.BytesIO(body), "rb") as f:
body = f.read().decode("utf-8")
body = json.loads(body)
assert len(body) == 1, "Expected one log entry"
log_entry = body[0]
assert log_entry["status"] == "error", "Expected error status"
assert log_entry["service"] == "litellm-server"
# Verify message content
message = json.loads(log_entry["message"])
print("logged message", json.dumps(message, indent=2))
assert message["exception"] == "Invalid API key"
assert message["error_class"] == "AuthenticationError"
assert message["status_code"] == 401
assert "traceback" in message
assert message["user_api_key_dict"]["api_key"] == "fake_key"
except Exception as e:
pytest.fail(f"Test failed with exception: {str(e)}")


@@ -2125,3 +2125,73 @@ async def test_proxy_server_prisma_setup_invalid_db():
if _old_db_url:
os.environ["DATABASE_URL"] = _old_db_url
@pytest.mark.asyncio
async def test_async_log_proxy_authentication_errors():
"""
Test if async_log_proxy_authentication_errors correctly logs authentication errors through custom loggers
"""
import json
from fastapi import Request
from litellm.proxy.utils import ProxyLogging
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
# Create a mock custom logger to verify it's called
class MockCustomLogger(CustomLogger):
def __init__(self):
self.called = False
self.exception_logged = None
self.request_data_logged = None
self.user_api_key_dict_logged = None
async def async_post_call_failure_hook(
self,
request_data: dict,
original_exception: Exception,
user_api_key_dict: UserAPIKeyAuth,
):
self.called = True
self.exception_logged = original_exception
self.request_data_logged = request_data
print("logged request_data", request_data)
if isinstance(request_data, AsyncMock):
self.request_data_logged = (
await request_data()
) # get the actual value from AsyncMock
else:
self.request_data_logged = request_data
self.user_api_key_dict_logged = user_api_key_dict
# Create test data
test_data = {"model": "gpt-4", "messages": [{"role": "user", "content": "Hello"}]}
# Create a mock request
request = Request(scope={"type": "http", "method": "POST"})
request._json = AsyncMock(return_value=test_data)
# Create a test exception
test_exception = Exception("Invalid API Key")
# Initialize ProxyLogging
mock_logger = MockCustomLogger()
litellm.callbacks = [mock_logger]
proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
# Call the method
await proxy_logging_obj.async_log_proxy_authentication_errors(
original_exception=test_exception,
request=request,
parent_otel_span=None,
api_key="test-key",
)
# Verify the mock logger was called with correct parameters
assert mock_logger.called == True
assert mock_logger.exception_logged == test_exception
assert mock_logger.request_data_logged == test_data
assert mock_logger.user_api_key_dict_logged is not None
assert (
mock_logger.user_api_key_dict_logged.token is not None
) # token should be hashed