fix(utils.py): correctly re-raise the headers from an exception, if present
Fixes an issue where the router's retry-after logic was not using the Azure / OpenAI retry values.
parent 5a2c9d5121
commit 068aafdff9
6 changed files with 228 additions and 33 deletions
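The behaviour being fixed, in one sentence: when a provider answers a 429 with a retry-after header, the router should wait the provider's number rather than a fixed default. A minimal, self-contained sketch of that idea follows; the function and variable names are illustrative, not litellm's actual helpers.

# Illustrative sketch only -- names are hypothetical, not litellm's API.
import email.utils
import time
from typing import Mapping, Optional


def retry_after_seconds(headers: Optional[Mapping[str, str]], default: float) -> float:
    """Prefer the provider-advertised retry-after value over a fixed default."""
    if not headers or "retry-after" not in headers:
        return default
    raw = headers["retry-after"]
    try:
        return max(0.0, float(raw))  # "retry-after: 9" (seconds form)
    except ValueError:
        pass
    try:
        retry_date = email.utils.parsedate_to_datetime(raw)  # HTTP-date form
        return max(0.0, retry_date.timestamp() - time.time())
    except (TypeError, ValueError):
        return default


print(retry_after_seconds({"retry-after": "9"}, default=60.0))  # 9.0, not 60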
@@ -1,12 +1,12 @@
repos:
- repo: local
  hooks:
    - id: mypy
      name: mypy
      entry: python3 -m mypy --ignore-missing-imports
      language: system
      types: [python]
      files: ^litellm/
    # - id: mypy
    #   name: mypy
    #   entry: python3 -m mypy --ignore-missing-imports
    #   language: system
    #   types: [python]
    #   files: ^litellm/
    - id: isort
      name: isort
      entry: isort
@@ -50,9 +50,11 @@ class OpenAIError(Exception):
        message,
        request: Optional[httpx.Request] = None,
        response: Optional[httpx.Response] = None,
        headers: Optional[httpx.Headers] = None,
    ):
        self.status_code = status_code
        self.message = message
        self.headers = headers
        if request:
            self.request = request
        else:
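The hunk above (in the OpenAI provider module) threads an optional headers argument through the provider error so the value survives re-raising. The same pattern in isolation, with a hypothetical error class:

# Hypothetical stand-in for an error type that carries provider response headers.
from typing import Optional

import httpx


class ProviderError(Exception):
    def __init__(self, status_code: int, message: str,
                 headers: Optional[httpx.Headers] = None):
        self.status_code = status_code
        self.message = message
        self.headers = headers  # kept so retry logic can read retry-after later
        super().__init__(message)


err = ProviderError(429, "rate limited", headers=httpx.Headers({"Retry-After": "9"}))
print(err.headers.get("retry-after"))  # "9"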
@@ -113,7 +115,7 @@ class MistralConfig:
        random_seed: Optional[int] = None,
        safe_prompt: Optional[bool] = None,
        response_format: Optional[dict] = None,
        stop: Optional[Union[str, list]] = None
        stop: Optional[Union[str, list]] = None,
    ) -> None:
        locals_ = locals().copy()
        for key, value in locals_.items():
@@ -172,7 +174,7 @@ class MistralConfig:
            if param == "top_p":
                optional_params["top_p"] = value
            if param == "stop":
                optional_params["stop"] = value
                optional_params["stop"] = value
            if param == "tool_choice" and isinstance(value, str):
                optional_params["tool_choice"] = self._map_tool_choice(
                    tool_choice=value
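For context, the surrounding method translates OpenAI-style parameters into provider-specific ones; the change keeps stop (string or list) passing through. A tiny illustrative sketch of that mapping shape, not MistralConfig's actual code:

# Illustrative parameter mapping only.
def map_params(non_default_params: dict) -> dict:
    optional_params: dict = {}
    for param, value in non_default_params.items():
        if param == "top_p":
            optional_params["top_p"] = value
        if param == "stop":  # a string or a list of strings is passed through as-is
            optional_params["stop"] = value
    return optional_params


print(map_params({"top_p": 0.9, "stop": ["\n\n"]}))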
@@ -1313,17 +1315,13 @@ class OpenAIChatCompletion(BaseLLM):
        - call embeddings.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = openai_client.embeddings.with_raw_response.create(
                    **data, timeout=timeout
                )  # type: ignore
            raw_response = openai_client.embeddings.with_raw_response.create(
                **data, timeout=timeout
            )  # type: ignore

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
                return None, response
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        except Exception as e:
            raise e
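The hunk above switches the embedding call to the OpenAI SDK's raw-response interface so rate-limit headers are always captured. A sketch of that pattern against openai>=1.x; it needs a real API key to execute, and the header key printed is only an example of what a deployment may return:

# Sketch of the raw-response pattern; model name and header key are examples.
import openai

client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment

raw = client.embeddings.with_raw_response.create(
    model="text-embedding-ada-002",
    input="Hello world!",
)
headers = dict(raw.headers)   # rate-limit / retry-after headers live here, when sent
response = raw.parse()        # the usual embedding response object
print(headers.get("x-ratelimit-remaining-requests"), len(response.data))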
@@ -1448,13 +1446,13 @@ class OpenAIChatCompletion(BaseLLM):
                response_type="embedding",
            )  # type: ignore
        except OpenAIError as e:
            exception_mapping_worked = True
            raise e
        except Exception as e:
            if hasattr(e, "status_code"):
                raise OpenAIError(status_code=e.status_code, message=str(e))
            else:
                raise OpenAIError(status_code=500, message=str(e))
            status_code = getattr(e, "status_code", 500)
            error_headers = getattr(e, "headers", None)
            raise OpenAIError(
                status_code=status_code, message=str(e), headers=error_headers
            )

    async def aimage_generation(
        self,
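The second hunk replaces hasattr branching with getattr defaults, so any exception can be wrapped without losing its status code or headers. The same idea in a standalone form; WrappedError is a hypothetical stand-in:

# Hypothetical wrapper showing the getattr-with-default re-raise pattern.
from typing import Optional

import httpx


class WrappedError(Exception):
    def __init__(self, status_code: int, message: str,
                 headers: Optional[httpx.Headers] = None):
        super().__init__(message)
        self.status_code, self.headers = status_code, headers


def reraise(exc: Exception) -> None:
    status_code = getattr(exc, "status_code", 500)  # default when the attr is absent
    error_headers = getattr(exc, "headers", None)   # default when the attr is absent
    raise WrappedError(status_code, str(exc), headers=error_headers) from exc


try:
    reraise(RuntimeError("boom"))  # carries neither status_code nor headers
except WrappedError as e:
    print(e.status_code, e.headers)  # 500 None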
@@ -90,6 +90,7 @@ from litellm.types.router import (
    RetryPolicy,
    RouterErrors,
    RouterGeneralSettings,
    RouterRateLimitError,
    updateDeployment,
    updateLiteLLMParams,
)
@@ -1939,6 +1940,7 @@ class Router:
            raise e

    def _embedding(self, input: Union[str, List], model: str, **kwargs):
        model_name = None
        try:
            verbose_router_logger.debug(
                f"Inside embedding()- model: {model}; kwargs: {kwargs}"
@@ -2813,19 +2815,27 @@ class Router:
        ):
            return 0

        response_headers: Optional[httpx.Headers] = None
        if hasattr(e, "response") and hasattr(e.response, "headers"):
            response_headers = e.response.headers
        elif hasattr(e, "litellm_response_headers"):
            response_headers = e.litellm_response_headers

        if response_headers is not None:
            timeout = litellm._calculate_retry_after(
                remaining_retries=remaining_retries,
                max_retries=num_retries,
                response_headers=e.response.headers,
                response_headers=response_headers,
                min_timeout=self.retry_after,
            )

        else:
            timeout = litellm._calculate_retry_after(
                remaining_retries=remaining_retries,
                max_retries=num_retries,
                min_timeout=self.retry_after,
            )

        return timeout

    def function_with_retries(self, *args, **kwargs):
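The retry hunk now looks for headers in two places: the raw exception's response object, or the litellm_response_headers attribute attached during exception mapping. A self-contained sketch of that selection plus a simple backoff fallback; _calculate_retry_after in the diff is litellm's own helper, while the fallback below is only illustrative:

# Illustrative: pick a retry timeout from whichever header source the exception carries.
import random
from typing import Optional

import httpx


def pick_response_headers(e: Exception) -> Optional[httpx.Headers]:
    if hasattr(e, "response") and hasattr(e.response, "headers"):
        return e.response.headers
    return getattr(e, "litellm_response_headers", None)


def retry_timeout(e: Exception, remaining_retries: int, max_retries: int,
                  min_timeout: float) -> float:
    headers = pick_response_headers(e)
    if headers is not None and "retry-after" in headers:
        return max(min_timeout, float(headers["retry-after"]))
    # fallback: exponential backoff with jitter, bounded below by min_timeout
    nb_retries = max_retries - remaining_retries
    return max(min_timeout, min(2 ** nb_retries, 60) * (0.75 + random.random() / 2))


class _FakeRateLimit(Exception):
    litellm_response_headers = httpx.Headers({"retry-after": "9"})


print(retry_timeout(_FakeRateLimit(), remaining_retries=2, max_retries=3, min_timeout=0))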
@@ -2997,8 +3007,9 @@ class Router:
            metadata = kwargs.get("litellm_params", {}).get("metadata", None)
            _model_info = kwargs.get("litellm_params", {}).get("model_info", {})

            exception_response = getattr(exception, "response", {})
            exception_headers = getattr(exception_response, "headers", None)
            exception_headers = litellm.utils._get_litellm_response_headers(
                original_exception=exception
            )
            _time_to_cooldown = kwargs.get("litellm_params", {}).get(
                "cooldown_time", self.cooldown_time
            )
@@ -4744,8 +4755,13 @@ class Router:
        )

        if len(healthy_deployments) == 0:
            raise ValueError(
                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, cooldown_list={self._get_cooldown_deployments()}"
            _cooldown_time = self.cooldown_time  # [TODO] Make dynamic
            _cooldown_list = self._get_cooldown_deployments()
            raise RouterRateLimitError(
                model=model,
                cooldown_time=_cooldown_time,
                enable_pre_call_checks=self.enable_pre_call_checks,
                cooldown_list=_cooldown_list,
            )

        if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
@@ -10,6 +10,9 @@ import traceback
import openai
import pytest

import litellm.types
import litellm.types.router

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
@@ -2184,3 +2187,126 @@ def test_router_correctly_reraise_error():
        )
    except litellm.RateLimitError:
        pass


def test_router_dynamic_cooldown_correct_retry_after_time():
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    router = Router(
        model_list=[
            {
                "model_name": "text-embedding-ada-002",
                "litellm_params": {
                    "model": "openai/text-embedding-ada-002",
                },
            }
        ]
    )

    openai_client = openai.OpenAI(api_key="")

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},
        )

    with patch.object(
        openai_client.embeddings.with_raw_response,
        "create",
        side_effect=_return_exception,
    ):
        new_retry_after_mock_client = MagicMock(return_value=-1)

        litellm.utils._get_retry_after_from_exception_header = (
            new_retry_after_mock_client
        )

        try:
            router.embedding(
                model="text-embedding-ada-002",
                input="Hello world!",
                client=openai_client,
            )
        except litellm.RateLimitError:
            pass

        new_retry_after_mock_client.assert_called()
        print(
            f"new_retry_after_mock_client.call_args.kwargs: {new_retry_after_mock_client.call_args.kwargs}"
        )

        response_headers: httpx.Headers = new_retry_after_mock_client.call_args.kwargs[
            "response_headers"
        ]
        assert "retry-after" in response_headers
        assert response_headers["retry-after"] == cooldown_time


def test_router_dynamic_cooldown_message_retry_time():
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    router = Router(
        model_list=[
            {
                "model_name": "text-embedding-ada-002",
                "litellm_params": {
                    "model": "openai/text-embedding-ada-002",
                },
            }
        ]
    )

    openai_client = openai.OpenAI(api_key="")

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},
        )

    with patch.object(
        openai_client.embeddings.with_raw_response,
        "create",
        side_effect=_return_exception,
    ):
        for _ in range(2):
            try:
                router.embedding(
                    model="text-embedding-ada-002",
                    input="Hello world!",
                    client=openai_client,
                )
            except litellm.RateLimitError:
                pass

        try:
            router.embedding(
                model="text-embedding-ada-002",
                input="Hello world!",
                client=openai_client,
            )
        except litellm.types.router.RouterRateLimitError as e:
            assert e.cooldown_time == cooldown_time
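The tests above patch the SDK client's raw-response method and assert on a mock's call kwargs. A reduced, self-contained version of that patch.object / MagicMock pattern with stand-in objects:

# Self-contained illustration of the mocking pattern used in the tests above.
from unittest.mock import MagicMock, patch


class FakeEmbeddings:
    def create(self, **kwargs):
        return {"data": []}


client = FakeEmbeddings()

with patch.object(client, "create", side_effect=RuntimeError("Rate Limited!")):
    try:
        client.create(input="Hello world!")
    except RuntimeError:
        pass

calc_mock = MagicMock(return_value=-1)          # stands in for the patched helper
calc_mock(response_headers={"retry-after": 30.0})
calc_mock.assert_called()
print(calc_mock.call_args.kwargs["response_headers"])  # {'retry-after': 30.0}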
@@ -549,3 +549,19 @@ class RouterGeneralSettings(BaseModel):
    pass_through_all_models: bool = Field(
        default=False
    )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding


class RouterRateLimitError(ValueError):
    def __init__(
        self,
        model: str,
        cooldown_time: float,
        enable_pre_call_checks: bool,
        cooldown_list: List,
    ):
        self.model = model
        self.cooldown_time = cooldown_time
        self.enable_pre_call_checks = enable_pre_call_checks
        self.cooldown_list = cooldown_list
        _message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
        super().__init__(_message)
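Raising a typed error instead of a formatted ValueError lets callers read the cooldown programmatically. A sketch of how a caller might use such an error; the class is redefined locally (with an abbreviated message) so the snippet stands alone:

# Local, abbreviated copy for illustration; mirrors the class in the hunk above.
from typing import List


class RouterRateLimitError(ValueError):
    def __init__(self, model: str, cooldown_time: float,
                 enable_pre_call_checks: bool, cooldown_list: List):
        self.model = model
        self.cooldown_time = cooldown_time
        self.enable_pre_call_checks = enable_pre_call_checks
        self.cooldown_list = cooldown_list
        super().__init__(
            f"No deployments available for selected model, Try again in "
            f"{cooldown_time} seconds. Passed model={model}."
        )


try:
    raise RouterRateLimitError("text-embedding-ada-002", 9.0, False, [])
except RouterRateLimitError as e:
    # callers can schedule retries off the number instead of parsing the message
    print(e.cooldown_time)  # 9.0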
@@ -6339,6 +6339,7 @@ def _get_retry_after_from_exception_header(
            retry_after = int(retry_date - time.time())
        else:
            retry_after = -1

        return retry_after

    except Exception as e:
@@ -6520,6 +6521,40 @@ def get_model_list():


####### EXCEPTION MAPPING ################
def _get_litellm_response_headers(
    original_exception: Exception,
) -> Optional[httpx.Headers]:
    """
    Extract and return the response headers from a mapped exception, if present.

    Used for accurate retry logic.
    """
    _response_headers: Optional[httpx.Headers] = None
    try:
        _response_headers = getattr(
            original_exception, "litellm_response_headers", None
        )
    except Exception:
        return None

    return _response_headers


def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]:
    """
    Extract and return the response headers from an exception, if present.

    Used for accurate retry logic.
    """
    _response_headers: Optional[httpx.Headers] = None
    try:
        _response_headers = getattr(original_exception, "headers", None)
    except Exception:
        return None

    return _response_headers


def exception_type(
    model,
    original_exception,
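Both helpers above hand back an httpx.Headers object (or None). Worth noting for downstream consumers: httpx header lookups are case-insensitive and values are strings, so retry-after matches regardless of provider casing but still needs numeric parsing. A quick check:

# httpx.Headers lookups are case-insensitive; values come back as strings.
import httpx

headers = httpx.Headers({"Retry-After": "9", "x-ratelimit-remaining-requests": "0"})
print("retry-after" in headers)       # True
print(headers.get("RETRY-AFTER"))     # "9"
print(float(headers["retry-after"]))  # 9.0 -- callers still parse the number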
@@ -6544,6 +6579,10 @@ def exception_type(
            "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
        )  # noqa
        print()  # noqa

    litellm_response_headers = _get_response_headers(
        original_exception=original_exception
    )
    try:
        if model:
            if hasattr(original_exception, "message"):
@@ -8422,20 +8461,20 @@ def exception_type(
            threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
        # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
        if exception_mapping_worked:
            setattr(e, "litellm_response_headers", litellm_response_headers)
            raise e
        else:
            for error_type in litellm.LITELLM_EXCEPTION_TYPES:
                if isinstance(e, error_type):
                    setattr(e, "litellm_response_headers", litellm_response_headers)
                    raise e  # it's already mapped
            raise APIConnectionError(
            raised_exc = APIConnectionError(
                message="{}\n{}".format(original_exception, traceback.format_exc()),
                llm_provider="",
                model="",
                request=httpx.Request(
                    method="POST",
                    url="https://www.litellm.ai/",
                ),
            )
            setattr(raised_exc, "litellm_response_headers", _response_headers)
            raise raised_exc


######### Secret Manager ############################
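The tail of exception_type attaches the captured headers onto whichever exception is re-raised, so retry logic can read them without knowing the concrete exception class. A minimal sketch of that attach/read round trip (names illustrative):

# Illustrative: attach response headers to an exception before re-raising, then
# read them back downstream with getattr, as the router-side code does.
from typing import Optional

import httpx


def mapped_raise(exc: Exception, response_headers: Optional[httpx.Headers]) -> None:
    setattr(exc, "litellm_response_headers", response_headers)
    raise exc


try:
    mapped_raise(ValueError("rate limited"), httpx.Headers({"retry-after": "9"}))
except ValueError as e:
    headers = getattr(e, "litellm_response_headers", None)
    print(None if headers is None else headers.get("retry-after"))  # "9"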