forked from phoenix/litellm-mirror
fix(utils.py): correctly re-raise the headers from an exception, if present
Fixes an issue where the router's retry-after was not using the Azure / OpenAI retry values
parent 5a2c9d5121
commit 068aafdff9
6 changed files with 228 additions and 33 deletions
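In short, the intended flow looks like this (a minimal illustrative sketch, not code from this commit; `FakeMappedRateLimitError` is a hypothetical stand-in for a mapped litellm exception):

```python
# Illustrative sketch of the fix's intent. An Azure/OpenAI 429 response carries a
# retry-after header; the mapped exception keeps those headers, and the router can
# read them instead of falling back to its fixed retry_after default.
import httpx


class FakeMappedRateLimitError(Exception):  # hypothetical stand-in, not a litellm class
    def __init__(self, headers: httpx.Headers):
        super().__init__("rate limited")
        self.headers = headers  # OpenAIError now also stores the response headers (see diff)


exc = FakeMappedRateLimitError(headers=httpx.Headers({"retry-after": "9"}))

# utils.exception_type() now attaches the headers to whatever exception it re-raises:
setattr(exc, "litellm_response_headers", getattr(exc, "headers", None))

# The router's retry logic can then prefer the provider's number (9s here)
# over the generic "Try again in 60 seconds" message.
print(exc.litellm_response_headers["retry-after"])  # -> "9"
```

The diffs below wire this up: `OpenAIError` stores the headers, `exception_type()` re-attaches them as `litellm_response_headers`, and the router prefers them when computing its retry timeout.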
.pre-commit-config.yaml

@@ -1,12 +1,12 @@
 repos:
 - repo: local
   hooks:
-    - id: mypy
-      name: mypy
-      entry: python3 -m mypy --ignore-missing-imports
-      language: system
-      types: [python]
-      files: ^litellm/
+    # - id: mypy
+    #   name: mypy
+    #   entry: python3 -m mypy --ignore-missing-imports
+    #   language: system
+    #   types: [python]
+    #   files: ^litellm/
     - id: isort
       name: isort
       entry: isort
litellm/llms/openai.py

@@ -50,9 +50,11 @@ class OpenAIError(Exception):
         message,
         request: Optional[httpx.Request] = None,
         response: Optional[httpx.Response] = None,
+        headers: Optional[httpx.Headers] = None,
     ):
         self.status_code = status_code
         self.message = message
+        self.headers = headers
         if request:
             self.request = request
         else:
@@ -113,7 +115,7 @@ class MistralConfig:
         random_seed: Optional[int] = None,
         safe_prompt: Optional[bool] = None,
         response_format: Optional[dict] = None,
-        stop: Optional[Union[str, list]] = None
+        stop: Optional[Union[str, list]] = None,
     ) -> None:
         locals_ = locals().copy()
         for key, value in locals_.items():
@@ -172,7 +174,7 @@ class MistralConfig:
             if param == "top_p":
                 optional_params["top_p"] = value
             if param == "stop":
                 optional_params["stop"] = value
             if param == "tool_choice" and isinstance(value, str):
                 optional_params["tool_choice"] = self._map_tool_choice(
                     tool_choice=value
@@ -1313,17 +1315,13 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = openai_client.embeddings.with_raw_response.create(
-                    **data, timeout=timeout
-                )  # type: ignore
+            raw_response = openai_client.embeddings.with_raw_response.create(
+                **data, timeout=timeout
+            )  # type: ignore
 
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
         except Exception as e:
             raise e
 
@@ -1448,13 +1446,13 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
             )  # type: ignore
         except OpenAIError as e:
-            exception_mapping_worked = True
             raise e
         except Exception as e:
-            if hasattr(e, "status_code"):
-                raise OpenAIError(status_code=e.status_code, message=str(e))
-            else:
-                raise OpenAIError(status_code=500, message=str(e))
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     async def aimage_generation(
         self,
litellm/router.py

@@ -90,6 +90,7 @@ from litellm.types.router import (
     RetryPolicy,
     RouterErrors,
     RouterGeneralSettings,
+    RouterRateLimitError,
     updateDeployment,
     updateLiteLLMParams,
 )
@@ -1939,6 +1940,7 @@ class Router:
             raise e
 
     def _embedding(self, input: Union[str, List], model: str, **kwargs):
+        model_name = None
         try:
             verbose_router_logger.debug(
                 f"Inside embedding()- model: {model}; kwargs: {kwargs}"
@@ -2813,19 +2815,27 @@ class Router:
         ):
             return 0
 
+        response_headers: Optional[httpx.Headers] = None
         if hasattr(e, "response") and hasattr(e.response, "headers"):
+            response_headers = e.response.headers
+        elif hasattr(e, "litellm_response_headers"):
+            response_headers = e.litellm_response_headers
+
+        if response_headers is not None:
             timeout = litellm._calculate_retry_after(
                 remaining_retries=remaining_retries,
                 max_retries=num_retries,
-                response_headers=e.response.headers,
+                response_headers=response_headers,
                 min_timeout=self.retry_after,
             )
         else:
             timeout = litellm._calculate_retry_after(
                 remaining_retries=remaining_retries,
                 max_retries=num_retries,
                 min_timeout=self.retry_after,
             )
 
         return timeout
 
     def function_with_retries(self, *args, **kwargs):
@@ -2997,8 +3007,9 @@ class Router:
         metadata = kwargs.get("litellm_params", {}).get("metadata", None)
         _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
 
-        exception_response = getattr(exception, "response", {})
-        exception_headers = getattr(exception_response, "headers", None)
+        exception_headers = litellm.utils._get_litellm_response_headers(
+            original_exception=exception
+        )
         _time_to_cooldown = kwargs.get("litellm_params", {}).get(
             "cooldown_time", self.cooldown_time
         )
@@ -4744,8 +4755,13 @@ class Router:
         )
 
         if len(healthy_deployments) == 0:
-            raise ValueError(
-                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, cooldown_list={self._get_cooldown_deployments()}"
+            _cooldown_time = self.cooldown_time  # [TODO] Make dynamic
+            _cooldown_list = self._get_cooldown_deployments()
+            raise RouterRateLimitError(
+                model=model,
+                cooldown_time=_cooldown_time,
+                enable_pre_call_checks=self.enable_pre_call_checks,
+                cooldown_list=_cooldown_list,
             )
 
         if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
litellm/tests/test_router.py

@@ -10,6 +10,9 @@ import traceback
 import openai
 import pytest
 
+import litellm.types
+import litellm.types.router
+
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
@@ -2184,3 +2187,126 @@ def test_router_correctly_reraise_error():
         )
     except litellm.RateLimitError:
         pass
+
+
+def test_router_dynamic_cooldown_correct_retry_after_time():
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "text-embedding-ada-002",
+                "litellm_params": {
+                    "model": "openai/text-embedding-ada-002",
+                },
+            }
+        ]
+    )
+
+    openai_client = openai.OpenAI(api_key="")
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},
+        )
+
+    with patch.object(
+        openai_client.embeddings.with_raw_response,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            router.embedding(
+                model="text-embedding-ada-002",
+                input="Hello world!",
+                client=openai_client,
+            )
+        except litellm.RateLimitError:
+            pass
+
+        new_retry_after_mock_client.assert_called()
+        print(
+            f"new_retry_after_mock_client.call_args.kwargs: {new_retry_after_mock_client.call_args.kwargs}"
+        )
+
+        response_headers: httpx.Headers = new_retry_after_mock_client.call_args.kwargs[
+            "response_headers"
+        ]
+        assert "retry-after" in response_headers
+        assert response_headers["retry-after"] == cooldown_time
+
+
+def test_router_dynamic_cooldown_message_retry_time():
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "text-embedding-ada-002",
+                "litellm_params": {
+                    "model": "openai/text-embedding-ada-002",
+                },
+            }
+        ]
+    )
+
+    openai_client = openai.OpenAI(api_key="")
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},
+        )
+
+    with patch.object(
+        openai_client.embeddings.with_raw_response,
+        "create",
+        side_effect=_return_exception,
+    ):
+        for _ in range(2):
+            try:
+                router.embedding(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+            except litellm.RateLimitError:
+                pass
+
+        try:
+            router.embedding(
+                model="text-embedding-ada-002",
+                input="Hello world!",
+                client=openai_client,
+            )
+        except litellm.types.router.RouterRateLimitError as e:
+            assert e.cooldown_time == cooldown_time
litellm/types/router.py

@@ -549,3 +549,19 @@ class RouterGeneralSettings(BaseModel):
     pass_through_all_models: bool = Field(
         default=False
     )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
+
+
+class RouterRateLimitError(ValueError):
+    def __init__(
+        self,
+        model: str,
+        cooldown_time: float,
+        enable_pre_call_checks: bool,
+        cooldown_list: List,
+    ):
+        self.model = model
+        self.cooldown_time = cooldown_time
+        self.enable_pre_call_checks = enable_pre_call_checks
+        self.cooldown_list = cooldown_list
+        _message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
+        super().__init__(_message)
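For callers, the new error type exposes the actual cooldown as attributes instead of a hard-coded message. A minimal usage sketch (assumes a router configured like the one in the tests above; not part of the diff):

```python
from litellm import Router
from litellm.types.router import RouterRateLimitError

router = Router(
    model_list=[
        {
            "model_name": "text-embedding-ada-002",
            "litellm_params": {"model": "openai/text-embedding-ada-002"},
        }
    ]
)

try:
    router.embedding(model="text-embedding-ada-002", input="Hello world!")
except RouterRateLimitError as e:
    # cooldown_time / cooldown_list are now attributes, not just text buried in a ValueError
    print(f"all deployments cooling down; retry {e.model} in {e.cooldown_time}s")
```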
litellm/utils.py

@@ -6339,6 +6339,7 @@ def _get_retry_after_from_exception_header(
                 retry_after = int(retry_date - time.time())
             else:
                 retry_after = -1
+
         return retry_after
 
     except Exception as e:
@@ -6520,6 +6521,40 @@ def get_model_list():
 
 
 ####### EXCEPTION MAPPING ################
+def _get_litellm_response_headers(
+    original_exception: Exception,
+) -> Optional[httpx.Headers]:
+    """
+    Extract and return the response headers from a mapped exception, if present.
+
+    Used for accurate retry logic.
+    """
+    _response_headers: Optional[httpx.Headers] = None
+    try:
+        _response_headers = getattr(
+            original_exception, "litellm_response_headers", None
+        )
+    except Exception:
+        return None
+
+    return _response_headers
+
+
+def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]:
+    """
+    Extract and return the response headers from an exception, if present.
+
+    Used for accurate retry logic.
+    """
+    _response_headers: Optional[httpx.Headers] = None
+    try:
+        _response_headers = getattr(original_exception, "headers", None)
+    except Exception:
+        return None
+
+    return _response_headers
+
+
 def exception_type(
     model,
     original_exception,
@@ -6544,6 +6579,10 @@ def exception_type(
         "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
     )  # noqa
     print()  # noqa
+
+    litellm_response_headers = _get_response_headers(
+        original_exception=original_exception
+    )
     try:
         if model:
             if hasattr(original_exception, "message"):
@@ -8422,20 +8461,20 @@ def exception_type(
             threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
     # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
     if exception_mapping_worked:
+        setattr(e, "litellm_response_headers", litellm_response_headers)
         raise e
     else:
         for error_type in litellm.LITELLM_EXCEPTION_TYPES:
             if isinstance(e, error_type):
+                setattr(e, "litellm_response_headers", litellm_response_headers)
                 raise e  # it's already mapped
-        raise APIConnectionError(
+        raised_exc = APIConnectionError(
             message="{}\n{}".format(original_exception, traceback.format_exc()),
             llm_provider="",
             model="",
-            request=httpx.Request(
-                method="POST",
-                url="https://www.litellm.ai/",
-            ),
         )
+        setattr(raised_exc, "litellm_response_headers", _response_headers)
+        raise raised_exc
 
 
######### Secret Manager ############################