Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)
fix(openai.py): coverage for correctly re-raising exception headers on openai chat completion + embedding endpoints
This commit is contained in:
parent 068aafdff9
commit de2373d52b

3 changed files with 153 additions and 33 deletions
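
The hunk below adds test coverage for the behavior named in the commit message: when an OpenAI chat completion or embedding call fails, the provider's response headers (notably `retry-after`) are re-raised on the litellm exception. As a caller-side illustration only (the model name and message here are placeholders, not part of this commit), the re-raised headers can be read like this:

```python
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello world"}],
    )
except litellm.RateLimitError as e:
    # With this fix the provider's response headers ride along on the
    # exception, so a caller can honor the provider's own retry-after value
    # instead of a hard-coded cooldown.
    if e.litellm_response_headers is not None:
        print("provider retry-after:", e.litellm_response_headers["retry-after"])
```

This mirrors what the new test asserts: `litellm_response_headers` is populated on the raised `RateLimitError`, and its `retry-after` entry matches the header returned by the mocked endpoint.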
@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
        )
    except litellm.BadRequestError:
        pass


from typing import Optional, Union

from openai import AsyncOpenAI, OpenAI


def _pre_call_utils(
    call_type: str,
    data: dict,
    client: Union[OpenAI, AsyncOpenAI],
    sync_mode: bool,
    streaming: Optional[bool],
):
    if call_type == "embedding":
        data["input"] = "Hello world!"
        mapped_target = client.embeddings.with_raw_response
        if sync_mode:
            original_function = litellm.embedding
        else:
            original_function = litellm.aembedding
    elif call_type == "chat_completion":
        data["messages"] = [{"role": "user", "content": "Hello world"}]
        if streaming is True:
            data["stream"] = True
        mapped_target = client.chat.completions.with_raw_response
        if sync_mode:
            original_function = litellm.completion
        else:
            original_function = litellm.acompletion

    return data, original_function, mapped_target


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
)
@pytest.mark.parametrize(
    "model, call_type, streaming",
    [
        ("text-embedding-ada-002", "embedding", None),
        ("gpt-3.5-turbo", "chat_completion", False),
        ("gpt-3.5-turbo", "chat_completion", True),
    ],
)
@pytest.mark.asyncio
async def test_exception_with_headers(sync_mode, model, call_type, streaming):
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    import openai

    if sync_mode:
        openai_client = openai.OpenAI(api_key="")
    else:
        openai_client = openai.AsyncOpenAI(api_key="")

    data = {"model": model}
    data, original_function, mapped_target = _pre_call_utils(
        call_type=call_type,
        data=data,
        client=openai_client,
        sync_mode=sync_mode,
        streaming=streaming,
    )

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},  # type: ignore
        )

    with patch.object(
        mapped_target,
        "create",
        side_effect=_return_exception,
    ):
        new_retry_after_mock_client = MagicMock(return_value=-1)

        litellm.utils._get_retry_after_from_exception_header = (
            new_retry_after_mock_client
        )

        try:
            if sync_mode:
                resp = original_function(
                    model="text-embedding-ada-002",
                    input="Hello world!",
                    client=openai_client,
                )
                if streaming:
                    for chunk in resp:
                        continue
            else:
                resp = await original_function(
                    model="text-embedding-ada-002",
                    input="Hello world!",
                    client=openai_client,
                )

                if streaming:
                    async for chunk in resp:
                        continue

        except litellm.RateLimitError as e:
            assert e.litellm_response_headers is not None
            assert e.litellm_response_headers["retry-after"] == cooldown_time
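
The two `parametrize` decorators expand this into six cases: sync and async, crossed with embedding, non-streaming chat completion, and streaming chat completion. A minimal way to run just these cases locally, assuming the hunk lands in litellm's test suite (this diff view does not name the file, so the path below is a placeholder):

```python
# run_header_test.py: the test file path is a placeholder; substitute the
# actual file this hunk belongs to.
import pytest

if __name__ == "__main__":
    raise SystemExit(
        pytest.main(["-vv", "-k", "test_exception_with_headers", "tests/test_exceptions.py"])
    )
```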