forked from phoenix/litellm-mirror
fix(openai.py): coverage for correctly re-raising exception headers on openai chat completion + embedding endpoints
parent 068aafdff9
commit de2373d52b
3 changed files with 153 additions and 33 deletions
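
For orientation before the hunks: every error path touched in openai.py now distinguishes errors that already carry HTTP metadata from those that do not, wrapping the latter so the status code and response headers survive the re-raise. A self-contained sketch of that pattern, using a simplified stand-in for litellm's OpenAIError and a hypothetical helper name (in the diff itself the logic is inlined in each except block):

```python
from typing import Optional


class OpenAIError(Exception):
    # simplified stand-in for litellm's OpenAIError, for illustration only
    def __init__(self, status_code: int, message: str, headers: Optional[dict] = None):
        super().__init__(message)
        self.status_code = status_code
        self.message = message
        self.headers = headers


def reraise_preserving_headers(e: Exception) -> None:
    """Re-raise `e` without losing whatever HTTP metadata the client attached."""
    if isinstance(e, OpenAIError):
        raise e  # already carries status_code / headers
    status_code = getattr(e, "status_code", 500)
    error_headers = getattr(e, "headers", None)
    raise OpenAIError(status_code=status_code, message=str(e), headers=error_headers)
```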
@@ -786,8 +786,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def make_sync_openai_chat_completion_request(
         self,
@@ -801,7 +807,6 @@ class OpenAIChatCompletion(BaseLLM):
         - call chat.completions.create by default
         """
         try:
-            if litellm.return_response_headers is True:
             raw_response = openai_client.chat.completions.with_raw_response.create(
                 **data, timeout=timeout
             )
@@ -809,13 +814,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-            else:
-                response = openai_client.chat.completions.create(
-                    **data, timeout=timeout
-                )
-                return None, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def completion(
         self,
@@ -1290,16 +1296,12 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
             raw_response = await openai_aclient.embeddings.with_raw_response.create(
                 **data, timeout=timeout
             )  # type: ignore
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-            else:
-                response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
         except Exception as e:
             raise e
 
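
For context, `with_raw_response` is the openai SDK's wrapper that exposes the HTTP layer alongside the parsed result, which is what lets these request helpers return headers at all. A minimal standalone sketch of that usage (placeholder API key, illustrative model name):

```python
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # placeholder key, for illustration only

# .with_raw_response returns an object carrying the raw HTTP response
raw_response = client.embeddings.with_raw_response.create(
    model="text-embedding-ada-002", input="Hello world!"
)
headers = dict(raw_response.headers)  # e.g. rate-limit / retry-after headers
response = raw_response.parse()       # the usual typed embedding response
```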
@@ -1365,14 +1367,14 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
                 _response_headers=headers,
             )  # type: ignore
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def embedding(
         self,
@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
         )
     except litellm.BadRequestError:
         pass
+
+
+from typing import Optional, Union
+
+from openai import AsyncOpenAI, OpenAI
+
+
+def _pre_call_utils(
+    call_type: str,
+    data: dict,
+    client: Union[OpenAI, AsyncOpenAI],
+    sync_mode: bool,
+    streaming: Optional[bool],
+):
+    if call_type == "embedding":
+        data["input"] = "Hello world!"
+        mapped_target = client.embeddings.with_raw_response
+        if sync_mode:
+            original_function = litellm.embedding
+        else:
+            original_function = litellm.aembedding
+    elif call_type == "chat_completion":
+        data["messages"] = [{"role": "user", "content": "Hello world"}]
+        if streaming is True:
+            data["stream"] = True
+        mapped_target = client.chat.completions.with_raw_response
+        if sync_mode:
+            original_function = litellm.completion
+        else:
+            original_function = litellm.acompletion
+
+    return data, original_function, mapped_target
+
+
+@pytest.mark.parametrize(
+    "sync_mode",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "model, call_type, streaming",
+    [
+        ("text-embedding-ada-002", "embedding", None),
+        ("gpt-3.5-turbo", "chat_completion", False),
+        ("gpt-3.5-turbo", "chat_completion", True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_exception_with_headers(sync_mode, model, call_type, streaming):
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    import openai
+
+    if sync_mode:
+        openai_client = openai.OpenAI(api_key="")
+    else:
+        openai_client = openai.AsyncOpenAI(api_key="")
+
+    data = {"model": model}
+    data, original_function, mapped_target = _pre_call_utils(
+        call_type=call_type,
+        data=data,
+        client=openai_client,
+        sync_mode=sync_mode,
+        streaming=streaming,
+    )
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},  # type: ignore
+        )
+
+    with patch.object(
+        mapped_target,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            if sync_mode:
+                resp = original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+                if streaming:
+                    for chunk in resp:
+                        continue
+            else:
+                resp = await original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+
+                if streaming:
+                    async for chunk in resp:
+                        continue
+
+        except litellm.RateLimitError as e:
+            assert e.litellm_response_headers is not None
+            assert e.litellm_response_headers["retry-after"] == cooldown_time
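
The new test asserts that a provider's `retry-after` header survives onto the mapped exception as `litellm_response_headers`, which is what the user feedback quoted in the docstring asks for (honor Azure's suggested 9s instead of a fixed 60s cooldown). A hedged sketch of how a caller might use that attribute (the attribute and exception names are the ones the test asserts on; the retry helper itself is illustrative, not part of this commit):

```python
import time

import litellm


def embed_with_provider_backoff(text: str):
    try:
        return litellm.embedding(model="text-embedding-ada-002", input=text)
    except litellm.RateLimitError as e:
        # headers re-raised by this commit; fall back to 60s if absent
        headers = e.litellm_response_headers or {}
        wait = float(headers.get("retry-after", 60))
        time.sleep(wait)
        return litellm.embedding(model="text-embedding-ada-002", input=text)
```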
@@ -2189,7 +2189,7 @@ def test_router_correctly_reraise_error():
         pass
 
 
-def test_router_dynamic_cooldown_correct_retry_after_time():
+def test_router_dynamic_cooldown_correct_retry_after_time(sync_mode):
     """
     User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
     but Azure says to retry in at most 9s
@@ -2219,7 +2219,7 @@ def test_router_dynamic_cooldown_correct_retry_after_time():
         raise HTTPException(
             status_code=429,
             detail="Rate Limited!",
-            headers={"retry-after": cooldown_time},
+            headers={"retry-after": cooldown_time},  # type: ignore
         )
 
     with patch.object(
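
On the `# type: ignore` added above: FastAPI's `HTTPException` annotates its `headers` parameter as `Dict[str, str]`, so passing the float `cooldown_time` directly needs the suppression. A minimal sketch of the stricter variant, if string header values are preferred (any assertion that later compares the header back against `cooldown_time` would then need a matching `float()` conversion):

```python
from fastapi import HTTPException

cooldown_time = 30.0


def _return_exception(*args, **kwargs):
    # header values are strings on the wire; a reader can float() them back
    raise HTTPException(
        status_code=429,
        detail="Rate Limited!",
        headers={"retry-after": str(cooldown_time)},
    )
```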