diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 9b33f3cac..41ba526ac 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -786,8 +786,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def make_sync_openai_chat_completion_request(
         self,
@@ -801,21 +807,21 @@ class OpenAIChatCompletion(BaseLLM):
         - call chat.completions.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = openai_client.chat.completions.with_raw_response.create(
-                    **data, timeout=timeout
-                )
+            raw_response = openai_client.chat.completions.with_raw_response.create(
+                **data, timeout=timeout
+            )
 
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = openai_client.chat.completions.create(
-                    **data, timeout=timeout
-                )
-                return None, response
-        except Exception as e:
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def completion(
         self,
@@ -1290,16 +1296,12 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = await openai_aclient.embeddings.with_raw_response.create(
-                    **data, timeout=timeout
-                )  # type: ignore
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
+            raw_response = await openai_aclient.embeddings.with_raw_response.create(
+                **data, timeout=timeout
+            )  # type: ignore
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
         except Exception as e:
             raise e
 
@@ -1365,14 +1367,14 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
                 _response_headers=headers,
             )  # type: ignore
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def embedding(
         self,
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index 806e19569..a53e8f3a9 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
         )
     except litellm.BadRequestError:
         pass
+
+
+from typing import Optional, Union
+
+from openai import AsyncOpenAI, OpenAI
+
+
+def _pre_call_utils(
+    call_type: str,
+    data: dict,
+    client: Union[OpenAI, AsyncOpenAI],
+    sync_mode: bool,
+    streaming: Optional[bool],
+):
+    if call_type == "embedding":
+        data["input"] = "Hello world!"
+        mapped_target = client.embeddings.with_raw_response
+        if sync_mode:
+            original_function = litellm.embedding
+        else:
+            original_function = litellm.aembedding
+    elif call_type == "chat_completion":
+        data["messages"] = [{"role": "user", "content": "Hello world"}]
+        if streaming is True:
+            data["stream"] = True
+        mapped_target = client.chat.completions.with_raw_response
+        if sync_mode:
+            original_function = litellm.completion
+        else:
+            original_function = litellm.acompletion
+
+    return data, original_function, mapped_target
+
+
+@pytest.mark.parametrize(
+    "sync_mode",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "model, call_type, streaming",
+    [
+        ("text-embedding-ada-002", "embedding", None),
+        ("gpt-3.5-turbo", "chat_completion", False),
+        ("gpt-3.5-turbo", "chat_completion", True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_exception_with_headers(sync_mode, model, call_type, streaming):
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    import openai
+
+    if sync_mode:
+        openai_client = openai.OpenAI(api_key="")
+    else:
+        openai_client = openai.AsyncOpenAI(api_key="")
+
+    data = {"model": model}
+    data, original_function, mapped_target = _pre_call_utils(
+        call_type=call_type,
+        data=data,
+        client=openai_client,
+        sync_mode=sync_mode,
+        streaming=streaming,
+    )
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},  # type: ignore
+        )
+
+    with patch.object(
+        mapped_target,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            if sync_mode:
+                resp = original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+                if streaming:
+                    for chunk in resp:
+                        continue
+            else:
+                resp = await original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+
+                if streaming:
+                    async for chunk in resp:
+                        continue
+
+        except litellm.RateLimitError as e:
+            assert e.litellm_response_headers is not None
+            assert e.litellm_response_headers["retry-after"] == cooldown_time
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index 2bf4f55b9..6e58a04f9 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -2189,7 +2189,7 @@ def test_router_correctly_reraise_error():
     pass
 
 
-def test_router_dynamic_cooldown_correct_retry_after_time():
+def test_router_dynamic_cooldown_correct_retry_after_time(sync_mode):
     """
     User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
     but Azure says to retry in at most 9s
@@ -2219,7 +2219,7 @@ def test_router_dynamic_cooldown_correct_retry_after_time():
         raise HTTPException(
             status_code=429,
             detail="Rate Limited!",
-            headers={"retry-after": cooldown_time},
+            headers={"retry-after": cooldown_time},  # type: ignore
        )
 
    with patch.object(
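Below is a minimal, illustrative sketch (not part of the patch) of the behavior the new tests exercise: once provider errors are re-raised as `OpenAIError` with their response headers attached, a caller can read the provider's real `retry-after` value from the raised `litellm.RateLimitError` instead of falling back to a fixed 60-second cooldown. The model name and the 60-second fallback below are example choices, not values taken from the diff.

```python
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello world"}],
    )
except litellm.RateLimitError as e:
    # The patch surfaces the provider's response headers on the exception,
    # so the provider's own retry-after can drive the cooldown period.
    headers = e.litellm_response_headers or {}
    retry_after = float(headers.get("retry-after", 60))  # 60s fallback is an assumption
    print(f"rate limited, retry in {retry_after}s")
```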