fix(openai.py): coverage for correctly re-raising exception headers on openai chat completion + embedding endpoints

Krrish Dholakia 2024-08-24 12:55:15 -07:00
parent 068aafdff9
commit de2373d52b
3 changed files with 153 additions and 33 deletions
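
For context on why the headers matter: once these wrappers re-raise them, a caller can honor the provider's `retry-after` instead of a hard-coded cooldown. A minimal consumer-side sketch (not part of this commit): `litellm_response_headers` is the attribute the new test below asserts on, while the retry loop and function name are illustrative, and a valid OpenAI API key is assumed.

```python
import time

import litellm


def embed_with_server_backoff(text: str, max_attempts: int = 3):
    """Retry an embedding call, sleeping for the provider-supplied retry-after."""
    for attempt in range(max_attempts):
        try:
            return litellm.embedding(model="text-embedding-ada-002", input=text)
        except litellm.RateLimitError as e:
            # Headers preserved by the re-raise logic in this commit; default to 1s.
            headers = getattr(e, "litellm_response_headers", None) or {}
            retry_after = float(headers.get("retry-after", 1))
            if attempt == max_attempts - 1:
                raise
            time.sleep(retry_after)
```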


@@ -786,8 +786,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )

     def make_sync_openai_chat_completion_request(
         self,
@@ -801,21 +807,21 @@ class OpenAIChatCompletion(BaseLLM):
         - call chat.completions.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = openai_client.chat.completions.with_raw_response.create(
-                    **data, timeout=timeout
-                )
-
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = openai_client.chat.completions.create(
-                    **data, timeout=timeout
-                )
-                return None, response
-        except Exception as e:
+            raw_response = openai_client.chat.completions.with_raw_response.create(
+                **data, timeout=timeout
+            )
+
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )

     def completion(
         self,
@@ -1290,16 +1296,12 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = await openai_aclient.embeddings.with_raw_response.create(
-                    **data, timeout=timeout
-                )  # type: ignore
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
+            raw_response = await openai_aclient.embeddings.with_raw_response.create(
+                **data, timeout=timeout
+            )  # type: ignore
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
         except Exception as e:
             raise e
@@ -1365,14 +1367,14 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
                 _response_headers=headers,
             )  # type: ignore
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )

     def embedding(
         self,


@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
         )
     except litellm.BadRequestError:
         pass
+
+
+from typing import Optional, Union
+
+from openai import AsyncOpenAI, OpenAI
+
+
+def _pre_call_utils(
+    call_type: str,
+    data: dict,
+    client: Union[OpenAI, AsyncOpenAI],
+    sync_mode: bool,
+    streaming: Optional[bool],
+):
+    if call_type == "embedding":
+        data["input"] = "Hello world!"
+        mapped_target = client.embeddings.with_raw_response
+        if sync_mode:
+            original_function = litellm.embedding
+        else:
+            original_function = litellm.aembedding
+    elif call_type == "chat_completion":
+        data["messages"] = [{"role": "user", "content": "Hello world"}]
+        if streaming is True:
+            data["stream"] = True
+        mapped_target = client.chat.completions.with_raw_response
+        if sync_mode:
+            original_function = litellm.completion
+        else:
+            original_function = litellm.acompletion
+
+    return data, original_function, mapped_target
+
+
+@pytest.mark.parametrize(
+    "sync_mode",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "model, call_type, streaming",
+    [
+        ("text-embedding-ada-002", "embedding", None),
+        ("gpt-3.5-turbo", "chat_completion", False),
+        ("gpt-3.5-turbo", "chat_completion", True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_exception_with_headers(sync_mode, model, call_type, streaming):
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    import openai
+
+    if sync_mode:
+        openai_client = openai.OpenAI(api_key="")
+    else:
+        openai_client = openai.AsyncOpenAI(api_key="")
+
+    data = {"model": model}
+    data, original_function, mapped_target = _pre_call_utils(
+        call_type=call_type,
+        data=data,
+        client=openai_client,
+        sync_mode=sync_mode,
+        streaming=streaming,
+    )
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},  # type: ignore
+        )
+
+    with patch.object(
+        mapped_target,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            if sync_mode:
+                resp = original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+
+                if streaming:
+                    for chunk in resp:
+                        continue
+            else:
+                resp = await original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+
+                if streaming:
+                    async for chunk in resp:
+                        continue
+        except litellm.RateLimitError as e:
+            assert e.litellm_response_headers is not None
+            assert e.litellm_response_headers["retry-after"] == cooldown_time

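Aside from the diff itself: the new test simulates the 429 without any network call by patching `create` on the raw-response target with a `side_effect` that raises. A stripped-down sketch of that mechanism, using stand-in names rather than litellm's or OpenAI's objects:

```python
from unittest.mock import patch


class FakeRawResponses:
    """Stand-in for a client's `.with_raw_response` namespace."""

    def create(self, **kwargs):
        return "would normally hit the API"


def _raise_429(*args, **kwargs):
    # Stand-in for the HTTPException raised in the test above.
    raise RuntimeError("429: Rate Limited!")


target = FakeRawResponses()

with patch.object(target, "create", side_effect=_raise_429):
    try:
        target.create(input="Hello world!")
    except RuntimeError as err:
        print(err)  # -> 429: Rate Limited!
```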

@@ -2189,7 +2189,7 @@ def test_router_correctly_reraise_error():
         pass


-def test_router_dynamic_cooldown_correct_retry_after_time():
+def test_router_dynamic_cooldown_correct_retry_after_time(sync_mode):
     """
     User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
     but Azure says to retry in at most 9s
@@ -2219,7 +2219,7 @@ def test_router_dynamic_cooldown_correct_retry_after_time():
         raise HTTPException(
             status_code=429,
             detail="Rate Limited!",
-            headers={"retry-after": cooldown_time},
+            headers={"retry-after": cooldown_time},  # type: ignore
         )

     with patch.object(