fix(openai.py): coverage for correctly re-raising exception headers on openai chat completion + embedding endpoints

Krrish Dholakia 2024-08-24 12:55:15 -07:00
parent 068aafdff9
commit de2373d52b
3 changed files with 153 additions and 33 deletions

@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
        )
    except litellm.BadRequestError:
        pass


from typing import Optional, Union
from unittest.mock import MagicMock, patch

import pytest
from openai import AsyncOpenAI, OpenAI

import litellm


def _pre_call_utils(
    call_type: str,
    data: dict,
    client: Union[OpenAI, AsyncOpenAI],
    sync_mode: bool,
    streaming: Optional[bool],
):
    if call_type == "embedding":
        data["input"] = "Hello world!"
        mapped_target = client.embeddings.with_raw_response
        if sync_mode:
            original_function = litellm.embedding
        else:
            original_function = litellm.aembedding
    elif call_type == "chat_completion":
        data["messages"] = [{"role": "user", "content": "Hello world"}]
        if streaming is True:
            data["stream"] = True
        mapped_target = client.chat.completions.with_raw_response
        if sync_mode:
            original_function = litellm.completion
        else:
            original_function = litellm.acompletion

    return data, original_function, mapped_target
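
# Sanity sketch of the helper's contract (hypothetical usage, not executed by
# the suite): `_pre_call_utils` mutates `data` in place and returns the litellm
# entrypoint to call plus the raw-response client target to patch, e.g.
#
#   data, fn, target = _pre_call_utils(
#       call_type="embedding",
#       data={"model": "text-embedding-ada-002"},
#       client=OpenAI(api_key=""),
#       sync_mode=True,
#       streaming=None,
#   )
#   # fn is litellm.embedding; target is client.embeddings.with_raw_response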


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
)
@pytest.mark.parametrize(
    "model, call_type, streaming",
    [
        ("text-embedding-ada-002", "embedding", None),
        ("gpt-3.5-turbo", "chat_completion", False),
        ("gpt-3.5-turbo", "chat_completion", True),
    ],
)
@pytest.mark.asyncio
async def test_exception_with_headers(sync_mode, model, call_type, streaming):
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s
    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    import openai

    if sync_mode:
        openai_client = openai.OpenAI(api_key="")
    else:
        openai_client = openai.AsyncOpenAI(api_key="")

    data = {"model": model}
    data, original_function, mapped_target = _pre_call_utils(
        call_type=call_type,
        data=data,
        client=openai_client,
        sync_mode=sync_mode,
        streaming=streaming,
    )
    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},  # type: ignore
        )
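    # The `# type: ignore` above is needed because HTTPException types `headers`
    # as string-valued; the float is passed deliberately so the assertion at the
    # end can compare against `cooldown_time` without casting.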
    with patch.object(
        mapped_target,
        "create",
        side_effect=_return_exception,
    ):
        new_retry_after_mock_client = MagicMock(return_value=-1)
        litellm.utils._get_retry_after_from_exception_header = (
            new_retry_after_mock_client
        )
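
        # Stubbing _get_retry_after_from_exception_header to return -1
        # presumably takes litellm's own retry-after parsing out of the
        # picture, so the assertions below exercise the raw header
        # propagation on the exception itself.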
        exception_raised = False
        try:
            # Invoke the litellm entrypoint with the payload built by
            # _pre_call_utils, so every parametrized case sends the right args.
            if sync_mode:
                resp = original_function(**data, client=openai_client)
                if streaming:
                    for chunk in resp:
                        continue
            else:
                resp = await original_function(**data, client=openai_client)
                if streaming:
                    async for chunk in resp:
                        continue
        except litellm.RateLimitError as e:
            exception_raised = True
            assert e.litellm_response_headers is not None
            assert e.litellm_response_headers["retry-after"] == cooldown_time

        # Guard against a vacuous pass if the patched client never raised.
        assert exception_raised
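
This is the behavior the quoted proxy log motivates: once the provider's `retry-after` survives on the exception, callers can honor it instead of a fixed 60-second cooldown. A minimal sketch of that consumption pattern, relying only on the `litellm_response_headers` attribute asserted above (`compute_cooldown` and its default are hypothetical, not litellm API):

```python
import litellm


def compute_cooldown(e: litellm.RateLimitError, default: float = 60.0) -> float:
    """Prefer the provider's retry-after header over a fixed default."""
    # `litellm_response_headers` is the attribute the test above asserts on;
    # it may be absent or lack a retry-after entry, so fall back defensively.
    headers = getattr(e, "litellm_response_headers", None) or {}
    try:
        return float(headers.get("retry-after", default))
    except (TypeError, ValueError):
        return default
```

For the mocked 429 above this returns 30.0; for the Azure response in the quoted log it would honor the 9-second hint rather than waiting the full 60 seconds.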