forked from phoenix/litellm-mirror
fix(openai.py): coverage for correctly re-raising exception headers on openai chat completion + embedding endpoints
parent 068aafdff9
commit de2373d52b
3 changed files with 153 additions and 33 deletions
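
For orientation before the hunks: every error path touched in openai.py now distinguishes errors that already carry HTTP metadata from those that do not, wrapping the latter so the status code and response headers survive the re-raise. A self-contained sketch of that pattern, using a simplified stand-in for litellm's OpenAIError and a hypothetical helper name (in the diff itself the logic is inlined in each except block):

```python
from typing import Optional


class OpenAIError(Exception):
    # simplified stand-in for litellm's OpenAIError, for illustration only
    def __init__(self, status_code: int, message: str, headers: Optional[dict] = None):
        super().__init__(message)
        self.status_code = status_code
        self.message = message
        self.headers = headers


def reraise_preserving_headers(e: Exception) -> None:
    """Re-raise `e` without losing whatever HTTP metadata the client attached."""
    if isinstance(e, OpenAIError):
        raise e  # already carries status_code / headers
    status_code = getattr(e, "status_code", 500)
    error_headers = getattr(e, "headers", None)
    raise OpenAIError(status_code=status_code, message=str(e), headers=error_headers)
```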
@@ -786,8 +786,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def make_sync_openai_chat_completion_request(
         self,
@@ -801,7 +807,6 @@ class OpenAIChatCompletion(BaseLLM):
         - call chat.completions.create by default
         """
         try:
-            if litellm.return_response_headers is True:
             raw_response = openai_client.chat.completions.with_raw_response.create(
                 **data, timeout=timeout
             )
@@ -809,13 +814,14 @@ class OpenAIChatCompletion(BaseLLM):
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-            else:
-                response = openai_client.chat.completions.create(
-                    **data, timeout=timeout
-                )
-                return None, response
-        except Exception as e:
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def completion(
         self,
@@ -1290,16 +1296,12 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
             raw_response = await openai_aclient.embeddings.with_raw_response.create(
                 **data, timeout=timeout
             )  # type: ignore
             headers = dict(raw_response.headers)
             response = raw_response.parse()
             return headers, response
-            else:
-                response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
         except Exception as e:
             raise e
 
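
For context, `with_raw_response` is the openai SDK's wrapper that exposes the HTTP layer alongside the parsed result, which is what lets these request helpers return headers at all. A minimal standalone sketch of that usage (placeholder API key, illustrative model name):

```python
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # placeholder key, for illustration only

# .with_raw_response returns an object carrying the raw HTTP response
raw_response = client.embeddings.with_raw_response.create(
    model="text-embedding-ada-002", input="Hello world!"
)
headers = dict(raw_response.headers)  # e.g. rate-limit / retry-after headers
response = raw_response.parse()       # the usual typed embedding response
```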
@@ -1365,14 +1367,14 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
                 _response_headers=headers,
             )  # type: ignore
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
+        except OpenAIError as e:
             raise e
+        except Exception as e:
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     def embedding(
         self,
@@ -839,3 +839,121 @@ def test_anthropic_tool_calling_exception():
         )
     except litellm.BadRequestError:
         pass
+
+
+from typing import Optional, Union
+
+from openai import AsyncOpenAI, OpenAI
+
+
+def _pre_call_utils(
+    call_type: str,
+    data: dict,
+    client: Union[OpenAI, AsyncOpenAI],
+    sync_mode: bool,
+    streaming: Optional[bool],
+):
+    if call_type == "embedding":
+        data["input"] = "Hello world!"
+        mapped_target = client.embeddings.with_raw_response
+        if sync_mode:
+            original_function = litellm.embedding
+        else:
+            original_function = litellm.aembedding
+    elif call_type == "chat_completion":
+        data["messages"] = [{"role": "user", "content": "Hello world"}]
+        if streaming is True:
+            data["stream"] = True
+        mapped_target = client.chat.completions.with_raw_response
+        if sync_mode:
+            original_function = litellm.completion
+        else:
+            original_function = litellm.acompletion
+
+    return data, original_function, mapped_target
+
+
+@pytest.mark.parametrize(
+    "sync_mode",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "model, call_type, streaming",
+    [
+        ("text-embedding-ada-002", "embedding", None),
+        ("gpt-3.5-turbo", "chat_completion", False),
+        ("gpt-3.5-turbo", "chat_completion", True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_exception_with_headers(sync_mode, model, call_type, streaming):
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    import openai
+
+    if sync_mode:
+        openai_client = openai.OpenAI(api_key="")
+    else:
+        openai_client = openai.AsyncOpenAI(api_key="")
+
+    data = {"model": model}
+    data, original_function, mapped_target = _pre_call_utils(
+        call_type=call_type,
+        data=data,
+        client=openai_client,
+        sync_mode=sync_mode,
+        streaming=streaming,
+    )
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},  # type: ignore
+        )
+
+    with patch.object(
+        mapped_target,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            if sync_mode:
+                resp = original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+                if streaming:
+                    for chunk in resp:
+                        continue
+            else:
+                resp = await original_function(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+
+                if streaming:
+                    async for chunk in resp:
+                        continue
+
+        except litellm.RateLimitError as e:
+            assert e.litellm_response_headers is not None
+            assert e.litellm_response_headers["retry-after"] == cooldown_time
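
The new test asserts that a provider's `retry-after` header survives onto the mapped exception as `litellm_response_headers`, which is what the user feedback quoted in the docstring asks for (honor Azure's suggested 9s instead of a fixed 60s cooldown). A hedged sketch of how a caller might use that attribute (the attribute and exception names are the ones the test asserts on; the retry helper itself is illustrative, not part of this commit):

```python
import time

import litellm


def embed_with_provider_backoff(text: str):
    try:
        return litellm.embedding(model="text-embedding-ada-002", input=text)
    except litellm.RateLimitError as e:
        # headers re-raised by this commit; fall back to 60s if absent
        headers = e.litellm_response_headers or {}
        wait = float(headers.get("retry-after", 60))
        time.sleep(wait)
        return litellm.embedding(model="text-embedding-ada-002", input=text)
```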
@@ -2189,7 +2189,7 @@ def test_router_correctly_reraise_error():
         pass
 
 
-def test_router_dynamic_cooldown_correct_retry_after_time():
+def test_router_dynamic_cooldown_correct_retry_after_time(sync_mode):
     """
     User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
     but Azure says to retry in at most 9s
@@ -2219,7 +2219,7 @@ def test_router_dynamic_cooldown_correct_retry_after_time():
         raise HTTPException(
             status_code=429,
             detail="Rate Limited!",
-            headers={"retry-after": cooldown_time},
+            headers={"retry-after": cooldown_time},  # type: ignore
         )
 
     with patch.object(
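
On the `# type: ignore` added above: FastAPI's `HTTPException` annotates its `headers` parameter as `Dict[str, str]`, so passing the float `cooldown_time` directly needs the suppression. A minimal sketch of the stricter variant, if string header values are preferred (any assertion that later compares the header back against `cooldown_time` would then need a matching `float()` conversion):

```python
from fastapi import HTTPException

cooldown_time = 30.0


def _return_exception(*args, **kwargs):
    # header values are strings on the wire; a reader can float() them back
    raise HTTPException(
        status_code=429,
        detail="Rate Limited!",
        headers={"retry-after": str(cooldown_time)},
    )
```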