fix(utils.py): correctly re-raise the headers from an exception, if present

Fixes an issue where the router's retry-after logic was not using the retry values returned by Azure / OpenAI
Krrish Dholakia 2024-08-24 12:30:30 -07:00
parent 5a2c9d5121
commit 068aafdff9
6 changed files with 228 additions and 33 deletions
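
As a rough sketch of what the commit enables (the call below is illustrative; only the `litellm_response_headers` attribute and its use in the router's retry-after calculation come from this diff):

```
import litellm

try:
    litellm.embedding(model="text-embedding-ada-002", input="Hello world!")
except litellm.RateLimitError as e:
    # exception_type() in utils.py now attaches the provider's response headers
    # to the mapped exception; the router reads them to size its retry delay.
    headers = getattr(e, "litellm_response_headers", None)
    if headers is not None:
        print(headers.get("retry-after"))
```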


@@ -1,12 +1,12 @@
repos:
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
# - id: mypy
# name: mypy
# entry: python3 -m mypy --ignore-missing-imports
# language: system
# types: [python]
# files: ^litellm/
- id: isort
name: isort
entry: isort


@@ -50,9 +50,11 @@ class OpenAIError(Exception):
message,
request: Optional[httpx.Request] = None,
response: Optional[httpx.Response] = None,
headers: Optional[httpx.Headers] = None,
):
self.status_code = status_code
self.message = message
self.headers = headers
if request:
self.request = request
else:
@@ -113,7 +115,7 @@ class MistralConfig:
random_seed: Optional[int] = None,
safe_prompt: Optional[bool] = None,
response_format: Optional[dict] = None,
stop: Optional[Union[str, list]] = None
stop: Optional[Union[str, list]] = None,
) -> None:
locals_ = locals().copy()
for key, value in locals_.items():
@@ -172,7 +174,7 @@ class MistralConfig:
if param == "top_p":
optional_params["top_p"] = value
if param == "stop":
optional_params["stop"] = value
optional_params["stop"] = value
if param == "tool_choice" and isinstance(value, str):
optional_params["tool_choice"] = self._map_tool_choice(
tool_choice=value
@@ -1313,17 +1315,13 @@ class OpenAIChatCompletion(BaseLLM):
- call embeddings.create by default
"""
try:
if litellm.return_response_headers is True:
raw_response = openai_client.embeddings.with_raw_response.create(
**data, timeout=timeout
) # type: ignore
raw_response = openai_client.embeddings.with_raw_response.create(
**data, timeout=timeout
) # type: ignore
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
else:
response = openai_client.embeddings.create(**data, timeout=timeout) # type: ignore
return None, response
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
except Exception as e:
raise e
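
The hunk above switches the sync embedding path to always use the OpenAI SDK's `with_raw_response` wrapper, so the HTTP headers travel with the parsed body. A minimal standalone sketch of that SDK pattern (the model name and input are placeholders, not taken from the diff):

```
import openai

client = openai.OpenAI()

# .with_raw_response exposes the HTTP response headers (rate-limit /
# retry-after values) alongside the parsed embedding response.
raw_response = client.embeddings.with_raw_response.create(
    model="text-embedding-ada-002", input="Hello world!"
)
headers = dict(raw_response.headers)  # plain dict of response headers
response = raw_response.parse()       # the usual CreateEmbeddingResponse object
```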
@@ -1448,13 +1446,13 @@ class OpenAIChatCompletion(BaseLLM):
response_type="embedding",
) # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
raise e
except Exception as e:
if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e))
else:
raise OpenAIError(status_code=500, message=str(e))
status_code = getattr(e, "status_code", 500)
error_headers = getattr(e, "headers", None)
raise OpenAIError(
status_code=status_code, message=str(e), headers=error_headers
)
async def aimage_generation(
self,


@@ -90,6 +90,7 @@ from litellm.types.router import (
RetryPolicy,
RouterErrors,
RouterGeneralSettings,
RouterRateLimitError,
updateDeployment,
updateLiteLLMParams,
)
@@ -1939,6 +1940,7 @@ class Router:
raise e
def _embedding(self, input: Union[str, List], model: str, **kwargs):
model_name = None
try:
verbose_router_logger.debug(
f"Inside embedding()- model: {model}; kwargs: {kwargs}"
@@ -2813,19 +2815,27 @@
):
return 0
response_headers: Optional[httpx.Headers] = None
if hasattr(e, "response") and hasattr(e.response, "headers"):
response_headers = e.response.headers
elif hasattr(e, "litellm_response_headers"):
response_headers = e.litellm_response_headers
if response_headers is not None:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
response_headers=response_headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
return timeout
def function_with_retries(self, *args, **kwargs):
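
The retry delay now prefers whatever headers the caught exception carries: `e.response.headers` when a raw response is attached, otherwise the `litellm_response_headers` set by the exception mapper, and only then the router's static `retry_after`. A hedged sketch of that precedence in isolation (`pick_response_headers` is a hypothetical name, and the exception below is fabricated for illustration):

```
from typing import Optional

import httpx


def pick_response_headers(e: Exception) -> Optional[httpx.Headers]:
    # Mirrors the fallback order used in the hunk above.
    if hasattr(e, "response") and hasattr(e.response, "headers"):
        return e.response.headers
    if hasattr(e, "litellm_response_headers"):
        return e.litellm_response_headers
    return None


exc = Exception("429 from provider")
exc.litellm_response_headers = httpx.Headers({"retry-after": "9"})
assert pick_response_headers(exc)["retry-after"] == "9"
```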
@@ -2997,8 +3007,9 @@
metadata = kwargs.get("litellm_params", {}).get("metadata", None)
_model_info = kwargs.get("litellm_params", {}).get("model_info", {})
exception_response = getattr(exception, "response", {})
exception_headers = getattr(exception_response, "headers", None)
exception_headers = litellm.utils._get_litellm_response_headers(
original_exception=exception
)
_time_to_cooldown = kwargs.get("litellm_params", {}).get(
"cooldown_time", self.cooldown_time
)
@@ -4744,8 +4755,13 @@
)
if len(healthy_deployments) == 0:
raise ValueError(
f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, cooldown_list={self._get_cooldown_deployments()}"
_cooldown_time = self.cooldown_time # [TODO] Make dynamic
_cooldown_list = self._get_cooldown_deployments()
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
enable_pre_call_checks=self.enable_pre_call_checks,
cooldown_list=_cooldown_list,
)
if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
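
With the structured `RouterRateLimitError` (defined later in this diff, see litellm.types.router), a caller can back off for the provider-reported window instead of a fixed 60 seconds. A possible usage sketch, assuming a router configured like the one in the new tests (the sleep-and-retry handling is illustrative, not part of the commit):

```
import time

from litellm import Router
from litellm.types.router import RouterRateLimitError

router = Router(
    model_list=[
        {
            "model_name": "text-embedding-ada-002",
            "litellm_params": {"model": "openai/text-embedding-ada-002"},
        }
    ]
)

try:
    router.embedding(model="text-embedding-ada-002", input="Hello world!")
except RouterRateLimitError as e:
    # cooldown_time now reflects the provider's retry-after when one was observed
    time.sleep(e.cooldown_time)
```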


@@ -10,6 +10,9 @@ import traceback
import openai
import pytest
import litellm.types
import litellm.types.router
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
@@ -2184,3 +2187,126 @@ def test_router_correctly_reraise_error():
)
except litellm.RateLimitError:
pass
def test_router_dynamic_cooldown_correct_retry_after_time():
"""
User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
but Azure says to retry in at most 9s
```
{"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
```
"""
router = Router(
model_list=[
{
"model_name": "text-embedding-ada-002",
"litellm_params": {
"model": "openai/text-embedding-ada-002",
},
}
]
)
openai_client = openai.OpenAI(api_key="")
cooldown_time = 30.0
def _return_exception(*args, **kwargs):
from fastapi import HTTPException
raise HTTPException(
status_code=429,
detail="Rate Limited!",
headers={"retry-after": cooldown_time},
)
with patch.object(
openai_client.embeddings.with_raw_response,
"create",
side_effect=_return_exception,
):
new_retry_after_mock_client = MagicMock(return_value=-1)
litellm.utils._get_retry_after_from_exception_header = (
new_retry_after_mock_client
)
try:
router.embedding(
model="text-embedding-ada-002",
input="Hello world!",
client=openai_client,
)
except litellm.RateLimitError:
pass
new_retry_after_mock_client.assert_called()
print(
f"new_retry_after_mock_client.call_args.kwargs: {new_retry_after_mock_client.call_args.kwargs}"
)
response_headers: httpx.Headers = new_retry_after_mock_client.call_args.kwargs[
"response_headers"
]
assert "retry-after" in response_headers
assert response_headers["retry-after"] == cooldown_time
def test_router_dynamic_cooldown_message_retry_time():
"""
User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
but Azure says to retry in at most 9s
```
{"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
```
"""
router = Router(
model_list=[
{
"model_name": "text-embedding-ada-002",
"litellm_params": {
"model": "openai/text-embedding-ada-002",
},
}
]
)
openai_client = openai.OpenAI(api_key="")
cooldown_time = 30.0
def _return_exception(*args, **kwargs):
from fastapi import HTTPException
raise HTTPException(
status_code=429,
detail="Rate Limited!",
headers={"retry-after": cooldown_time},
)
with patch.object(
openai_client.embeddings.with_raw_response,
"create",
side_effect=_return_exception,
):
for _ in range(2):
try:
router.embedding(
model="text-embedding-ada-002",
input="Hello world!",
client=openai_client,
)
except litellm.RateLimitError:
pass
try:
router.embedding(
model="text-embedding-ada-002",
input="Hello world!",
client=openai_client,
)
except litellm.types.router.RouterRateLimitError as e:
assert e.cooldown_time == cooldown_time


@@ -549,3 +549,19 @@ class RouterGeneralSettings(BaseModel):
pass_through_all_models: bool = Field(
default=False
) # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
class RouterRateLimitError(ValueError):
def __init__(
self,
model: str,
cooldown_time: float,
enable_pre_call_checks: bool,
cooldown_list: List,
):
self.model = model
self.cooldown_time = cooldown_time
self.enable_pre_call_checks = enable_pre_call_checks
self.cooldown_list = cooldown_list
_message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
super().__init__(_message)


@@ -6339,6 +6339,7 @@ def _get_retry_after_from_exception_header(
retry_after = int(retry_date - time.time())
else:
retry_after = -1
return retry_after
except Exception as e:
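
Per the HTTP spec, a Retry-After header can carry either an integer number of seconds or an HTTP-date; the fragment above handles the date case. A standalone sketch of that kind of parsing (`parse_retry_after` is a hypothetical helper, not litellm's actual `_get_retry_after_from_exception_header`):

```
import email.utils
import time
from typing import Mapping, Optional


def parse_retry_after(headers: Mapping[str, str]) -> int:
    # Return seconds to wait, or -1 if no usable Retry-After value is present.
    value: Optional[str] = headers.get("retry-after")
    if value is None:
        return -1
    try:
        return int(value)  # e.g. "Retry-After: 9"
    except ValueError:
        retry_date_tuple = email.utils.parsedate_tz(value)  # HTTP-date form
        if retry_date_tuple is None:
            return -1
        retry_date = email.utils.mktime_tz(retry_date_tuple)
        return int(retry_date - time.time())


print(parse_retry_after({"retry-after": "9"}))  # -> 9
```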
@@ -6520,6 +6521,40 @@ def get_model_list():
####### EXCEPTION MAPPING ################
def _get_litellm_response_headers(
original_exception: Exception,
) -> Optional[httpx.Headers]:
"""
Extract and return the response headers from a mapped exception, if present.
Used for accurate retry logic.
"""
_response_headers: Optional[httpx.Headers] = None
try:
_response_headers = getattr(
original_exception, "litellm_response_headers", None
)
except Exception:
return None
return _response_headers
def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]:
"""
Extract and return the response headers from an exception, if present.
Used for accurate retry logic.
"""
_response_headers: Optional[httpx.Headers] = None
try:
_response_headers = getattr(original_exception, "headers", None)
except Exception:
return None
return _response_headers
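
The two helpers differ only in which attribute they probe: `_get_litellm_response_headers` reads the `litellm_response_headers` attribute that `exception_type` attaches further down, while `_get_response_headers` reads a raw `headers` attribute set by the underlying SDK exception. A quick illustrative check (the exception below is fabricated):

```
import httpx

import litellm

exc = Exception("rate limited")
exc.litellm_response_headers = httpx.Headers({"retry-after": "9"})

print(litellm.utils._get_litellm_response_headers(original_exception=exc))
# -> Headers({'retry-after': '9'})
print(litellm.utils._get_response_headers(original_exception=exc))
# -> None (this exception has no raw `headers` attribute)
```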
def exception_type(
model,
original_exception,
@@ -6544,6 +6579,10 @@ def exception_type(
"LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." # noqa
) # noqa
print() # noqa
litellm_response_headers = _get_response_headers(
original_exception=original_exception
)
try:
if model:
if hasattr(original_exception, "message"):
@@ -8422,20 +8461,20 @@ def exception_type(
threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
# don't let an error with mapping interrupt the user from receiving an error from the llm api calls
if exception_mapping_worked:
setattr(e, "litellm_response_headers", litellm_response_headers)
raise e
else:
for error_type in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, error_type):
setattr(e, "litellm_response_headers", litellm_response_headers)
raise e # it's already mapped
raise APIConnectionError(
raised_exc = APIConnectionError(
message="{}\n{}".format(original_exception, traceback.format_exc()),
llm_provider="",
model="",
request=httpx.Request(
method="POST",
url="https://www.litellm.ai/",
),
)
setattr(raised_exc, "litellm_response_headers", _response_headers)
raise raised_exc
######### Secret Manager ############################