fix(utils.py): correctly re-raise the headers from an exception, if present
Fixes an issue where the router's retry-after logic was not using the Azure / OpenAI retry values.
parent 5a2c9d5121
commit 068aafdff9
6 changed files with 228 additions and 33 deletions
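The behaviour being fixed, in one sentence: when a provider answers a 429 with a retry-after header, the router should wait the provider's number rather than a fixed default. A minimal, self-contained sketch of that idea follows; the function and variable names are illustrative, not litellm's actual helpers.

# Illustrative sketch only -- names are hypothetical, not litellm's API.
import email.utils
import time
from typing import Mapping, Optional


def retry_after_seconds(headers: Optional[Mapping[str, str]], default: float) -> float:
    """Prefer the provider-advertised retry-after value over a fixed default."""
    if not headers or "retry-after" not in headers:
        return default
    raw = headers["retry-after"]
    try:
        return max(0.0, float(raw))  # "retry-after: 9" (seconds form)
    except ValueError:
        pass
    try:
        retry_date = email.utils.parsedate_to_datetime(raw)  # HTTP-date form
        return max(0.0, retry_date.timestamp() - time.time())
    except (TypeError, ValueError):
        return default


print(retry_after_seconds({"retry-after": "9"}, default=60.0))  # 9.0, not 60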
@@ -1,12 +1,12 @@
repos:
- repo: local
  hooks:
    - id: mypy
      name: mypy
      entry: python3 -m mypy --ignore-missing-imports
      language: system
      types: [python]
      files: ^litellm/
    # - id: mypy
    #   name: mypy
    #   entry: python3 -m mypy --ignore-missing-imports
    #   language: system
    #   types: [python]
    #   files: ^litellm/
    - id: isort
      name: isort
      entry: isort
@@ -50,9 +50,11 @@ class OpenAIError(Exception):
        message,
        request: Optional[httpx.Request] = None,
        response: Optional[httpx.Response] = None,
        headers: Optional[httpx.Headers] = None,
    ):
        self.status_code = status_code
        self.message = message
        self.headers = headers
        if request:
            self.request = request
        else:
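The hunk above (in the OpenAI provider module) threads an optional headers argument through the provider error so the value survives re-raising. The same pattern in isolation, with a hypothetical error class:

# Hypothetical stand-in for an error type that carries provider response headers.
from typing import Optional

import httpx


class ProviderError(Exception):
    def __init__(self, status_code: int, message: str,
                 headers: Optional[httpx.Headers] = None):
        self.status_code = status_code
        self.message = message
        self.headers = headers  # kept so retry logic can read retry-after later
        super().__init__(message)


err = ProviderError(429, "rate limited", headers=httpx.Headers({"Retry-After": "9"}))
print(err.headers.get("retry-after"))  # "9"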
@@ -113,7 +115,7 @@ class MistralConfig:
        random_seed: Optional[int] = None,
        safe_prompt: Optional[bool] = None,
        response_format: Optional[dict] = None,
        stop: Optional[Union[str, list]] = None
        stop: Optional[Union[str, list]] = None,
    ) -> None:
        locals_ = locals().copy()
        for key, value in locals_.items():
@@ -172,7 +174,7 @@ class MistralConfig:
            if param == "top_p":
                optional_params["top_p"] = value
            if param == "stop":
                optional_params["stop"] = value
                optional_params["stop"] = value
            if param == "tool_choice" and isinstance(value, str):
                optional_params["tool_choice"] = self._map_tool_choice(
                    tool_choice=value
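For context, the surrounding method translates OpenAI-style parameters into provider-specific ones; the change keeps stop (string or list) passing through. A tiny illustrative sketch of that mapping shape, not MistralConfig's actual code:

# Illustrative parameter mapping only.
def map_params(non_default_params: dict) -> dict:
    optional_params: dict = {}
    for param, value in non_default_params.items():
        if param == "top_p":
            optional_params["top_p"] = value
        if param == "stop":  # a string or a list of strings is passed through as-is
            optional_params["stop"] = value
    return optional_params


print(map_params({"top_p": 0.9, "stop": ["\n\n"]}))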
@@ -1313,17 +1315,13 @@ class OpenAIChatCompletion(BaseLLM):
        - call embeddings.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = openai_client.embeddings.with_raw_response.create(
                    **data, timeout=timeout
                )  # type: ignore
            raw_response = openai_client.embeddings.with_raw_response.create(
                **data, timeout=timeout
            )  # type: ignore

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
                return None, response
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        except Exception as e:
            raise e
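The hunk above switches the embedding call to the OpenAI SDK's raw-response interface so rate-limit headers are always captured. A sketch of that pattern against openai>=1.x; it needs a real API key to execute, and the header key printed is only an example of what a deployment may return:

# Sketch of the raw-response pattern; model name and header key are examples.
import openai

client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment

raw = client.embeddings.with_raw_response.create(
    model="text-embedding-ada-002",
    input="Hello world!",
)
headers = dict(raw.headers)   # rate-limit / retry-after headers live here, when sent
response = raw.parse()        # the usual embedding response object
print(headers.get("x-ratelimit-remaining-requests"), len(response.data))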
@@ -1448,13 +1446,13 @@ class OpenAIChatCompletion(BaseLLM):
                response_type="embedding",
            )  # type: ignore
        except OpenAIError as e:
            exception_mapping_worked = True
            raise e
        except Exception as e:
            if hasattr(e, "status_code"):
                raise OpenAIError(status_code=e.status_code, message=str(e))
            else:
                raise OpenAIError(status_code=500, message=str(e))
            status_code = getattr(e, "status_code", 500)
            error_headers = getattr(e, "headers", None)
            raise OpenAIError(
                status_code=status_code, message=str(e), headers=error_headers
            )

    async def aimage_generation(
        self,
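The second hunk replaces hasattr branching with getattr defaults, so any exception can be wrapped without losing its status code or headers. The same idea in a standalone form; WrappedError is a hypothetical stand-in:

# Hypothetical wrapper showing the getattr-with-default re-raise pattern.
from typing import Optional

import httpx


class WrappedError(Exception):
    def __init__(self, status_code: int, message: str,
                 headers: Optional[httpx.Headers] = None):
        super().__init__(message)
        self.status_code, self.headers = status_code, headers


def reraise(exc: Exception) -> None:
    status_code = getattr(exc, "status_code", 500)  # default when the attr is absent
    error_headers = getattr(exc, "headers", None)   # default when the attr is absent
    raise WrappedError(status_code, str(exc), headers=error_headers) from exc


try:
    reraise(RuntimeError("boom"))  # carries neither status_code nor headers
except WrappedError as e:
    print(e.status_code, e.headers)  # 500 None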
@@ -90,6 +90,7 @@ from litellm.types.router import (
    RetryPolicy,
    RouterErrors,
    RouterGeneralSettings,
    RouterRateLimitError,
    updateDeployment,
    updateLiteLLMParams,
)
@@ -1939,6 +1940,7 @@ class Router:
            raise e

    def _embedding(self, input: Union[str, List], model: str, **kwargs):
        model_name = None
        try:
            verbose_router_logger.debug(
                f"Inside embedding()- model: {model}; kwargs: {kwargs}"
@@ -2813,19 +2815,27 @@ class Router:
        ):
            return 0

        response_headers: Optional[httpx.Headers] = None
        if hasattr(e, "response") and hasattr(e.response, "headers"):
            response_headers = e.response.headers
        elif hasattr(e, "litellm_response_headers"):
            response_headers = e.litellm_response_headers

        if response_headers is not None:
            timeout = litellm._calculate_retry_after(
                remaining_retries=remaining_retries,
                max_retries=num_retries,
                response_headers=e.response.headers,
                response_headers=response_headers,
                min_timeout=self.retry_after,
            )

        else:
            timeout = litellm._calculate_retry_after(
                remaining_retries=remaining_retries,
                max_retries=num_retries,
                min_timeout=self.retry_after,
            )

        return timeout

    def function_with_retries(self, *args, **kwargs):
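The retry hunk now looks for headers in two places: the raw exception's response object, or the litellm_response_headers attribute attached during exception mapping. A self-contained sketch of that selection plus a simple backoff fallback; _calculate_retry_after in the diff is litellm's own helper, while the fallback below is only illustrative:

# Illustrative: pick a retry timeout from whichever header source the exception carries.
import random
from typing import Optional

import httpx


def pick_response_headers(e: Exception) -> Optional[httpx.Headers]:
    if hasattr(e, "response") and hasattr(e.response, "headers"):
        return e.response.headers
    return getattr(e, "litellm_response_headers", None)


def retry_timeout(e: Exception, remaining_retries: int, max_retries: int,
                  min_timeout: float) -> float:
    headers = pick_response_headers(e)
    if headers is not None and "retry-after" in headers:
        return max(min_timeout, float(headers["retry-after"]))
    # fallback: exponential backoff with jitter, bounded below by min_timeout
    nb_retries = max_retries - remaining_retries
    return max(min_timeout, min(2 ** nb_retries, 60) * (0.75 + random.random() / 2))


class _FakeRateLimit(Exception):
    litellm_response_headers = httpx.Headers({"retry-after": "9"})


print(retry_timeout(_FakeRateLimit(), remaining_retries=2, max_retries=3, min_timeout=0))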
@@ -2997,8 +3007,9 @@ class Router:
            metadata = kwargs.get("litellm_params", {}).get("metadata", None)
            _model_info = kwargs.get("litellm_params", {}).get("model_info", {})

            exception_response = getattr(exception, "response", {})
            exception_headers = getattr(exception_response, "headers", None)
            exception_headers = litellm.utils._get_litellm_response_headers(
                original_exception=exception
            )
            _time_to_cooldown = kwargs.get("litellm_params", {}).get(
                "cooldown_time", self.cooldown_time
            )
@@ -4744,8 +4755,13 @@ class Router:
        )

        if len(healthy_deployments) == 0:
            raise ValueError(
                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, cooldown_list={self._get_cooldown_deployments()}"
            _cooldown_time = self.cooldown_time  # [TODO] Make dynamic
            _cooldown_list = self._get_cooldown_deployments()
            raise RouterRateLimitError(
                model=model,
                cooldown_time=_cooldown_time,
                enable_pre_call_checks=self.enable_pre_call_checks,
                cooldown_list=_cooldown_list,
            )

        if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
@@ -10,6 +10,9 @@ import traceback
import openai
import pytest

import litellm.types
import litellm.types.router

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
@@ -2184,3 +2187,126 @@ def test_router_correctly_reraise_error():
        )
    except litellm.RateLimitError:
        pass


def test_router_dynamic_cooldown_correct_retry_after_time():
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    router = Router(
        model_list=[
            {
                "model_name": "text-embedding-ada-002",
                "litellm_params": {
                    "model": "openai/text-embedding-ada-002",
                },
            }
        ]
    )

    openai_client = openai.OpenAI(api_key="")

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},
        )

    with patch.object(
        openai_client.embeddings.with_raw_response,
        "create",
        side_effect=_return_exception,
    ):
        new_retry_after_mock_client = MagicMock(return_value=-1)

        litellm.utils._get_retry_after_from_exception_header = (
            new_retry_after_mock_client
        )

        try:
            router.embedding(
                model="text-embedding-ada-002",
                input="Hello world!",
                client=openai_client,
            )
        except litellm.RateLimitError:
            pass

        new_retry_after_mock_client.assert_called()
        print(
            f"new_retry_after_mock_client.call_args.kwargs: {new_retry_after_mock_client.call_args.kwargs}"
        )

        response_headers: httpx.Headers = new_retry_after_mock_client.call_args.kwargs[
            "response_headers"
        ]
        assert "retry-after" in response_headers
        assert response_headers["retry-after"] == cooldown_time


def test_router_dynamic_cooldown_message_retry_time():
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    router = Router(
        model_list=[
            {
                "model_name": "text-embedding-ada-002",
                "litellm_params": {
                    "model": "openai/text-embedding-ada-002",
                },
            }
        ]
    )

    openai_client = openai.OpenAI(api_key="")

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        from fastapi import HTTPException

        raise HTTPException(
            status_code=429,
            detail="Rate Limited!",
            headers={"retry-after": cooldown_time},
        )

    with patch.object(
        openai_client.embeddings.with_raw_response,
        "create",
        side_effect=_return_exception,
    ):
        for _ in range(2):
            try:
                router.embedding(
                    model="text-embedding-ada-002",
                    input="Hello world!",
                    client=openai_client,
                )
            except litellm.RateLimitError:
                pass

        try:
            router.embedding(
                model="text-embedding-ada-002",
                input="Hello world!",
                client=openai_client,
            )
        except litellm.types.router.RouterRateLimitError as e:
            assert e.cooldown_time == cooldown_time
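The tests above patch the SDK client's raw-response method and assert on a mock's call kwargs. A reduced, self-contained version of that patch.object / MagicMock pattern with stand-in objects:

# Self-contained illustration of the mocking pattern used in the tests above.
from unittest.mock import MagicMock, patch


class FakeEmbeddings:
    def create(self, **kwargs):
        return {"data": []}


client = FakeEmbeddings()

with patch.object(client, "create", side_effect=RuntimeError("Rate Limited!")):
    try:
        client.create(input="Hello world!")
    except RuntimeError:
        pass

calc_mock = MagicMock(return_value=-1)          # stands in for the patched helper
calc_mock(response_headers={"retry-after": 30.0})
calc_mock.assert_called()
print(calc_mock.call_args.kwargs["response_headers"])  # {'retry-after': 30.0}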
@@ -549,3 +549,19 @@ class RouterGeneralSettings(BaseModel):
    pass_through_all_models: bool = Field(
        default=False
    )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding


class RouterRateLimitError(ValueError):
    def __init__(
        self,
        model: str,
        cooldown_time: float,
        enable_pre_call_checks: bool,
        cooldown_list: List,
    ):
        self.model = model
        self.cooldown_time = cooldown_time
        self.enable_pre_call_checks = enable_pre_call_checks
        self.cooldown_list = cooldown_list
        _message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
        super().__init__(_message)
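Raising a typed error instead of a formatted ValueError lets callers read the cooldown programmatically. A sketch of how a caller might use such an error; the class is redefined locally (with an abbreviated message) so the snippet stands alone:

# Local, abbreviated copy for illustration; mirrors the class in the hunk above.
from typing import List


class RouterRateLimitError(ValueError):
    def __init__(self, model: str, cooldown_time: float,
                 enable_pre_call_checks: bool, cooldown_list: List):
        self.model = model
        self.cooldown_time = cooldown_time
        self.enable_pre_call_checks = enable_pre_call_checks
        self.cooldown_list = cooldown_list
        super().__init__(
            f"No deployments available for selected model, Try again in "
            f"{cooldown_time} seconds. Passed model={model}."
        )


try:
    raise RouterRateLimitError("text-embedding-ada-002", 9.0, False, [])
except RouterRateLimitError as e:
    # callers can schedule retries off the number instead of parsing the message
    print(e.cooldown_time)  # 9.0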
@@ -6339,6 +6339,7 @@ def _get_retry_after_from_exception_header(
            retry_after = int(retry_date - time.time())
        else:
            retry_after = -1

        return retry_after

    except Exception as e:
@@ -6520,6 +6521,40 @@ def get_model_list():


####### EXCEPTION MAPPING ################
def _get_litellm_response_headers(
    original_exception: Exception,
) -> Optional[httpx.Headers]:
    """
    Extract and return the response headers from a mapped exception, if present.

    Used for accurate retry logic.
    """
    _response_headers: Optional[httpx.Headers] = None
    try:
        _response_headers = getattr(
            original_exception, "litellm_response_headers", None
        )
    except Exception:
        return None

    return _response_headers


def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]:
    """
    Extract and return the response headers from an exception, if present.

    Used for accurate retry logic.
    """
    _response_headers: Optional[httpx.Headers] = None
    try:
        _response_headers = getattr(original_exception, "headers", None)
    except Exception:
        return None

    return _response_headers


def exception_type(
    model,
    original_exception,
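Both helpers above hand back an httpx.Headers object (or None). Worth noting for downstream consumers: httpx header lookups are case-insensitive and values are strings, so retry-after matches regardless of provider casing but still needs numeric parsing. A quick check:

# httpx.Headers lookups are case-insensitive; values come back as strings.
import httpx

headers = httpx.Headers({"Retry-After": "9", "x-ratelimit-remaining-requests": "0"})
print("retry-after" in headers)       # True
print(headers.get("RETRY-AFTER"))     # "9"
print(float(headers["retry-after"]))  # 9.0 -- callers still parse the number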
@@ -6544,6 +6579,10 @@ def exception_type(
            "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
        )  # noqa
        print()  # noqa

    litellm_response_headers = _get_response_headers(
        original_exception=original_exception
    )
    try:
        if model:
            if hasattr(original_exception, "message"):
@@ -8422,20 +8461,20 @@ def exception_type(
            threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
        # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
        if exception_mapping_worked:
            setattr(e, "litellm_response_headers", litellm_response_headers)
            raise e
        else:
            for error_type in litellm.LITELLM_EXCEPTION_TYPES:
                if isinstance(e, error_type):
                    setattr(e, "litellm_response_headers", litellm_response_headers)
                    raise e  # it's already mapped
            raise APIConnectionError(
            raised_exc = APIConnectionError(
                message="{}\n{}".format(original_exception, traceback.format_exc()),
                llm_provider="",
                model="",
                request=httpx.Request(
                    method="POST",
                    url="https://www.litellm.ai/",
                ),
            )
            setattr(raised_exc, "litellm_response_headers", _response_headers)
            raise raised_exc


######### Secret Manager ############################
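The tail of exception_type attaches the captured headers onto whichever exception is re-raised, so retry logic can read them without knowing the concrete exception class. A minimal sketch of that attach/read round trip (names illustrative):

# Illustrative: attach response headers to an exception before re-raising, then
# read them back downstream with getattr, as the router-side code does.
from typing import Optional

import httpx


def mapped_raise(exc: Exception, response_headers: Optional[httpx.Headers]) -> None:
    setattr(exc, "litellm_response_headers", response_headers)
    raise exc


try:
    mapped_raise(ValueError("rate limited"), httpx.Headers({"retry-after": "9"}))
except ValueError as e:
    headers = getattr(e, "litellm_response_headers", None)
    print(None if headers is None else headers.get("retry-after"))  # "9"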