forked from phoenix/litellm-mirror
fix(utils.py): correctly re-raise the headers from an exception, if present
Fixes an issue where the router's retry-after was not using the Azure / OpenAI retry values
parent 5a2c9d5121
commit 068aafdff9
6 changed files with 228 additions and 33 deletions
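In short, the intended flow looks like this (a minimal illustrative sketch, not code from this commit; `FakeMappedRateLimitError` is a hypothetical stand-in for a mapped litellm exception):

```python
# Illustrative sketch of the fix's intent. An Azure/OpenAI 429 response carries a
# retry-after header; the mapped exception keeps those headers, and the router can
# read them instead of falling back to its fixed retry_after default.
import httpx


class FakeMappedRateLimitError(Exception):  # hypothetical stand-in, not a litellm class
    def __init__(self, headers: httpx.Headers):
        super().__init__("rate limited")
        self.headers = headers  # OpenAIError now also stores the response headers (see diff)


exc = FakeMappedRateLimitError(headers=httpx.Headers({"retry-after": "9"}))

# utils.exception_type() now attaches the headers to whatever exception it re-raises:
setattr(exc, "litellm_response_headers", getattr(exc, "headers", None))

# The router's retry logic can then prefer the provider's number (9s here)
# over the generic "Try again in 60 seconds" message.
print(exc.litellm_response_headers["retry-after"])  # -> "9"
```

The diffs below wire this up: `OpenAIError` stores the headers, `exception_type()` re-attaches them as `litellm_response_headers`, and the router prefers them when computing its retry timeout.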
.pre-commit-config.yaml

@@ -1,12 +1,12 @@
 repos:
 - repo: local
   hooks:
-    - id: mypy
-      name: mypy
-      entry: python3 -m mypy --ignore-missing-imports
-      language: system
-      types: [python]
-      files: ^litellm/
+    # - id: mypy
+    #   name: mypy
+    #   entry: python3 -m mypy --ignore-missing-imports
+    #   language: system
+    #   types: [python]
+    #   files: ^litellm/
     - id: isort
       name: isort
       entry: isort
litellm/llms/openai.py

@@ -50,9 +50,11 @@ class OpenAIError(Exception):
         message,
         request: Optional[httpx.Request] = None,
         response: Optional[httpx.Response] = None,
+        headers: Optional[httpx.Headers] = None,
     ):
         self.status_code = status_code
         self.message = message
+        self.headers = headers
         if request:
             self.request = request
         else:
@@ -113,7 +115,7 @@ class MistralConfig:
         random_seed: Optional[int] = None,
         safe_prompt: Optional[bool] = None,
         response_format: Optional[dict] = None,
-        stop: Optional[Union[str, list]] = None
+        stop: Optional[Union[str, list]] = None,
     ) -> None:
         locals_ = locals().copy()
         for key, value in locals_.items():
@@ -172,7 +174,7 @@ class MistralConfig:
             if param == "top_p":
                 optional_params["top_p"] = value
             if param == "stop":
                 optional_params["stop"] = value
             if param == "tool_choice" and isinstance(value, str):
                 optional_params["tool_choice"] = self._map_tool_choice(
                     tool_choice=value
@@ -1313,17 +1315,13 @@ class OpenAIChatCompletion(BaseLLM):
         - call embeddings.create by default
         """
         try:
-            if litellm.return_response_headers is True:
-                raw_response = openai_client.embeddings.with_raw_response.create(
-                    **data, timeout=timeout
-                )  # type: ignore
+            raw_response = openai_client.embeddings.with_raw_response.create(
+                **data, timeout=timeout
+            )  # type: ignore
 
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
-                return None, response
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+            return headers, response
         except Exception as e:
             raise e
 
@@ -1448,13 +1446,13 @@ class OpenAIChatCompletion(BaseLLM):
                 response_type="embedding",
             )  # type: ignore
         except OpenAIError as e:
-            exception_mapping_worked = True
             raise e
         except Exception as e:
-            if hasattr(e, "status_code"):
-                raise OpenAIError(status_code=e.status_code, message=str(e))
-            else:
-                raise OpenAIError(status_code=500, message=str(e))
+            status_code = getattr(e, "status_code", 500)
+            error_headers = getattr(e, "headers", None)
+            raise OpenAIError(
+                status_code=status_code, message=str(e), headers=error_headers
+            )
 
     async def aimage_generation(
         self,
litellm/router.py

@@ -90,6 +90,7 @@ from litellm.types.router import (
     RetryPolicy,
     RouterErrors,
     RouterGeneralSettings,
+    RouterRateLimitError,
     updateDeployment,
     updateLiteLLMParams,
 )
@@ -1939,6 +1940,7 @@ class Router:
             raise e
 
     def _embedding(self, input: Union[str, List], model: str, **kwargs):
+        model_name = None
         try:
             verbose_router_logger.debug(
                 f"Inside embedding()- model: {model}; kwargs: {kwargs}"
@@ -2813,19 +2815,27 @@ class Router:
         ):
             return 0
 
+        response_headers: Optional[httpx.Headers] = None
         if hasattr(e, "response") and hasattr(e.response, "headers"):
+            response_headers = e.response.headers
+        elif hasattr(e, "litellm_response_headers"):
+            response_headers = e.litellm_response_headers
+
+        if response_headers is not None:
             timeout = litellm._calculate_retry_after(
                 remaining_retries=remaining_retries,
                 max_retries=num_retries,
-                response_headers=e.response.headers,
+                response_headers=response_headers,
                 min_timeout=self.retry_after,
             )
         else:
             timeout = litellm._calculate_retry_after(
                 remaining_retries=remaining_retries,
                 max_retries=num_retries,
                 min_timeout=self.retry_after,
             )
 
         return timeout
 
     def function_with_retries(self, *args, **kwargs):
@@ -2997,8 +3007,9 @@ class Router:
         metadata = kwargs.get("litellm_params", {}).get("metadata", None)
         _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
 
-        exception_response = getattr(exception, "response", {})
-        exception_headers = getattr(exception_response, "headers", None)
+        exception_headers = litellm.utils._get_litellm_response_headers(
+            original_exception=exception
+        )
         _time_to_cooldown = kwargs.get("litellm_params", {}).get(
             "cooldown_time", self.cooldown_time
         )
@@ -4744,8 +4755,13 @@ class Router:
         )
 
         if len(healthy_deployments) == 0:
-            raise ValueError(
-                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, cooldown_list={self._get_cooldown_deployments()}"
+            _cooldown_time = self.cooldown_time  # [TODO] Make dynamic
+            _cooldown_list = self._get_cooldown_deployments()
+            raise RouterRateLimitError(
+                model=model,
+                cooldown_time=_cooldown_time,
+                enable_pre_call_checks=self.enable_pre_call_checks,
+                cooldown_list=_cooldown_list,
             )
 
         if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
litellm/tests/test_router.py

@@ -10,6 +10,9 @@ import traceback
 import openai
 import pytest
 
+import litellm.types
+import litellm.types.router
+
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
@@ -2184,3 +2187,126 @@ def test_router_correctly_reraise_error():
         )
     except litellm.RateLimitError:
         pass
+
+
+def test_router_dynamic_cooldown_correct_retry_after_time():
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "text-embedding-ada-002",
+                "litellm_params": {
+                    "model": "openai/text-embedding-ada-002",
+                },
+            }
+        ]
+    )
+
+    openai_client = openai.OpenAI(api_key="")
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},
+        )
+
+    with patch.object(
+        openai_client.embeddings.with_raw_response,
+        "create",
+        side_effect=_return_exception,
+    ):
+        new_retry_after_mock_client = MagicMock(return_value=-1)
+
+        litellm.utils._get_retry_after_from_exception_header = (
+            new_retry_after_mock_client
+        )
+
+        try:
+            router.embedding(
+                model="text-embedding-ada-002",
+                input="Hello world!",
+                client=openai_client,
+            )
+        except litellm.RateLimitError:
+            pass
+
+        new_retry_after_mock_client.assert_called()
+        print(
+            f"new_retry_after_mock_client.call_args.kwargs: {new_retry_after_mock_client.call_args.kwargs}"
+        )
+
+        response_headers: httpx.Headers = new_retry_after_mock_client.call_args.kwargs[
+            "response_headers"
+        ]
+        assert "retry-after" in response_headers
+        assert response_headers["retry-after"] == cooldown_time
+
+
+def test_router_dynamic_cooldown_message_retry_time():
+    """
+    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
+    but Azure says to retry in at most 9s
+
+    ```
+    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
+    ```
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "text-embedding-ada-002",
+                "litellm_params": {
+                    "model": "openai/text-embedding-ada-002",
+                },
+            }
+        ]
+    )
+
+    openai_client = openai.OpenAI(api_key="")
+
+    cooldown_time = 30.0
+
+    def _return_exception(*args, **kwargs):
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=429,
+            detail="Rate Limited!",
+            headers={"retry-after": cooldown_time},
+        )
+
+    with patch.object(
+        openai_client.embeddings.with_raw_response,
+        "create",
+        side_effect=_return_exception,
+    ):
+        for _ in range(2):
+            try:
+                router.embedding(
+                    model="text-embedding-ada-002",
+                    input="Hello world!",
+                    client=openai_client,
+                )
+            except litellm.RateLimitError:
+                pass
+
+        try:
+            router.embedding(
+                model="text-embedding-ada-002",
+                input="Hello world!",
+                client=openai_client,
+            )
+        except litellm.types.router.RouterRateLimitError as e:
+            assert e.cooldown_time == cooldown_time
litellm/types/router.py

@@ -549,3 +549,19 @@ class RouterGeneralSettings(BaseModel):
     pass_through_all_models: bool = Field(
         default=False
     )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
+
+
+class RouterRateLimitError(ValueError):
+    def __init__(
+        self,
+        model: str,
+        cooldown_time: float,
+        enable_pre_call_checks: bool,
+        cooldown_list: List,
+    ):
+        self.model = model
+        self.cooldown_time = cooldown_time
+        self.enable_pre_call_checks = enable_pre_call_checks
+        self.cooldown_list = cooldown_list
+        _message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
+        super().__init__(_message)
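For callers, the new error type exposes the actual cooldown as attributes instead of a hard-coded message. A minimal usage sketch (assumes a router configured like the one in the tests above; not part of the diff):

```python
from litellm import Router
from litellm.types.router import RouterRateLimitError

router = Router(
    model_list=[
        {
            "model_name": "text-embedding-ada-002",
            "litellm_params": {"model": "openai/text-embedding-ada-002"},
        }
    ]
)

try:
    router.embedding(model="text-embedding-ada-002", input="Hello world!")
except RouterRateLimitError as e:
    # cooldown_time / cooldown_list are now attributes, not just text buried in a ValueError
    print(f"all deployments cooling down; retry {e.model} in {e.cooldown_time}s")
```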
litellm/utils.py

@@ -6339,6 +6339,7 @@ def _get_retry_after_from_exception_header(
                 retry_after = int(retry_date - time.time())
             else:
                 retry_after = -1
+
         return retry_after
 
     except Exception as e:
@@ -6520,6 +6521,40 @@ def get_model_list():
 
 
 ####### EXCEPTION MAPPING ################
+def _get_litellm_response_headers(
+    original_exception: Exception,
+) -> Optional[httpx.Headers]:
+    """
+    Extract and return the response headers from a mapped exception, if present.
+
+    Used for accurate retry logic.
+    """
+    _response_headers: Optional[httpx.Headers] = None
+    try:
+        _response_headers = getattr(
+            original_exception, "litellm_response_headers", None
+        )
+    except Exception:
+        return None
+
+    return _response_headers
+
+
+def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]:
+    """
+    Extract and return the response headers from an exception, if present.
+
+    Used for accurate retry logic.
+    """
+    _response_headers: Optional[httpx.Headers] = None
+    try:
+        _response_headers = getattr(original_exception, "headers", None)
+    except Exception:
+        return None
+
+    return _response_headers
+
+
 def exception_type(
     model,
     original_exception,
@@ -6544,6 +6579,10 @@ def exception_type(
         "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
     )  # noqa
     print()  # noqa
+
+    litellm_response_headers = _get_response_headers(
+        original_exception=original_exception
+    )
     try:
         if model:
             if hasattr(original_exception, "message"):
@@ -8422,20 +8461,20 @@ def exception_type(
             threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
     # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
     if exception_mapping_worked:
+        setattr(e, "litellm_response_headers", litellm_response_headers)
         raise e
     else:
         for error_type in litellm.LITELLM_EXCEPTION_TYPES:
             if isinstance(e, error_type):
+                setattr(e, "litellm_response_headers", litellm_response_headers)
                 raise e  # it's already mapped
-        raise APIConnectionError(
+        raised_exc = APIConnectionError(
             message="{}\n{}".format(original_exception, traceback.format_exc()),
             llm_provider="",
             model="",
-            request=httpx.Request(
-                method="POST",
-                url="https://www.litellm.ai/",
-            ),
         )
+        setattr(raised_exc, "litellm_response_headers", _response_headers)
+        raise raised_exc
 
 
######### Secret Manager ############################