forked from phoenix/litellm-mirror
fix(router.py): fix default retry logic
parent 5ad91e1277
commit 160acc085a
6 changed files with 63 additions and 57 deletions
.gitignore (vendored): 1 change

@@ -51,3 +51,4 @@ loadtest_kub.yaml
 litellm/proxy/_new_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
@@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
                 )
             else:
                 openai_aclient = client

             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
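The code around this hunk follows the common pattern of reusing a caller-supplied async client when one is passed and constructing a fresh one otherwise; a reused client keeps whatever retry setting it was created with. A generic sketch of that pattern, not the file's exact code:

```python
# Generic client-reuse pattern (illustrative only, not the exact code from the hunk above).
from typing import Optional

from openai import AsyncOpenAI


def get_async_client(client: Optional[AsyncOpenAI] = None, **client_kwargs) -> AsyncOpenAI:
    if client is None:
        return AsyncOpenAI(**client_kwargs)  # build a fresh client from kwargs
    return client  # reuse the caller-supplied client, keeping its max_retries setting
```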
@@ -1,51 +1,8 @@
environment_variables:
  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
  alerting:
  - slack
  alerting_threshold: 300
  database_connection_pool_limit: 100
  database_connection_timeout: 60
  health_check_interval: 300
  proxy_batch_write_at: 10
  ui_access_mode: all
litellm_settings:
  allowed_fails: 3
  failure_callback:
  - prometheus
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  num_retries: 3
  service_callback:
  - prometheus_system
  success_callback:
  - prometheus
model_list:
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_base: http://0.0.0.0:8080
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint
- litellm_params:
    model: gpt-3.5-turbo
  model_name: gpt-3.5-turbo
- model_name: llama-3
  litellm_params:
    model: replicate/meta/meta-llama-3-8b-instruct
router_settings:
  allowed_fails: 3
  context_window_fallbacks: null
  cooldown_time: 1
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  - gpt-3.5-turbo-3:
    - fake-openai-endpoint
  num_retries: 3
  retry_after: 0
  routing_strategy: simple-shuffle
  routing_strategy_args: {}
  timeout: 6000
  num_retries: 0
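For context, the router_settings block is what the proxy hands to the Router at startup, so a num_retries value set there (such as the num_retries: 0 above) takes precedence over the new in-code default. A rough sketch of that hand-off, assuming the YAML above is saved locally as config.yaml; the real proxy wiring is more involved:

```python
# Rough sketch, assuming the YAML above is saved as config.yaml;
# the actual proxy startup does considerably more than this.
import yaml
from litellm import Router

with open("config.yaml") as f:
    config = yaml.safe_load(f)

router = Router(
    model_list=config["model_list"],
    **config.get("router_settings", {}),  # num_retries, timeout, fallbacks, ...
)
```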
@@ -50,7 +50,7 @@ class Router:
     model_names: List = []
     cache_responses: Optional[bool] = False
     default_cache_time_seconds: int = 1 * 60 * 60  # 1 hour
-    num_retries: int = 0
+    num_retries: int = openai.DEFAULT_MAX_RETRIES
     tenacity = None
     leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
     lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@@ -70,7 +70,7 @@ class Router:
         ] = None,  # if you want to cache across model groups
         client_ttl: int = 3600,  # ttl for cached clients - will re-initialize after this time in seconds
         ## RELIABILITY ##
-        num_retries: int = 0,
+        num_retries: Optional[int] = None,
         timeout: Optional[float] = None,
         default_litellm_params={},  # default params for Router.chat.completion.create
         default_max_parallel_requests: Optional[int] = None,
@@ -229,7 +229,12 @@ class Router:
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
-        self.num_retries = num_retries or litellm.num_retries or 0
+
+        if num_retries is not None:
+            self.num_retries = num_retries
+        elif litellm.num_retries is not None:
+            self.num_retries = litellm.num_retries
+
         self.timeout = timeout or litellm.request_timeout

         self.retry_after = retry_after
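Taken together, the three router.py hunks above change the default retry behaviour: num_retries is now Optional in the constructor, and when neither the constructor argument nor the litellm.num_retries module setting supplies a value, the class-level default openai.DEFAULT_MAX_RETRIES (2 in current openai releases) is kept instead of falling back to 0. A minimal sketch of the resulting precedence; the standalone helper below is illustrative only, not the Router's actual method:

```python
# Sketch of the precedence implied by the hunks above; `openai` is the real package,
# but resolve_num_retries is a hypothetical helper for illustration.
import openai


def resolve_num_retries(constructor_value, litellm_module_value):
    """Return the retry count the Router would end up using."""
    if constructor_value is not None:        # Router(num_retries=...) wins
        return constructor_value
    if litellm_module_value is not None:     # then the litellm.num_retries global
        return litellm_module_value
    return openai.DEFAULT_MAX_RETRIES        # otherwise keep the class-level default (2)


assert resolve_num_retries(5, 1) == 5
assert resolve_num_retries(None, 1) == 1
assert resolve_num_retries(None, None) == openai.DEFAULT_MAX_RETRIES
```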
@@ -428,6 +433,7 @@
         kwargs["messages"] = messages
         kwargs["original_function"] = self._acompletion
         kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)

         timeout = kwargs.get("request_timeout", self.timeout)
         kwargs.setdefault("metadata", {}).update({"model_group": model})
@@ -1415,10 +1421,12 @@ class Router:
         context_window_fallbacks = kwargs.pop(
             "context_window_fallbacks", self.context_window_fallbacks
         )
-        verbose_router_logger.debug(
-            f"async function w/ retries: original_function - {original_function}"
-        )
+
         num_retries = kwargs.pop("num_retries")
+
+        verbose_router_logger.debug(
+            f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
+        )
         try:
             # if the function call is successful, no exception will be raised and we'll break out of the loop
             response = await original_function(*args, **kwargs)
@@ -1986,7 +1994,9 @@ class Router:
                 stream_timeout = litellm.get_secret(stream_timeout_env_name)
                 litellm_params["stream_timeout"] = stream_timeout

-            max_retries = litellm_params.pop("max_retries", 2)
+            max_retries = litellm_params.pop(
+                "max_retries", 0
+            )  # router handles retry logic
             if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
                 max_retries_env_name = max_retries.replace("os.environ/", "")
                 max_retries = litellm.get_secret(max_retries_env_name)
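This hunk flips the per-deployment client default from openai's 2 retries to 0, since the Router now owns retry logic; the "os.environ/" prefix convention for pointing max_retries at an environment variable is unchanged. A small sketch of the resolution order, using os.getenv in place of litellm.get_secret purely for illustration:

```python
# Illustrative helper mirroring the os.environ/ convention shown above;
# os.getenv stands in for litellm.get_secret here.
import os


def resolve_max_retries(litellm_params: dict) -> int:
    # Per-deployment clients now default to 0 retries; the Router retries instead.
    max_retries = litellm_params.pop("max_retries", 0)
    if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
        env_name = max_retries.replace("os.environ/", "")
        max_retries = os.getenv(env_name)
    return int(max_retries)


os.environ["MY_MAX_RETRIES"] = "3"
assert resolve_max_retries({"max_retries": "os.environ/MY_MAX_RETRIES"}) == 3
assert resolve_max_retries({}) == 0
```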
@@ -1,7 +1,7 @@
 #### What this tests ####
 # This tests litellm router

-import sys, os, time
+import sys, os, time, openai
 import traceback, asyncio
 import pytest

@@ -18,6 +18,45 @@ from dotenv import load_dotenv
 load_dotenv()


+@pytest.mark.parametrize("num_retries", [None, 2])
+@pytest.mark.parametrize("max_retries", [None, 4])
+def test_router_num_retries_init(num_retries, max_retries):
+    """
+    - test when num_retries set v/s not
+    - test client value when max retries set v/s not
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "max_retries": max_retries,
+                },
+                "model_info": {"id": 12345},
+            },
+        ],
+        num_retries=num_retries,
+    )
+
+    if num_retries is not None:
+        assert router.num_retries == num_retries
+    else:
+        assert router.num_retries == openai.DEFAULT_MAX_RETRIES
+
+    model_client = router._get_client(
+        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
+    )
+
+    if max_retries is not None:
+        assert getattr(model_client, "max_retries") == max_retries
+    else:
+        assert getattr(model_client, "max_retries") == 0
+
+
 def test_exception_raising():
     # this tests if the router raises an exception when invalid params are set
     # in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
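The new parametrized test expands to four combinations (num_retries in {None, 2} crossed with max_retries in {None, 4}). Purely as a usage sketch, one way to run just this test from Python rather than from the shell:

```python
# Usage sketch: run only the new parametrized test (four parameter combinations).
import pytest

pytest.main(["-q", "-k", "test_router_num_retries_init"])
```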
@@ -108,7 +108,7 @@ class LiteLLM_Params(BaseModel):
     stream_timeout: Optional[Union[float, str]] = (
         None  # timeout when making stream=True calls, if str, pass in as os.environ/
     )
-    max_retries: int = 2  # follows openai default of 2
+    max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
     ## VERTEX AI ##
     vertex_project: Optional[str] = None
@@ -146,9 +146,7 @@ class LiteLLM_Params(BaseModel):
         args.pop("self", None)
         args.pop("params", None)
         args.pop("__class__", None)
-        if max_retries is None:
-            max_retries = 2
-        elif isinstance(max_retries, str):
+        if max_retries is not None and isinstance(max_retries, str):
             max_retries = int(max_retries)  # cast to int
         super().__init__(max_retries=max_retries, **args, **params)

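With max_retries now Optional[int] = None and the unconditional fallback to 2 removed, an unset value stays None all the way to client creation, where the router-level default of 0 from the earlier hunk applies. A standalone sketch of the surviving coercion logic (a hypothetical helper, not the model's actual validator):

```python
from typing import Optional, Union


def normalize_max_retries(max_retries: Optional[Union[int, str]]) -> Optional[int]:
    # Only string values are coerced; None is passed through instead of becoming 2.
    if max_retries is not None and isinstance(max_retries, str):
        return int(max_retries)
    return max_retries


assert normalize_max_retries(None) is None
assert normalize_max_retries("4") == 4
assert normalize_max_retries(4) == 4
```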