From 160acc085a95be55dd73109fd7593f7438a61259 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 11:57:27 -0700
Subject: [PATCH 1/5] fix(router.py): fix default retry logic

---
 .gitignore                              |  1 +
 litellm/llms/openai.py                  |  1 +
 litellm/proxy/_super_secret_config.yaml | 47 ++-----------------------
 litellm/router.py                       | 24 +++++++++----
 litellm/tests/test_router.py            | 41 ++++++++++++++++++++-
 litellm/types/router.py                 |  6 ++--
 6 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/.gitignore b/.gitignore
index 357f3e1bf..abc4ecb0c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,4 @@ loadtest_kub.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index e3c012dab..f68ab235e 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
                 )
             else:
                 openai_aclient = client
+
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 9372d4ca8..bccc69e19 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -1,51 +1,8 @@
-environment_variables:
-  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
-general_settings:
-  alerting:
-  - slack
-  alerting_threshold: 300
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  proxy_batch_write_at: 10
-  ui_access_mode: all
-litellm_settings:
-  allowed_fails: 3
-  failure_callback:
-  - prometheus
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  num_retries: 3
-  service_callback:
-  - prometheus_system
-  success_callback:
-  - prometheus
 model_list:
 - litellm_params:
-    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_base: http://0.0.0.0:8080
     api_key: my-fake-key
     model: openai/my-fake-model
   model_name: fake-openai-endpoint
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_name: gpt-3.5-turbo
-- model_name: llama-3
-  litellm_params:
-    model: replicate/meta/meta-llama-3-8b-instruct
 router_settings:
-  allowed_fails: 3
-  context_window_fallbacks: null
-  cooldown_time: 1
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  - gpt-3.5-turbo-3:
-    - fake-openai-endpoint
-  num_retries: 3
-  retry_after: 0
-  routing_strategy: simple-shuffle
-  routing_strategy_args: {}
-  timeout: 6000
+  num_retries: 0
diff --git a/litellm/router.py b/litellm/router.py
index 371d8e8eb..1c2bb4464 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -50,7 +50,7 @@ class Router:
     model_names: List = []
     cache_responses: Optional[bool] = False
     default_cache_time_seconds: int = 1 * 60 * 60  # 1 hour
-    num_retries: int = 0
+    num_retries: int = openai.DEFAULT_MAX_RETRIES
     tenacity = None
     leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
     lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@@ -70,7 +70,7 @@ class Router:
         ] = None,  # if you want to cache across model groups
         client_ttl: int = 3600,  # ttl for cached clients - will re-initialize after this time in seconds
         ## RELIABILITY ##
-        num_retries: int = 0,
+        num_retries: Optional[int] = None,
         timeout: Optional[float] = None,
         default_litellm_params={},  # default params for Router.chat.completion.create
         default_max_parallel_requests: Optional[int] = None,
@@ -229,7 +229,12 @@ class Router:
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
-        self.num_retries = num_retries or litellm.num_retries or 0
+
+        if num_retries is not None:
+            self.num_retries = num_retries
+        elif litellm.num_retries is not None:
+            self.num_retries = litellm.num_retries
+
         self.timeout = timeout or litellm.request_timeout
         self.retry_after = retry_after
@@ -428,6 +433,7 @@ class Router:
             kwargs["messages"] = messages
             kwargs["original_function"] = self._acompletion
             kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
+
             timeout = kwargs.get("request_timeout", self.timeout)
             kwargs.setdefault("metadata", {}).update({"model_group": model})
@@ -1415,10 +1421,12 @@ class Router:
             context_window_fallbacks = kwargs.pop(
                 "context_window_fallbacks", self.context_window_fallbacks
             )
-            verbose_router_logger.debug(
-                f"async function w/ retries: original_function - {original_function}"
-            )
+
             num_retries = kwargs.pop("num_retries")
+
+            verbose_router_logger.debug(
+                f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
+            )
             try:
                 # if the function call is successful, no exception will be raised and we'll break out of the loop
                 response = await original_function(*args, **kwargs)
@@ -1986,7 +1994,9 @@ class Router:
                     stream_timeout = litellm.get_secret(stream_timeout_env_name)
                     litellm_params["stream_timeout"] = stream_timeout
 
-                max_retries = litellm_params.pop("max_retries", 2)
+                max_retries = litellm_params.pop(
+                    "max_retries", 0
+                )  # router handles retry logic
                 if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
                     max_retries_env_name = max_retries.replace("os.environ/", "")
                     max_retries = litellm.get_secret(max_retries_env_name)
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index 7beb1d67c..ed486d6f5 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -1,7 +1,7 @@
 #### What this tests ####
 # This tests litellm router
 
-import sys, os, time
+import sys, os, time, openai
 import traceback, asyncio
 import pytest
 
@@ -18,6 +18,45 @@ from dotenv import load_dotenv
 load_dotenv()
 
 
+@pytest.mark.parametrize("num_retries", [None, 2])
+@pytest.mark.parametrize("max_retries", [None, 4])
+def test_router_num_retries_init(num_retries, max_retries):
+    """
+    - test when num_retries set v/s not
+    - test client value when max retries set v/s not
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "max_retries": max_retries,
+                },
+                "model_info": {"id": 12345},
+            },
+        ],
+        num_retries=num_retries,
+    )
+
+    if num_retries is not None:
+        assert router.num_retries == num_retries
+    else:
+        assert router.num_retries == openai.DEFAULT_MAX_RETRIES
+
+    model_client = router._get_client(
+        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
+    )
+
+    if max_retries is not None:
+        assert getattr(model_client, "max_retries") == max_retries
+    else:
+        assert getattr(model_client, "max_retries") == 0
+
+
 def test_exception_raising():
     # this tests if the router raises an exception when invalid params are set
     # in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
diff --git a/litellm/types/router.py b/litellm/types/router.py
index c5ec47091..1bd8bda97 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -108,7 +108,7 @@ class LiteLLM_Params(BaseModel):
     stream_timeout: Optional[Union[float, str]] = (
         None  # timeout when making stream=True calls, if str, pass in as os.environ/
     )
-    max_retries: int = 2  # follows openai default of 2
+    max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
     ## VERTEX AI ##
     vertex_project: Optional[str] = None
@@ -146,9 +146,7 @@ class LiteLLM_Params(BaseModel):
         args.pop("self", None)
         args.pop("params", None)
         args.pop("__class__", None)
-        if max_retries is None:
-            max_retries = 2
-        elif isinstance(max_retries, str):
+        if max_retries is not None and isinstance(max_retries, str):
             max_retries = int(max_retries)  # cast to int
         super().__init__(max_retries=max_retries, **args, **params)

From a81945464702e708432b04716040b9bea0f636d8 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 13:31:19 -0700
Subject: [PATCH 2/5] test(test_completion.py): fix test to not raise exception if it works

---
 litellm/tests/test_completion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 1f12f75ee..1d30f8829 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1781,7 +1781,6 @@ def test_completion_replicate_llama3():
         print("RESPONSE STRING\n", response_str)
         if type(response_str) != str:
             pytest.fail(f"Error occurred: {e}")
-        raise Exception("it worked!")
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

From 54241f25516013f06d016aa21ac4703f78275d42 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 17:43:40 -0700
Subject: [PATCH 3/5] test(test_router_fallbacks.py): fix testing

---
 litellm/llms/prompt_templates/factory.py | 5 +----
 litellm/tests/test_router_fallbacks.py   | 3 ++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index a6d1d6438..1a576f43a 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -1359,11 +1359,8 @@ def prompt_factory(
             "meta-llama/llama-3" in model or "meta-llama-3" in model
         ) and "instruct" in model:
             return hf_chat_template(
-                model=model,
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
                 messages=messages,
-                chat_template=known_tokenizer_config[  # type: ignore
-                    "meta-llama/Meta-Llama-3-8B-Instruct"
-                ]["tokenizer"]["chat_template"],
             )
         elif (
             "tiiuae/falcon" in model
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index 98a2449f0..51d9451a8 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -258,6 +258,7 @@ def test_sync_fallbacks_embeddings():
         model_list=model_list,
         fallbacks=[{"bad-azure-embedding-model": ["good-azure-embedding-model"]}],
         set_verbose=False,
+        num_retries=0,
     )
     customHandler = MyCustomHandler()
     litellm.callbacks = [customHandler]
@@ -393,7 +394,7 @@ def test_dynamic_fallbacks_sync():
         },
     ]
 
-    router = Router(model_list=model_list, set_verbose=True)
+    router = Router(model_list=model_list, set_verbose=True, num_retries=0)
     kwargs = {}
     kwargs["model"] = "azure/gpt-3.5-turbo"
     kwargs["messages"] = [{"role": "user", "content": "Hey, how's it going?"}]

From 19852310220fe8327f60f81753972de30d6e4885 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 18:06:25 -0700
Subject: [PATCH 4/5] test(test_timeout.py): explicitly set num retries = 0

---
 litellm/tests/test_timeout.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py
index 8c92607c0..259689167 100644
--- a/litellm/tests/test_timeout.py
+++ b/litellm/tests/test_timeout.py
@@ -78,7 +78,8 @@ def test_hanging_request_azure():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )
 
     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]

From 4c5398b556fbedfdf4389ec23e6af53ac389ff97 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 19:35:30 -0700
Subject: [PATCH 5/5] test(test_timeout.py): fix test

---
 litellm/tests/test_timeout.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py
index 259689167..d38da52e5 100644
--- a/litellm/tests/test_timeout.py
+++ b/litellm/tests/test_timeout.py
@@ -132,7 +132,8 @@ def test_hanging_request_openai():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )
 
     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@@ -190,6 +191,7 @@ def test_timeout_streaming():
 # test_timeout_streaming()
 
 
+@pytest.mark.skip(reason="local test")
 def test_timeout_ollama():
     # this Will Raise a timeout
     import litellm
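
Illustrative note (not part of the patch series): a minimal sketch of the retry behavior PATCH 1/5 introduces, assuming the openai v1 Python SDK and that `litellm.num_retries` has not been set elsewhere; the model name, API key, and base URL below are placeholders taken from the test config in the diff, not a recommended setup.

```python
import openai
from litellm import Router

# After this patch, Router resolves num_retries in this order:
#   1. num_retries passed to Router(...)
#   2. litellm.num_retries, if set
#   3. openai.DEFAULT_MAX_RETRIES (the new class-level default, 2 in openai v1)
router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",           # placeholder credentials
                "api_base": "http://0.0.0.0:8080",  # placeholder endpoint
                # max_retries left unset -> the per-deployment client now
                # defaults to 0 retries; the router owns the retry logic
            },
        }
    ],
    # num_retries omitted -> falls back to openai.DEFAULT_MAX_RETRIES
)

assert router.num_retries == openai.DEFAULT_MAX_RETRIES
```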