From 160acc085a95be55dd73109fd7593f7438a61259 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 11:57:27 -0700
Subject: [PATCH 1/5] fix(router.py): fix default retry logic

---
 .gitignore                              |  1 +
 litellm/llms/openai.py                  |  1 +
 litellm/proxy/_super_secret_config.yaml | 47 ++-----------------------
 litellm/router.py                       | 24 +++++++++----
 litellm/tests/test_router.py            | 41 ++++++++++++++++++++-
 litellm/types/router.py                 |  6 ++--
 6 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/.gitignore b/.gitignore
index 357f3e1bf..abc4ecb0c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,4 @@ loadtest_kub.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index e3c012dab..f68ab235e 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
                 )
             else:
                 openai_aclient = client
+
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 9372d4ca8..bccc69e19 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -1,51 +1,8 @@
-environment_variables:
-  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
-general_settings:
-  alerting:
-  - slack
-  alerting_threshold: 300
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  proxy_batch_write_at: 10
-  ui_access_mode: all
-litellm_settings:
-  allowed_fails: 3
-  failure_callback:
-  - prometheus
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  num_retries: 3
-  service_callback:
-  - prometheus_system
-  success_callback:
-  - prometheus
 model_list:
 - litellm_params:
-    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_base: http://0.0.0.0:8080
     api_key: my-fake-key
     model: openai/my-fake-model
   model_name: fake-openai-endpoint
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_name: gpt-3.5-turbo
-- model_name: llama-3
-  litellm_params:
-    model: replicate/meta/meta-llama-3-8b-instruct
 router_settings:
-  allowed_fails: 3
-  context_window_fallbacks: null
-  cooldown_time: 1
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  - gpt-3.5-turbo-3:
-    - fake-openai-endpoint
-  num_retries: 3
-  retry_after: 0
-  routing_strategy: simple-shuffle
-  routing_strategy_args: {}
-  timeout: 6000
+  num_retries: 0
diff --git a/litellm/router.py b/litellm/router.py
index 371d8e8eb..1c2bb4464 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -50,7 +50,7 @@ class Router:
     model_names: List = []
     cache_responses: Optional[bool] = False
     default_cache_time_seconds: int = 1 * 60 * 60  # 1 hour
-    num_retries: int = 0
+    num_retries: int = openai.DEFAULT_MAX_RETRIES
     tenacity = None
     leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
     lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@@ -70,7 +70,7 @@ class Router:
         ] = None,  # if you want to cache across model groups
         client_ttl: int = 3600,  # ttl for cached clients - will re-initialize after this time in seconds
         ## RELIABILITY ##
-        num_retries: int = 0,
+        num_retries: Optional[int] = None,
         timeout: Optional[float] = None,
         default_litellm_params={},  # default params for Router.chat.completion.create
         default_max_parallel_requests: Optional[int] = None,
@@ -229,7 +229,12 @@ class Router:
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
-        self.num_retries = num_retries or litellm.num_retries or 0
+
+        if num_retries is not None:
+            self.num_retries = num_retries
+        elif litellm.num_retries is not None:
+            self.num_retries = litellm.num_retries
+
         self.timeout = timeout or litellm.request_timeout
         self.retry_after = retry_after
@@ -428,6 +433,7 @@ class Router:
             kwargs["messages"] = messages
             kwargs["original_function"] = self._acompletion
             kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
+
             timeout = kwargs.get("request_timeout", self.timeout)
             kwargs.setdefault("metadata", {}).update({"model_group": model})
@@ -1415,10 +1421,12 @@ class Router:
             context_window_fallbacks = kwargs.pop(
                 "context_window_fallbacks", self.context_window_fallbacks
             )
-            verbose_router_logger.debug(
-                f"async function w/ retries: original_function - {original_function}"
-            )
+
             num_retries = kwargs.pop("num_retries")
+
+            verbose_router_logger.debug(
+                f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
+            )
             try:
                 # if the function call is successful, no exception will be raised and we'll break out of the loop
                 response = await original_function(*args, **kwargs)
@@ -1986,7 +1994,9 @@ class Router:
                     stream_timeout = litellm.get_secret(stream_timeout_env_name)
                     litellm_params["stream_timeout"] = stream_timeout
 
-                max_retries = litellm_params.pop("max_retries", 2)
+                max_retries = litellm_params.pop(
+                    "max_retries", 0
+                )  # router handles retry logic
                 if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
                     max_retries_env_name = max_retries.replace("os.environ/", "")
                     max_retries = litellm.get_secret(max_retries_env_name)
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index 7beb1d67c..ed486d6f5 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -1,7 +1,7 @@
 #### What this tests ####
 # This tests litellm router
 
-import sys, os, time
+import sys, os, time, openai
 import traceback, asyncio
 import pytest
 
@@ -18,6 +18,45 @@ from dotenv import load_dotenv
 load_dotenv()
 
 
+@pytest.mark.parametrize("num_retries", [None, 2])
+@pytest.mark.parametrize("max_retries", [None, 4])
+def test_router_num_retries_init(num_retries, max_retries):
+    """
+    - test when num_retries set v/s not
+    - test client value when max retries set v/s not
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "max_retries": max_retries,
+                },
+                "model_info": {"id": 12345},
+            },
+        ],
+        num_retries=num_retries,
+    )
+
+    if num_retries is not None:
+        assert router.num_retries == num_retries
+    else:
+        assert router.num_retries == openai.DEFAULT_MAX_RETRIES
+
+    model_client = router._get_client(
+        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
+    )
+
+    if max_retries is not None:
+        assert getattr(model_client, "max_retries") == max_retries
+    else:
+        assert getattr(model_client, "max_retries") == 0
+
+
 def test_exception_raising():
     # this tests if the router raises an exception when invalid params are set
     # in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
diff --git a/litellm/types/router.py b/litellm/types/router.py
index c5ec47091..1bd8bda97 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -108,7 +108,7 @@ class LiteLLM_Params(BaseModel):
     stream_timeout: Optional[Union[float, str]] = (
         None  # timeout when making stream=True calls, if str, pass in as os.environ/
     )
-    max_retries: int = 2  # follows openai default of 2
+    max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
     ## VERTEX AI ##
     vertex_project: Optional[str] = None
@@ -146,9 +146,7 @@ class LiteLLM_Params(BaseModel):
         args.pop("self", None)
         args.pop("params", None)
         args.pop("__class__", None)
-        if max_retries is None:
-            max_retries = 2
-        elif isinstance(max_retries, str):
+        if max_retries is not None and isinstance(max_retries, str):
             max_retries = int(max_retries)  # cast to int
         super().__init__(max_retries=max_retries, **args, **params)

From a81945464702e708432b04716040b9bea0f636d8 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 13:31:19 -0700
Subject: [PATCH 2/5] test(test_completion.py): fix test to not raise exception if it works

---
 litellm/tests/test_completion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 1f12f75ee..1d30f8829 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1781,7 +1781,6 @@ def test_completion_replicate_llama3():
         print("RESPONSE STRING\n", response_str)
         if type(response_str) != str:
             pytest.fail(f"Error occurred: {e}")
-        raise Exception("it worked!")
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

From 54241f25516013f06d016aa21ac4703f78275d42 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 17:43:40 -0700
Subject: [PATCH 3/5] test(test_router_fallbacks.py): fix testing

---
 litellm/llms/prompt_templates/factory.py | 5 +----
 litellm/tests/test_router_fallbacks.py   | 3 ++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index a6d1d6438..1a576f43a 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -1359,11 +1359,8 @@ def prompt_factory(
             "meta-llama/llama-3" in model or "meta-llama-3" in model
         ) and "instruct" in model:
             return hf_chat_template(
-                model=model,
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
                 messages=messages,
-                chat_template=known_tokenizer_config[  # type: ignore
-                    "meta-llama/Meta-Llama-3-8B-Instruct"
-                ]["tokenizer"]["chat_template"],
             )
         elif (
             "tiiuae/falcon" in model
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index 98a2449f0..51d9451a8 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -258,6 +258,7 @@ def test_sync_fallbacks_embeddings():
         model_list=model_list,
         fallbacks=[{"bad-azure-embedding-model": ["good-azure-embedding-model"]}],
         set_verbose=False,
+        num_retries=0,
     )
     customHandler = MyCustomHandler()
     litellm.callbacks = [customHandler]
@@ -393,7 +394,7 @@ def test_dynamic_fallbacks_sync():
         },
     ]
 
-    router = Router(model_list=model_list, set_verbose=True)
+    router = Router(model_list=model_list, set_verbose=True, num_retries=0)
     kwargs = {}
     kwargs["model"] = "azure/gpt-3.5-turbo"
     kwargs["messages"] = [{"role": "user", "content": "Hey, how's it going?"}]

From 19852310220fe8327f60f81753972de30d6e4885 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 18:06:25 -0700
Subject: [PATCH 4/5] test(test_timeout.py): explicitly set num retries = 0

---
 litellm/tests/test_timeout.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py
index 8c92607c0..259689167 100644
--- a/litellm/tests/test_timeout.py
+++ b/litellm/tests/test_timeout.py
@@ -78,7 +78,8 @@ def test_hanging_request_azure():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )
 
     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]

From 4c5398b556fbedfdf4389ec23e6af53ac389ff97 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 25 Apr 2024 19:35:30 -0700
Subject: [PATCH 5/5] test(test_timeout.py): fix test

---
 litellm/tests/test_timeout.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py
index 259689167..d38da52e5 100644
--- a/litellm/tests/test_timeout.py
+++ b/litellm/tests/test_timeout.py
@@ -132,7 +132,8 @@ def test_hanging_request_openai():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )
 
     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@@ -190,6 +191,7 @@ def test_timeout_streaming():
 # test_timeout_streaming()
 
 
+@pytest.mark.skip(reason="local test")
 def test_timeout_ollama():
     # this Will Raise a timeout
     import litellm
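
Illustrative note (not part of the patch series): a minimal sketch of the retry behavior PATCH 1/5 introduces, assuming the openai v1 Python SDK and that `litellm.num_retries` has not been set elsewhere; the model name, API key, and base URL below are placeholders taken from the test config in the diff, not a recommended setup.

```python
import openai
from litellm import Router

# After this patch, Router resolves num_retries in this order:
#   1. num_retries passed to Router(...)
#   2. litellm.num_retries, if set
#   3. openai.DEFAULT_MAX_RETRIES (the new class-level default, 2 in openai v1)
router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",           # placeholder credentials
                "api_base": "http://0.0.0.0:8080",  # placeholder endpoint
                # max_retries left unset -> the per-deployment client now
                # defaults to 0 retries; the router owns the retry logic
            },
        }
    ],
    # num_retries omitted -> falls back to openai.DEFAULT_MAX_RETRIES
)

assert router.num_retries == openai.DEFAULT_MAX_RETRIES
```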