From 5488bf4921b44bf3865875c0eb82c0b8b68c9a24 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 17:49:56 -0700
Subject: [PATCH 01/10] feat(router.py): enable default fallbacks

allow user to define a generic list of fallbacks, in case a new deployment is bad

Closes https://github.com/BerriAI/litellm/issues/3623
---
 litellm/router.py                      | 53 +++++++++++++++++++++-----
 litellm/tests/test_router_fallbacks.py | 47 +++++++++++++++++++++++
 litellm/types/router.py                | 43 ++++++++++++++++++++-
 3 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/litellm/router.py b/litellm/router.py
index e524937ae..fe8709294 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -9,7 +9,7 @@
 
 import copy, httpx
 from datetime import datetime
-from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO, Tuple
+from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO, Tuple, TypedDict
 from typing_extensions import overload
 import random, threading, time, traceback, uuid
 import litellm, openai, hashlib, json
@@ -47,6 +47,7 @@ from litellm.types.router import (
     updateLiteLLMParams,
     RetryPolicy,
     AlertingConfig,
+    DeploymentTypedDict,
 )
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.llms.azure import get_azure_ad_token_from_oidc
@@ -62,7 +63,7 @@ class Router:
 
     def __init__(
         self,
-        model_list: Optional[list] = None,
+        model_list: Optional[List[DeploymentTypedDict]] = None,
         ## CACHING ##
         redis_url: Optional[str] = None,
         redis_host: Optional[str] = None,
@@ -83,6 +84,9 @@ class Router:
         default_max_parallel_requests: Optional[int] = None,
         set_verbose: bool = False,
         debug_level: Literal["DEBUG", "INFO"] = "INFO",
+        default_fallbacks: Optional[
+            List[str]
+        ] = None,  # generic fallbacks, works across all deployments
         fallbacks: List = [],
         context_window_fallbacks: List = [],
         model_group_alias: Optional[dict] = {},
@@ -259,6 +263,11 @@ class Router:
         self.retry_after = retry_after
         self.routing_strategy = routing_strategy
         self.fallbacks = fallbacks or litellm.fallbacks
+        if default_fallbacks is not None:
+            if self.fallbacks is not None:
+                self.fallbacks.append({"*": default_fallbacks})
+            else:
+                self.fallbacks = [{"*": default_fallbacks}]
         self.context_window_fallbacks = (
             context_window_fallbacks or litellm.context_window_fallbacks
         )
@@ -1471,13 +1480,21 @@ class Router:
                             pass
                 elif fallbacks is not None:
                     verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
-                    for item in fallbacks:
-                        key_list = list(item.keys())
-                        if len(key_list) == 0:
-                            continue
-                        if key_list[0] == model_group:
+                    generic_fallback_idx: Optional[int] = None
+                    ## check for specific model group-specific fallbacks
+                    for idx, item in enumerate(fallbacks):
+                        if list(item.keys())[0] == model_group:
                             fallback_model_group = item[model_group]
                             break
+                        elif list(item.keys())[0] == "*":
+                            generic_fallback_idx = idx
+                    ## if none, check for generic fallback
+                    if (
+                        fallback_model_group is None
+                        and generic_fallback_idx is not None
+                    ):
+                        fallback_model_group = fallbacks[generic_fallback_idx]["*"]
+
                     if fallback_model_group is None:
                         verbose_router_logger.info(
                             f"No fallback model group found for original model_group={model_group}. Fallbacks={fallbacks}"
@@ -1537,7 +1554,7 @@ class Router:
              
             """
             _healthy_deployments = await self._async_get_healthy_deployments(
-                model=kwargs.get("model"),
+                model=kwargs.get("model") or "",
             )
 
             # raises an exception if this error should not be retries
@@ -1644,12 +1661,18 @@ class Router:
         Try calling the function_with_retries
         If it fails after num_retries, fall back to another model group
         """
+        mock_testing_fallbacks = kwargs.pop("mock_testing_fallbacks", None)
         model_group = kwargs.get("model")
         fallbacks = kwargs.get("fallbacks", self.fallbacks)
         context_window_fallbacks = kwargs.get(
             "context_window_fallbacks", self.context_window_fallbacks
         )
         try:
+            if mock_testing_fallbacks is not None and mock_testing_fallbacks == True:
+                raise Exception(
+                    f"This is a mock exception for model={model_group}, to trigger a fallback. Fallbacks={fallbacks}"
+                )
+
             response = self.function_with_retries(*args, **kwargs)
             return response
         except Exception as e:
@@ -1658,7 +1681,7 @@ class Router:
             try:
                 if (
                     hasattr(e, "status_code")
-                    and e.status_code == 400
+                    and e.status_code == 400  # type: ignore
                     and not isinstance(e, litellm.ContextWindowExceededError)
                 ):  # don't retry a malformed request
                     raise e
@@ -1700,10 +1723,20 @@ class Router:
                 elif fallbacks is not None:
                     verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
                     fallback_model_group = None
-                    for item in fallbacks:
+                    generic_fallback_idx: Optional[int] = None
+                    ## check for specific model group-specific fallbacks
+                    for idx, item in enumerate(fallbacks):
                         if list(item.keys())[0] == model_group:
                             fallback_model_group = item[model_group]
                             break
+                        elif list(item.keys())[0] == "*":
+                            generic_fallback_idx = idx
+                    ## if none, check for generic fallback
+                    if (
+                        fallback_model_group is None
+                        and generic_fallback_idx is not None
+                    ):
+                        fallback_model_group = fallbacks[generic_fallback_idx]["*"]
 
                     if fallback_model_group is None:
                         raise original_exception
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index ce2b014e9..4ab97b274 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -1007,3 +1007,50 @@ async def test_service_unavailable_fallbacks(sync_mode):
         )
 
     assert response.model == "gpt-35-turbo"
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_default_model_fallbacks(sync_mode):
+    """
+    Related issue - https://github.com/BerriAI/litellm/issues/3623
+
+    If model misconfigured, setup a default model for generic fallback
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "bad-model",
+                "litellm_params": {
+                    "model": "openai/my-bad-model",
+                    "api_key": "my-bad-api-key",
+                },
+            },
+            {
+                "model_name": "my-good-model",
+                "litellm_params": {
+                    "model": "gpt-4o",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+        ],
+        default_fallbacks=["my-good-model"],
+    )
+
+    if sync_mode:
+        response = router.completion(
+            model="bad-model",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            mock_testing_fallbacks=True,
+            mock_response="Hey! nice day",
+        )
+    else:
+        response = await router.acompletion(
+            model="bad-model",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            mock_testing_fallbacks=True,
+            mock_response="Hey! nice day",
+        )
+
+    assert isinstance(response, litellm.ModelResponse)
+    assert response.model is not None and response.model == "gpt-4o"
diff --git a/litellm/types/router.py b/litellm/types/router.py
index e8f3ff641..68ee387fe 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, Dict, Tuple, Literal
+from typing import List, Optional, Union, Dict, Tuple, Literal, TypedDict
 import httpx
 from pydantic import BaseModel, validator, Field
 from .completion import CompletionRequest
@@ -277,6 +277,47 @@ class updateDeployment(BaseModel):
         protected_namespaces = ()
 
 
+class LiteLLMParamsTypedDict(TypedDict, total=False):
+    """
+    [TODO]
+    - allow additional params (not in list)
+    - set value to none if not set -> don't raise error if value not set
+    """
+
+    model: str
+    custom_llm_provider: Optional[str]
+    tpm: Optional[int]
+    rpm: Optional[int]
+    api_key: Optional[str]
+    api_base: Optional[str]
+    api_version: Optional[str]
+    timeout: Optional[Union[float, str, httpx.Timeout]]
+    stream_timeout: Optional[Union[float, str]]
+    max_retries: Optional[int]
+    organization: Optional[str]  # for openai orgs
+    ## UNIFIED PROJECT/REGION ##
+    region_name: Optional[str]
+    ## VERTEX AI ##
+    vertex_project: Optional[str]
+    vertex_location: Optional[str]
+    ## AWS BEDROCK / SAGEMAKER ##
+    aws_access_key_id: Optional[str]
+    aws_secret_access_key: Optional[str]
+    aws_region_name: Optional[str]
+    ## IBM WATSONX ##
+    watsonx_region_name: Optional[str]
+    ## CUSTOM PRICING ##
+    input_cost_per_token: Optional[float]
+    output_cost_per_token: Optional[float]
+    input_cost_per_second: Optional[float]
+    output_cost_per_second: Optional[float]
+
+
+class DeploymentTypedDict(TypedDict):
+    model_name: str
+    litellm_params: LiteLLMParamsTypedDict
+
+
 class Deployment(BaseModel):
     model_name: str
     litellm_params: LiteLLM_Params

From 38988f030ac74c8702473a76a43a070fe706635a Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:06:10 -0700
Subject: [PATCH 02/10] fix(router.py): fix typing

---
 litellm/router.py                | 2 +-
 litellm/tests/test_azure_perf.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/litellm/router.py b/litellm/router.py
index fe8709294..b4603c6d0 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -63,7 +63,7 @@ class Router:
 
     def __init__(
         self,
-        model_list: Optional[List[DeploymentTypedDict]] = None,
+        model_list: Optional[List[Union[DeploymentTypedDict, Dict]]] = None,
         ## CACHING ##
         redis_url: Optional[str] = None,
         redis_host: Optional[str] = None,
diff --git a/litellm/tests/test_azure_perf.py b/litellm/tests/test_azure_perf.py
index 9654f1273..8afc59f92 100644
--- a/litellm/tests/test_azure_perf.py
+++ b/litellm/tests/test_azure_perf.py
@@ -26,7 +26,7 @@ model_list = [
     }
 ]
 
-router = litellm.Router(model_list=model_list)
+router = litellm.Router(model_list=model_list)  # type: ignore
 
 
 async def _openai_completion():

From 3694b5e7c078695a23d673aa28f3001bb869f938 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:12:01 -0700
Subject: [PATCH 03/10] refactor(main.py): trigger new build

---
 litellm/main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/litellm/main.py b/litellm/main.py
index d20896631..6156d9c39 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -15,7 +15,6 @@ import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
 import litellm
-
 from ._logging import verbose_logger
 from litellm import (  # type: ignore
     client,

From 29449aa5c1c70f879ba790dd0eba0c3308840dbc Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:13:13 -0700
Subject: [PATCH 04/10] fix(utils.py): fix watsonx exception mapping

---
 litellm/tests/test_completion.py | 2 ++
 litellm/utils.py                 | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 45a53ca56..d08a4ae3b 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -3342,6 +3342,8 @@ def test_completion_watsonx():
         print(response)
     except litellm.APIError as e:
         pass
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
diff --git a/litellm/utils.py b/litellm/utils.py
index 14534147b..4a8e7e691 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8514,7 +8514,7 @@ def exception_type(
                     request=original_exception.request,
                 )
             elif custom_llm_provider == "watsonx":
-                if "token_quota_reached" in error_response:
+                if "token_quota_reached" in error_str:
                     exception_mapping_worked = True
                     raise RateLimitError(
                         message=f"WatsonxException: Rate Limit Errror - {error_str}",

From 228ed25de594198ee786efbf927e84178daab97b Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:16:05 -0700
Subject: [PATCH 05/10] docs(exception_mapping.md): add watsonx exception
 mapping to docs

---
 docs/my-website/docs/exception_mapping.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/my-website/docs/exception_mapping.md b/docs/my-website/docs/exception_mapping.md
index 2345e9f83..9ee93269f 100644
--- a/docs/my-website/docs/exception_mapping.md
+++ b/docs/my-website/docs/exception_mapping.md
@@ -111,6 +111,7 @@ Base case - we return the original exception.
 | custom_llm_provider        | Timeout | ContextWindowExceededError | BadRequestError | NotFoundError | ContentPolicyViolationError | AuthenticationError | APIError | RateLimitError | ServiceUnavailableError | PermissionDeniedError | UnprocessableEntityError |
 |----------------------------|---------|----------------------------|------------------|---------------|-----------------------------|---------------------|----------|----------------|-------------------------|-----------------------|-------------------------|
 | openai                     | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
+| watsonx                     |       | | | | | | |✓| | | |
 | text-completion-openai     | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
 | custom_openai              | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
 | openai_compatible_providers| ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |

From 4ec3b4d9a82dee9343c252d6e43eb2ebd49d7380 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:17:02 -0700
Subject: [PATCH 06/10] docs(exception_mapping.md): cleanup docs

---
 docs/my-website/docs/exception_mapping.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/my-website/docs/exception_mapping.md b/docs/my-website/docs/exception_mapping.md
index 9ee93269f..5e6006ebe 100644
--- a/docs/my-website/docs/exception_mapping.md
+++ b/docs/my-website/docs/exception_mapping.md
@@ -106,7 +106,7 @@ To see how it's implemented - [check out the code](https://github.com/BerriAI/li
 
 ## Custom mapping list
 
-Base case - we return the original exception.
+Base case - we return `litellm.APIConnectionError` exception (inherits from openai's APIConnectionError exception).
 
 | custom_llm_provider        | Timeout | ContextWindowExceededError | BadRequestError | NotFoundError | ContentPolicyViolationError | AuthenticationError | APIError | RateLimitError | ServiceUnavailableError | PermissionDeniedError | UnprocessableEntityError |
 |----------------------------|---------|----------------------------|------------------|---------------|-----------------------------|---------------------|----------|----------------|-------------------------|-----------------------|-------------------------|

From 155f1f164f1cd7e55eb1430990bbce45b949e6b8 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:18:22 -0700
Subject: [PATCH 07/10] refactor(utils.py): trigger local_testing

---
 litellm/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/utils.py b/litellm/utils.py
index 4a8e7e691..ab0f5c529 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -13,6 +13,7 @@ import dotenv, json, traceback, threading, base64, ast
 import subprocess, os
 from os.path import abspath, join, dirname
 import litellm, openai
+
 import itertools
 import random, uuid, requests  # type: ignore
 from functools import wraps

From d4123951d9e9ee463a11c8de3e26c26dc11b0a7e Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:27:39 -0700
Subject: [PATCH 08/10] test: handle watsonx rate limit error

---
 litellm/main.py                  | 1 +
 litellm/tests/test_completion.py | 2 ++
 litellm/tests/test_embedding.py  | 2 ++
 litellm/tests/test_streaming.py  | 5 ++++-
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/litellm/main.py b/litellm/main.py
index 6156d9c39..d20896631 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -15,6 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
 import litellm
+
 from ._logging import verbose_logger
 from litellm import (  # type: ignore
     client,
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index d08a4ae3b..13c0d2f96 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -3447,6 +3447,8 @@ async def test_acompletion_stream_watsonx():
         # Add any assertions here to check the response
         async for chunk in response:
             print(chunk)
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py
index 8da847b64..a441b0e70 100644
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@@ -494,6 +494,8 @@ def test_watsonx_embeddings():
         )
         print(f"response: {response}")
         assert isinstance(response.usage, litellm.Usage)
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index a948a5683..6dcdbeb17 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -456,7 +456,8 @@ def test_completion_claude_stream():
         print(f"completion_response: {complete_response}")
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-    
+
+
 # test_completion_claude_stream()
 def test_completion_claude_2_stream():
     litellm.set_verbose = True
@@ -1416,6 +1417,8 @@ def test_completion_watsonx_stream():
             raise Exception("finish reason not set for last chunk")
         if complete_response.strip() == "":
             raise Exception("Empty response received")
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 

From 724d880a4513a5790add1674bfe0ae665b414bf7 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 18:40:51 -0700
Subject: [PATCH 09/10] test(test_completion.py): handle async watsonx call
 fail

---
 litellm/main.py                  | 1 -
 litellm/tests/test_completion.py | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/litellm/main.py b/litellm/main.py
index d20896631..6156d9c39 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -15,7 +15,6 @@ import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
 import litellm
-
 from ._logging import verbose_logger
 from litellm import (  # type: ignore
     client,
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 13c0d2f96..5ee296197 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -3427,6 +3427,8 @@ async def test_acompletion_watsonx():
         )
         # Add any assertions here to check the response
         print(response)
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 

From 071a70c5fcbc8bcb64b62bbb790d314fbbe82638 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia <krrishdholakia@gmail.com>
Date: Mon, 13 May 2024 19:01:19 -0700
Subject: [PATCH 10/10] test: fix watsonx api error

---
 litellm/tests/test_completion.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 5ee296197..aa0baa2b4 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -3363,6 +3363,8 @@ def test_completion_stream_watsonx():
             print(chunk)
     except litellm.APIError as e:
         pass
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")