From 33c9c16388fbcf013e7a4538b087cfe5f5c323c1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:01:52 -0700 Subject: [PATCH 01/11] feat(caching.py): redis cluster support Closes https://github.com/BerriAI/litellm/issues/4358 --- litellm/_redis.py | 58 +++++++++++++++++++++++++-- litellm/caching.py | 14 +++++-- litellm/proxy/_new_secret_config.yaml | 7 ++++ litellm/proxy/proxy_server.py | 2 +- litellm/tests/test_caching.py | 32 +++++++++++++++ 5 files changed, 106 insertions(+), 7 deletions(-) diff --git a/litellm/_redis.py b/litellm/_redis.py index d72016dcd9..23f82ed1a7 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -7,13 +7,17 @@ # # Thank you users! We ❤️ you! - Krrish & Ishaan +import inspect + # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation import os -import inspect -import redis, litellm # type: ignore -import redis.asyncio as async_redis # type: ignore from typing import List, Optional +import redis # type: ignore +import redis.asyncio as async_redis # type: ignore + +import litellm + def _get_redis_kwargs(): arg_spec = inspect.getfullargspec(redis.Redis) @@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None): return available_args +def _get_redis_cluster_kwargs(client=None): + if client is None: + client = redis.Redis.from_url + arg_spec = inspect.getfullargspec(redis.RedisCluster) + + # Only allow primitive arguments + exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"} + + available_args = [x for x in arg_spec.args if x not in exclude_args] + + return available_args + + def _get_redis_env_kwarg_mapping(): PREFIX = "REDIS_" @@ -124,6 +141,22 @@ def get_redis_client(**env_overrides): url_kwargs[arg] = redis_kwargs[arg] return redis.Redis.from_url(**url_kwargs) + + if "startup_nodes" in redis_kwargs: + from redis.cluster import ClusterNode + + args = _get_redis_cluster_kwargs() + cluster_kwargs = {} + for arg in redis_kwargs: + if arg in args: + cluster_kwargs[arg] = redis_kwargs[arg] + + new_startup_nodes: List[ClusterNode] = [] + + for item in redis_kwargs["startup_nodes"]: + new_startup_nodes.append(ClusterNode(**item)) + redis_kwargs.pop("startup_nodes") + return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs) return redis.Redis(**redis_kwargs) @@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides): ) return async_redis.Redis.from_url(**url_kwargs) + if "startup_nodes" in redis_kwargs: + from redis.cluster import ClusterNode + + args = _get_redis_cluster_kwargs() + cluster_kwargs = {} + for arg in redis_kwargs: + if arg in args: + cluster_kwargs[arg] = redis_kwargs[arg] + + new_startup_nodes: List[ClusterNode] = [] + + for item in redis_kwargs["startup_nodes"]: + new_startup_nodes.append(ClusterNode(**item)) + redis_kwargs.pop("startup_nodes") + return async_redis.RedisCluster( + startup_nodes=new_startup_nodes, **cluster_kwargs + ) + return async_redis.Redis( socket_timeout=5, **redis_kwargs, @@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides): connection_class = async_redis.SSLConnection redis_kwargs.pop("ssl", None) redis_kwargs["connection_class"] = connection_class + redis_kwargs.pop("startup_nodes", None) return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs) diff --git a/litellm/caching.py b/litellm/caching.py index 1c72160295..1b19fdf3e5 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -203,6 +203,7 @@ class RedisCache(BaseCache): password=None, redis_flush_size=100, 
namespace: Optional[str] = None, + startup_nodes: Optional[List] = None, # for redis-cluster **kwargs, ): import redis @@ -218,7 +219,8 @@ class RedisCache(BaseCache): redis_kwargs["port"] = port if password is not None: redis_kwargs["password"] = password - + if startup_nodes is not None: + redis_kwargs["startup_nodes"] = startup_nodes ### HEALTH MONITORING OBJECT ### if kwargs.get("service_logger_obj", None) is not None and isinstance( kwargs["service_logger_obj"], ServiceLogging @@ -246,7 +248,7 @@ class RedisCache(BaseCache): ### ASYNC HEALTH PING ### try: # asyncio.get_running_loop().create_task(self.ping()) - result = asyncio.get_running_loop().create_task(self.ping()) + _ = asyncio.get_running_loop().create_task(self.ping()) except Exception as e: if "no running event loop" in str(e): verbose_logger.debug( @@ -2123,6 +2125,7 @@ class Cache: redis_semantic_cache_use_async=False, redis_semantic_cache_embedding_model="text-embedding-ada-002", redis_flush_size=None, + redis_startup_nodes: Optional[List] = None, disk_cache_dir=None, qdrant_api_base: Optional[str] = None, qdrant_api_key: Optional[str] = None, @@ -2155,7 +2158,12 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache( - host, port, password, redis_flush_size, **kwargs + host, + port, + password, + redis_flush_size, + startup_nodes=redis_startup_nodes, + **kwargs, ) elif type == "redis-semantic": self.cache = RedisSemanticCache( diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 96a0242a8e..10d608ec89 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -2,3 +2,10 @@ model_list: - model_name: "*" litellm_params: model: "*" + + +litellm_settings: + cache: True + cache_params: + type: redis + redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index a9d0325d80..8986b587b7 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1583,7 +1583,7 @@ class ProxyConfig: verbose_proxy_logger.debug( # noqa f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}" ) - elif key == "cache" and value == False: + elif key == "cache" and value is False: pass elif key == "guardrails": if premium_user is not True: diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 64196e5c56..5da883f4ae 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -804,6 +804,38 @@ def test_redis_cache_completion_stream(): # test_redis_cache_completion_stream() +# @pytest.mark.skip(reason="Local test. 
Requires running redis cluster locally.") +@pytest.mark.asyncio +async def test_redis_cache_cluster_init_unit_test(): + try: + from redis.asyncio import RedisCluster as AsyncRedisCluster + from redis.cluster import RedisCluster + + from litellm.caching import RedisCache + + litellm.set_verbose = True + + # List of startup nodes + startup_nodes = [ + {"host": "127.0.0.1", "port": "7001"}, + ] + + resp = RedisCache(startup_nodes=startup_nodes) + + assert isinstance(resp.redis_client, RedisCluster) + assert isinstance(resp.init_async_client(), AsyncRedisCluster) + + resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes) + + assert isinstance(resp.cache, RedisCache) + assert isinstance(resp.cache.redis_client, RedisCluster) + assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster) + + except Exception as e: + print(f"{str(e)}\n\n{traceback.format_exc()}") + raise e + + @pytest.mark.asyncio async def test_redis_cache_acompletion_stream(): try: From ea7968e22eb5e7f7d176da5cecf5aa95ff2a64e9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:05:18 -0700 Subject: [PATCH 02/11] test(test_caching.py): skip local test --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 5da883f4ae..e474dff2e4 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -804,7 +804,7 @@ def test_redis_cache_completion_stream(): # test_redis_cache_completion_stream() -# @pytest.mark.skip(reason="Local test. Requires running redis cluster locally.") +@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.") @pytest.mark.asyncio async def test_redis_cache_cluster_init_unit_test(): try: From 45048ee006eccd9b89d271e08355ddeebb5340ea Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:35:10 -0700 Subject: [PATCH 03/11] fix(router.py): fix linting error --- litellm/router.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/router.py b/litellm/router.py index e261c1743d..7a938f5c4e 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -277,7 +277,8 @@ class Router: "local" # default to an in-memory cache ) redis_cache = None - cache_config = {} + cache_config: Dict[str, Any] = {} + self.client_ttl = client_ttl if redis_url is not None or ( redis_host is not None From 8162208a5cdcb66e8707421980ca6f59428e540a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 13:52:03 -0700 Subject: [PATCH 04/11] track api_call_start_time --- litellm/litellm_core_utils/litellm_logging.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index d59f985584..dbf2a7d3e5 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -354,6 +354,8 @@ class Logging: str(e) ) ) + + self.model_call_details["api_call_start_time"] = datetime.datetime.now() # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made callbacks = litellm.input_callback + self.dynamic_input_callbacks for callback in callbacks: From c719c375f70f850816cfeb1b781164da24ba5561 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 13:58:10 -0700 Subject: [PATCH 05/11] track litellm_request_latency_metric --- litellm/integrations/prometheus.py | 38 ++++++++++++++++++++++++++++++ litellm/proxy/proxy_config.yaml | 4 
+++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 1471f59b75..dadafa80e3 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger): ], ) + # request latency metrics + self.litellm_request_latency_metric = Histogram( + "litellm_request_latency_metric", + "Total latency (seconds) for a request to LiteLLM", + labelnames=[ + "model", + "litellm_call_id", + ], + ) + + self.litellm_deployment_latency_metric = Histogram( + "litellm_deployment_latency_metric", + "Total latency (seconds) for a models LLM API call", + labelnames=[ + "model", + "litellm_call_id", + ], + ) + # Counter for spend self.litellm_spend_metric = Counter( "litellm_spend_metric", @@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger): user_api_key, user_api_key_alias, model_group ).set(remaining_tokens) + # latency metrics + total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time") + total_time_seconds = total_time.total_seconds() + api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get( + "api_call_start_time" + ) + + api_call_total_time_seconds = api_call_total_time.total_seconds() + + litellm_call_id = kwargs.get("litellm_call_id") + + self.litellm_request_latency_metric.labels(model, litellm_call_id).observe( + total_time_seconds + ) + + self.litellm_deployment_latency_metric.labels(model, litellm_call_id).observe( + api_call_total_time_seconds + ) + # set x-ratelimit headers if premium_user is True: self.set_llm_deployment_success_metrics( diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 65c7f70525..7c524eb18a 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,7 +4,9 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] guardrails: - guardrail_name: "lakera-pre-guard" litellm_params: From 9476582fb7091d418d8df1ed918d0db7e21c701c Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 14:03:00 -0700 Subject: [PATCH 06/11] update promtheus metric names --- litellm/integrations/prometheus.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index dadafa80e3..659e5b193c 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -61,8 +61,8 @@ class PrometheusLogger(CustomLogger): ) # request latency metrics - self.litellm_request_latency_metric = Histogram( - "litellm_request_latency_metric", + self.litellm_request_total_latency_metric = Histogram( + "litellm_request_total_latency_metric", "Total latency (seconds) for a request to LiteLLM", labelnames=[ "model", @@ -70,8 +70,8 @@ class PrometheusLogger(CustomLogger): ], ) - self.litellm_deployment_latency_metric = Histogram( - "litellm_deployment_latency_metric", + self.litellm_llm_api_latency_metric = Histogram( + "litellm_llm_api_latency_metric", "Total latency (seconds) for a models LLM API call", labelnames=[ "model", @@ -359,11 +359,11 @@ class PrometheusLogger(CustomLogger): litellm_call_id = kwargs.get("litellm_call_id") - self.litellm_request_latency_metric.labels(model, litellm_call_id).observe( - total_time_seconds - ) + self.litellm_request_total_latency_metric.labels( + model, 
litellm_call_id + ).observe(total_time_seconds) - self.litellm_deployment_latency_metric.labels(model, litellm_call_id).observe( + self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe( api_call_total_time_seconds ) From 55d35c2ea12a4e82fd2a2fc416aec1bc3b669e34 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 14:06:14 -0700 Subject: [PATCH 07/11] add prom docs for Request Latency Metrics --- docs/my-website/docs/proxy/prometheus.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 4b913d2e82..10e6456c21 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -68,6 +68,15 @@ http://localhost:4000/metrics | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | +### Request Latency Metrics + +| Metric Name | Description | +|----------------------|--------------------------------------| +| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` | +| `litellm_llm_api_latency_metric` | latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` | + + + ### LLM API / Provider Metrics | Metric Name | Description | From 2568b5536ddf047d89f2a24f561af0beae091824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Thu, 22 Aug 2024 23:21:40 +0200 Subject: [PATCH 08/11] add dbally project --- docs/my-website/docs/projects/dbally.md | 3 +++ docs/my-website/sidebars.js | 1 + 2 files changed, 4 insertions(+) create mode 100644 docs/my-website/docs/projects/dbally.md diff --git a/docs/my-website/docs/projects/dbally.md b/docs/my-website/docs/projects/dbally.md new file mode 100644 index 0000000000..688f1ab0ff --- /dev/null +++ b/docs/my-website/docs/projects/dbally.md @@ -0,0 +1,3 @@ +Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅. 
+ +🔗 [GitHub](https://github.com/deepsense-ai/db-ally) diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 54df1f3e35..368609d4bf 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -261,6 +261,7 @@ const sidebars = { items: [ "projects/Docq.AI", "projects/OpenInterpreter", + "projects/dbally", "projects/FastREPL", "projects/PROMPTMETHEUS", "projects/Codium PR Agent", From 73a5921262d9749072fdb0e1b7b694bf3568ee66 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 16:37:46 -0700 Subject: [PATCH 09/11] feat(auth_checks.py): allow team to call all models, when explicitly set via /* --- litellm/proxy/_new_secret_config.yaml | 9 --------- litellm/proxy/auth/auth_checks.py | 8 ++++++-- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 66530c7db0..96a0242a8e 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -2,12 +2,3 @@ model_list: - model_name: "*" litellm_params: model: "*" - -litellm_settings: - success_callback: ["s3"] - cache: true - s3_callback_params: - s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 \ No newline at end of file diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index cf5065c2e1..0f1452651e 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -66,7 +66,7 @@ def common_checks( raise Exception( f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin." ) - # 2. If user can call model + # 2. 
If team can call model if ( _model is not None and team_object is not None @@ -74,7 +74,11 @@ def common_checks( and _model not in team_object.models ): # this means the team has access to all models on the proxy - if "all-proxy-models" in team_object.models: + if ( + "all-proxy-models" in team_object.models + or "*" in team_object.models + or "openai/*" in team_object.models + ): # this means the team has access to all models on the proxy pass # check if the team model is an access_group From b0706a6f8f4912191d6e28206c613dd8250fa228 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 16:49:52 -0700 Subject: [PATCH 10/11] fix(proxy_server.py): expose flag to disable retries when max parallel request limit is hit --- docs/my-website/docs/proxy/configs.md | 1 + litellm/proxy/proxy_server.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 19c1f7902d..d08d6324da 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -727,6 +727,7 @@ general_settings: "completion_model": "string", "disable_spend_logs": "boolean", # turn off writing each transaction to the db "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint) + "disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached "disable_reset_budget": "boolean", # turn off reset budget scheduled task "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6bd528def8..c793ffbe3f 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -2672,6 +2672,13 @@ def giveup(e): and isinstance(e.message, str) and "Max parallel request limit reached" in e.message ) + + if ( + general_settings.get("disable_retry_on_max_parallel_request_limit_error") + is True + ): + return True # giveup if queuing max parallel request limits is disabled + if result: verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)})) return result From 849cfa9bdee82ad0c85633aa4be608fa973126f9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 17:12:52 -0700 Subject: [PATCH 11/11] docs(configs.md): add global_max_parallel_requests to docs --- docs/my-website/docs/proxy/configs.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index d08d6324da..a50b3f6460 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -752,7 +752,8 @@ general_settings: }, "otel": true, "custom_auth": "string", - "max_parallel_requests": 0, + "max_parallel_requests": 0, # the max parallel requests allowed per deployment + "global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up "infer_model_from_keys": true, "background_health_checks": true, "health_check_interval": 300,
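As a quick reference for the caching change that headlines this series (PATCH 01), the sketch below shows how the new Redis-cluster support could be exercised from the Python SDK. It mirrors the unit test added in `litellm/tests/test_caching.py`: the startup node address is the same placeholder the test uses (`127.0.0.1:7001`), while the model name and prompt are illustrative assumptions, not part of the patches.

```python
import litellm

# Point the global LiteLLM cache at a Redis cluster by passing startup nodes,
# as introduced in PATCH 01 (Cache -> RedisCache -> redis.RedisCluster).
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],  # placeholder node, same as the unit test
)

# With litellm.cache set, completion calls can read/write through the cluster client.
response = litellm.completion(
    model="gpt-3.5-turbo",  # illustrative model name, not taken from the patches
    messages=[{"role": "user", "content": "Hello from a redis-cluster-backed cache"}],
    caching=True,
)
```

The proxy-side equivalent is already shown in PATCH 01's `_new_secret_config.yaml` hunk: enable `cache: True` and pass `redis_startup_nodes` under `cache_params`.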