diff --git a/docs/my-website/docs/projects/dbally.md b/docs/my-website/docs/projects/dbally.md
new file mode 100644
index 0000000000..688f1ab0ff
--- /dev/null
+++ b/docs/my-website/docs/projects/dbally.md
@@ -0,0 +1,3 @@
+An efficient, consistent, and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
+
+🔗 [GitHub](https://github.com/deepsense-ai/db-ally)
diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 19c1f7902d..a50b3f6460 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -727,6 +727,7 @@ general_settings:
   "completion_model": "string",
   "disable_spend_logs": "boolean", # turn off writing each transaction to the db
   "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
+  "disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when the max parallel request limit is reached
   "disable_reset_budget": "boolean", # turn off reset budget scheduled task
   "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
   "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
@@ -751,7 +752,8 @@ general_settings:
   },
   "otel": true,
   "custom_auth": "string",
-  "max_parallel_requests": 0,
+  "max_parallel_requests": 0, # the max parallel requests allowed per deployment
+  "global_max_parallel_requests": 0, # the max parallel requests allowed across the proxy as a whole
   "infer_model_from_keys": true,
   "background_health_checks": true,
   "health_check_interval": 300,
diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md
index 4b913d2e82..10e6456c21 100644
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
 
+### Request Latency Metrics
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to the LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
+
+
 ### LLM API / Provider Metrics
 
 | Metric Name | Description |
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index b907a11304..4bb4125e0e 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -292,6 +292,7 @@ const sidebars = {
       items: [
         "projects/Docq.AI",
         "projects/OpenInterpreter",
+        "projects/dbally",
        "projects/FastREPL",
         "projects/PROMPTMETHEUS",
         "projects/Codium PR Agent",
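For readers of the two doc changes above, a minimal sketch (not part of the patch) of how the newly documented latency metrics could be read back from a running proxy; it assumes the proxy is listening on the default http://localhost:4000 with the Prometheus callback enabled.

import requests

# Fetch the Prometheus exposition text from the proxy and keep only the new latency histograms.
resp = requests.get("http://localhost:4000/metrics", timeout=10)
resp.raise_for_status()

for line in resp.text.splitlines():
    if line.startswith(
        ("litellm_request_total_latency_metric", "litellm_llm_api_latency_metric")
    ):
        print(line)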
diff --git a/litellm/_redis.py b/litellm/_redis.py
index d72016dcd9..23f82ed1a7 100644
--- a/litellm/_redis.py
+++ b/litellm/_redis.py
@@ -7,13 +7,17 @@
 #
 #  Thank you users! We ❤️ you! - Krrish & Ishaan
+import inspect
+
 # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
 import os
-import inspect
-import redis, litellm  # type: ignore
-import redis.asyncio as async_redis  # type: ignore
 from typing import List, Optional
+
+import redis  # type: ignore
+import redis.asyncio as async_redis  # type: ignore
+
+import litellm
+
 
 def _get_redis_kwargs():
     arg_spec = inspect.getfullargspec(redis.Redis)
@@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
 
     return available_args
 
 
+def _get_redis_cluster_kwargs(client=None):
+    if client is None:
+        client = redis.Redis.from_url
+    arg_spec = inspect.getfullargspec(redis.RedisCluster)
+
+    # Only allow primitive arguments
+    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
+
+    available_args = [x for x in arg_spec.args if x not in exclude_args]
+
+    return available_args
+
+
 def _get_redis_env_kwarg_mapping():
     PREFIX = "REDIS_"
 
@@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
                 url_kwargs[arg] = redis_kwargs[arg]
 
         return redis.Redis.from_url(**url_kwargs)
+
+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)
 
     return redis.Redis(**redis_kwargs)
 
@@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
         )
         return async_redis.Redis.from_url(**url_kwargs)
 
+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return async_redis.RedisCluster(
+            startup_nodes=new_startup_nodes, **cluster_kwargs
+        )
+
     return async_redis.Redis(
         socket_timeout=5,
         **redis_kwargs,
@@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
         connection_class = async_redis.SSLConnection
     redis_kwargs.pop("ssl", None)
     redis_kwargs["connection_class"] = connection_class
+    redis_kwargs.pop("startup_nodes", None)
     return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
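The cluster branches above ultimately hand redis-py a list of ClusterNode objects. A standalone sketch of that pattern, assuming a Redis Cluster node is reachable at 127.0.0.1:7001 (host and port are illustrative):

from redis.cluster import ClusterNode, RedisCluster

# Build ClusterNode objects from plain dicts, the same shape accepted via startup_nodes above.
startup_nodes = [{"host": "127.0.0.1", "port": 7001}]
nodes = [ClusterNode(**n) for n in startup_nodes]

rc = RedisCluster(startup_nodes=nodes)
rc.set("litellm-cluster-smoke-test", "ok")
print(rc.get("litellm-cluster-smoke-test"))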
diff --git a/litellm/caching.py b/litellm/caching.py
index 1c72160295..1b19fdf3e5 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -203,6 +203,7 @@ class RedisCache(BaseCache):
         password=None,
         redis_flush_size=100,
         namespace: Optional[str] = None,
+        startup_nodes: Optional[List] = None,  # for redis-cluster
         **kwargs,
     ):
         import redis
@@ -218,7 +219,8 @@ class RedisCache(BaseCache):
             redis_kwargs["port"] = port
         if password is not None:
             redis_kwargs["password"] = password
-
+        if startup_nodes is not None:
+            redis_kwargs["startup_nodes"] = startup_nodes
         ### HEALTH MONITORING OBJECT ###
         if kwargs.get("service_logger_obj", None) is not None and isinstance(
             kwargs["service_logger_obj"], ServiceLogging
@@ -246,7 +248,7 @@ class RedisCache(BaseCache):
         ### ASYNC HEALTH PING ###
         try:
             # asyncio.get_running_loop().create_task(self.ping())
-            result = asyncio.get_running_loop().create_task(self.ping())
+            _ = asyncio.get_running_loop().create_task(self.ping())
         except Exception as e:
             if "no running event loop" in str(e):
                 verbose_logger.debug(
@@ -2123,6 +2125,7 @@ class Cache:
         redis_semantic_cache_use_async=False,
         redis_semantic_cache_embedding_model="text-embedding-ada-002",
         redis_flush_size=None,
+        redis_startup_nodes: Optional[List] = None,
         disk_cache_dir=None,
         qdrant_api_base: Optional[str] = None,
         qdrant_api_key: Optional[str] = None,
@@ -2155,7 +2158,12 @@ class Cache:
         """
         if type == "redis":
             self.cache: BaseCache = RedisCache(
-                host, port, password, redis_flush_size, **kwargs
+                host,
+                port,
+                password,
+                redis_flush_size,
+                startup_nodes=redis_startup_nodes,
+                **kwargs,
             )
         elif type == "redis-semantic":
             self.cache = RedisSemanticCache(
diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 1471f59b75..659e5b193c 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
             ],
         )
 
+        # request latency metrics
+        self.litellm_request_total_latency_metric = Histogram(
+            "litellm_request_total_latency_metric",
+            "Total latency (seconds) for a request to LiteLLM",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
+        self.litellm_llm_api_latency_metric = Histogram(
+            "litellm_llm_api_latency_metric",
+            "Total latency (seconds) for a model's LLM API call",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
         # Counter for spend
         self.litellm_spend_metric = Counter(
             "litellm_spend_metric",
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
                 user_api_key, user_api_key_alias, model_group
             ).set(remaining_tokens)
 
+            # latency metrics
+            total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+            total_time_seconds = total_time.total_seconds()
+            api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+                "api_call_start_time"
+            )
+
+            api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+            litellm_call_id = kwargs.get("litellm_call_id")
+
+            self.litellm_request_total_latency_metric.labels(
+                model, litellm_call_id
+            ).observe(total_time_seconds)
+
+            self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+                api_call_total_time_seconds
+            )
+
             # set x-ratelimit headers
             if premium_user is True:
                 self.set_llm_deployment_success_metrics(
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index d59f985584..dbf2a7d3e5 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -354,6 +354,8 @@ class Logging:
                         str(e)
                     )
                 )
+
+        self.model_call_details["api_call_start_time"] = datetime.datetime.now()
         # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
         callbacks = litellm.input_callback + self.dynamic_input_callbacks
         for callback in callbacks:
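A self-contained sketch of the latency bookkeeping introduced above, using prometheus_client directly; the metric name carries an example_ prefix and the timings are synthetic, so nothing here is claimed to be LiteLLM's exact runtime behaviour.

import datetime
import time

from prometheus_client import Histogram

# Histogram with the same label shape as the new metrics in this change.
example_llm_api_latency = Histogram(
    "example_llm_api_latency_metric",
    "Latency (seconds) for just the LLM API call",
    labelnames=["model", "litellm_call_id"],
)

api_call_start_time = datetime.datetime.now()
time.sleep(0.05)  # stand-in for the provider call
end_time = datetime.datetime.now()

api_call_seconds = (end_time - api_call_start_time).total_seconds()
example_llm_api_latency.labels("gpt-3.5-turbo", "example-call-id").observe(api_call_seconds)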
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 2c888a4f30..96a0242a8e 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -2,12 +2,3 @@ model_list:
   - model_name: "*"
     litellm_params:
       model: "*"
-
-litellm_settings:
-  success_callback: ["s3"]
-  cache: true
-  s3_callback_params:
-    s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
-    s3_region_name: us-west-2 # AWS Region Name for S3
-    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3
-    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py
index cf5065c2e1..0f1452651e 100644
--- a/litellm/proxy/auth/auth_checks.py
+++ b/litellm/proxy/auth/auth_checks.py
@@ -66,7 +66,7 @@ def common_checks(
         raise Exception(
             f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
         )
-    # 2. If user can call model
+    # 2. If team can call model
     if (
         _model is not None
         and team_object is not None
@@ -74,7 +74,11 @@ def common_checks(
         and _model not in team_object.models
     ):
         # this means the team has access to all models on the proxy
-        if "all-proxy-models" in team_object.models:
+        if (
+            "all-proxy-models" in team_object.models
+            or "*" in team_object.models
+            or "openai/*" in team_object.models
+        ):
             # this means the team has access to all models on the proxy
             pass
         # check if the team model is an access_group
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 6b831876f0..5f4072434f 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -1,8 +1,9 @@
 model_list:
   - model_name: fake-openai-endpoint
     litellm_params:
-      model: openai/gpt-4
-      api_key: os.environ/OPENAI_API_KEY
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
 guardrails:
   - guardrail_name: "bedrock-pre-guard"
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 0a9abc09ad..c793ffbe3f 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1588,7 +1588,7 @@ class ProxyConfig:
                     verbose_proxy_logger.debug(  # noqa
                         f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                     )
-                elif key == "cache" and value == False:
+                elif key == "cache" and value is False:
                     pass
                 elif key == "guardrails":
                     if premium_user is not True:
@@ -2672,6 +2672,13 @@ def giveup(e):
         and isinstance(e.message, str)
         and "Max parallel request limit reached" in e.message
     )
+
+    if (
+        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
+        is True
+    ):
+        return True  # giveup if queuing max parallel request limits is disabled
+
     if result:
         verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))
     return result
diff --git a/litellm/router.py b/litellm/router.py
index e261c1743d..7a938f5c4e 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -277,7 +277,8 @@ class Router:
             "local"  # default to an in-memory cache
         )
         redis_cache = None
-        cache_config = {}
+        cache_config: Dict[str, Any] = {}
+
         self.client_ttl = client_ttl
         if redis_url is not None or (
             redis_host is not None
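For context before the tests, a usage sketch for the new redis_startup_nodes option at the SDK level; it mirrors the unit test below and assumes a reachable Redis Cluster node at 127.0.0.1:7001 plus a valid OPENAI_API_KEY in the environment.

import litellm
from litellm import completion

# Point LiteLLM's cache at a Redis Cluster instead of a single Redis node.
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
)

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    caching=True,
)
print(response.choices[0].message.content)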
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 64196e5c56..e474dff2e4 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
 # test_redis_cache_completion_stream()
 
 
+@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
+@pytest.mark.asyncio
+async def test_redis_cache_cluster_init_unit_test():
+    try:
+        from redis.asyncio import RedisCluster as AsyncRedisCluster
+        from redis.cluster import RedisCluster
+
+        from litellm.caching import RedisCache
+
+        litellm.set_verbose = True
+
+        # List of startup nodes
+        startup_nodes = [
+            {"host": "127.0.0.1", "port": "7001"},
+        ]
+
+        resp = RedisCache(startup_nodes=startup_nodes)
+
+        assert isinstance(resp.redis_client, RedisCluster)
+        assert isinstance(resp.init_async_client(), AsyncRedisCluster)
+
+        resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)
+
+        assert isinstance(resp.cache, RedisCache)
+        assert isinstance(resp.cache.redis_client, RedisCluster)
+        assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
+
+    except Exception as e:
+        print(f"{str(e)}\n\n{traceback.format_exc()}")
+        raise e
+
+
 @pytest.mark.asyncio
 async def test_redis_cache_acompletion_stream():
     try: