From 886c859519c8bd8eddcdeb51c9946036a4042043 Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:20:59 -0700 Subject: [PATCH 1/3] doc pre_call_check: enables router rate limits for concurrent calls --- docs/my-website/docs/routing.md | 327 ++++++++++++++++---------------- 1 file changed, 168 insertions(+), 159 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 404c72e44..3fda19094 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -28,40 +28,40 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python from litellm import Router -model_list = [{ # list of model deployments +model_list = [{ # list of model deployments "model_name": "gpt-3.5-turbo", # model alias -> loadbalance between models with same `model_name` - "litellm_params": { # params for litellm completion/embedding call + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/gpt-4", "api_key": os.getenv("AZURE_API_KEY"), "api_base": os.getenv("AZURE_API_BASE"), "api_version": os.getenv("AZURE_API_VERSION"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY"), } }, @@ -72,14 +72,14 @@ router = Router(model_list=model_list) # openai.ChatCompletion.create replacement # requests with model="gpt-3.5-turbo" will pick a deployment where model_name="gpt-3.5-turbo" -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) # openai.ChatCompletion.create replacement # requests with model="gpt-4" will pick a deployment where model_name="gpt-4" -response = await router.acompletion(model="gpt-4", +response = await router.acompletion(model="gpt-4", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) @@ -98,7 +98,7 @@ print(response) ### Advanced #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based -Router provides 4 strategies for routing your calls across multiple deployments: +Router provides 4 strategies for routing your calls across multiple deployments: 
@@ -111,13 +111,16 @@ It caches, and updates the response times for deployments based on when a reques [**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ ... }] # init router -router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy +router = Router(model_list=model_list, + routing_strategy="latency-based-routing",# 👈 set routing strategy + enable_pre_call_check=True, # enables router rate limits for concurrent calls + ) ## CALL 1+2 tasks = [] @@ -128,7 +131,7 @@ for _ in range(2): response = await asyncio.gather(*tasks) if response is not None: - ## CALL 3 + ## CALL 3 await asyncio.sleep(1) # let the cache update happen picked_deployment = router.lowestlatency_logger.get_available_deployments( model_group=model, healthy_deployments=router.healthy_deployments @@ -142,12 +145,12 @@ if response is not None: ) ``` -### Set Time Window +### Set Time Window -Set time window for how far back to consider when averaging latency for a deployment. +Set time window for how far back to consider when averaging latency for a deployment. **In Router** -```python +```python router = Router(..., routing_strategy_args={"ttl": 10}) ``` @@ -166,12 +169,12 @@ router_settings: If `rpm` or `tpm` is not provided, it randomly picks a deployment ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -179,18 +182,18 @@ model_list = [{ # list of model deployments "rpm": 900, # requests per minute for this API } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), "rpm": 10, } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), "rpm": 10, } @@ -200,7 +203,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="simple-shuffle") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -211,33 +214,33 @@ asyncio.run(router_acompletion()) -This will route to the deployment with the lowest TPM usage for that minute. +This will route to the deployment with the lowest TPM usage for that minute. -In production, we use Redis to track usage (TPM/RPM) across multiple deployments. 
+In production, we use Redis to track usage (TPM/RPM) across multiple deployments. -If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. +If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. -For Azure, your RPM = TPM/6. +For Azure, your RPM = TPM/6. ```python -from litellm import Router +from litellm import Router -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") - }, + }, "tpm": 100000, "rpm": 10000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -245,22 +248,24 @@ model_list = [{ # list of model deployments "tpm": 100000, "rpm": 1000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 100000, "rpm": 1000, }] -router = Router(model_list=model_list, - redis_host=os.environ["REDIS_HOST"], - redis_password=os.environ["REDIS_PASSWORD"], - redis_port=os.environ["REDIS_PORT"], - routing_strategy="usage-based-routing") +router = Router(model_list=model_list, + redis_host=os.environ["REDIS_HOST"], + redis_password=os.environ["REDIS_PASSWORD"], + redis_port=os.environ["REDIS_PORT"], + routing_strategy="usage-based-routing", + enable_pre_call_check=True, # enables router rate limits for concurrent calls + ) -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] print(response) @@ -276,29 +281,29 @@ Picks a deployment with the least number of ongoing calls, it's handling. 
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }] @@ -307,7 +312,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -322,18 +327,18 @@ asyncio.run(router_acompletion()) ## Basic Reliability -### Timeouts +### Timeouts -The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. +The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. **Global Timeouts** ```python -from litellm import Router +from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - timeout=30) # raise timeout error if call takes > 30s +router = Router(model_list=model_list, + timeout=30) # raise timeout error if call takes > 30s print(response) ``` @@ -341,7 +346,7 @@ print(response) **Timeouts per model** ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ @@ -360,7 +365,7 @@ model_list = [{ router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -370,20 +375,20 @@ asyncio.run(router_acompletion()) ``` ### Cooldowns -Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. +Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. ```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - allowed_fails=1) # cooldown model if it fails > 1 call in a minute. +router = Router(model_list=model_list, + allowed_fails=1) # cooldown model if it fails > 1 call in a minute. user_message = "Hello, whats the weather in San Francisco??" 
messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") @@ -392,55 +397,55 @@ print(f"response: {response}") ### Retries -For both async + sync functions, we support retrying failed requests. +For both async + sync functions, we support retrying failed requests. -For RateLimitError we implement exponential backoffs +For RateLimitError we implement exponential backoffs -For generic errors, we retry immediately +For generic errors, we retry immediately -Here's a quick look at how we can set `num_retries = 3`: +Here's a quick look at how we can set `num_retries = 3`: -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. +We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3, retry_after=5) # waits min 5s before retrying request user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -### Fallbacks +### Fallbacks -If a call fails after num_retries, fall back to another model group. +If a call fails after num_retries, fall back to another model group. -If the error is a context window exceeded error, fall back to a larger model group (if given). +If the error is a context window exceeded error, fall back to a larger model group (if given). Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. 
@@ -448,52 +453,52 @@ Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'g from litellm import Router model_list = [ - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + }, + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, "rpm": 9000 }, { - "model_name": "gpt-3.5-turbo-16k", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo-16k", + "model_name": "gpt-3.5-turbo-16k", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo-16k", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, @@ -502,8 +507,8 @@ model_list = [ ] -router = Router(model_list=model_list, - fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], +router = Router(model_list=model_list, + fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}], set_verbose=True) @@ -511,7 +516,7 @@ router = Router(model_list=model_list, user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal fallback call +# normal fallback call response = router.completion(model="azure/gpt-3.5-turbo", messages=messages) # context window fallback call @@ -522,12 +527,12 @@ print(f"response: {response}") ### Caching -In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. +In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. 
**In-memory Cache** ```python -router = Router(model_list=model_list, +router = Router(model_list=model_list, cache_responses=True) print(response) @@ -535,19 +540,19 @@ print(response) **Redis Cache** ```python -router = Router(model_list=model_list, - redis_host=os.getenv("REDIS_HOST"), - redis_password=os.getenv("REDIS_PASSWORD"), +router = Router(model_list=model_list, + redis_host=os.getenv("REDIS_HOST"), + redis_password=os.getenv("REDIS_PASSWORD"), redis_port=os.getenv("REDIS_PORT"), cache_responses=True) print(response) ``` -**Pass in Redis URL, additional kwargs** -```python +**Pass in Redis URL, additional kwargs** +```python router = Router(model_list: Optional[list] = None, - ## CACHING ## + ## CACHING ## redis_url=os.getenv("REDIS_URL")", cache_kwargs= {}, # additional kwargs to pass to RedisCache (see caching.py) cache_responses=True) @@ -555,14 +560,18 @@ router = Router(model_list: Optional[list] = None, ## Pre-Call Checks (Context Window) -Enable pre-call checks to filter out deployments with context window limit < messages for a call. +Enable pre-call checks to filter out: +1. deployments with context window limit < messages for a call. +2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[ + router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages + ])`) **1. Enable pre-call checks** -```python -from litellm import Router +```python +from litellm import Router # ... router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True ``` @@ -570,7 +579,7 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t **2. Set Model List** -For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. +For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. @@ -598,7 +607,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) ``` @@ -627,7 +636,7 @@ model_list = [ }, }, { - "model_name": "claude-opus", + "model_name": "claude-opus", "litellm_params": { call "model": "claude-3-opus-20240229", "api_key": os.getenv("ANTHROPIC_API_KEY"), @@ -635,7 +644,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) +router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) ``` @@ -664,7 +673,7 @@ model_list = [ "api_base": os.getenv("AZURE_API_BASE"), }, "model_info": { - "base_model": "azure/gpt-35-turbo", + "base_model": "azure/gpt-35-turbo", } }, { @@ -676,7 +685,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) text = "What is the meaning of 42?" * 5000 @@ -701,11 +710,11 @@ Go [here](./proxy/reliability.md#advanced---context-window-fallbacks) for how to ## Caching across model groups -If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
+If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. ```python import litellm, asyncio, time -from litellm import Router +from litellm import Router # set os env os.environ["OPENAI_API_KEY"] = "" @@ -713,8 +722,8 @@ os.environ["AZURE_API_KEY"] = "" os.environ["AZURE_API_BASE"] = "" os.environ["AZURE_API_VERSION"] = "" -async def test_acompletion_caching_on_router_caching_groups(): - # tests acompletion + caching on router +async def test_acompletion_caching_on_router_caching_groups(): + # tests acompletion + caching on router try: litellm.set_verbose = True model_list = [ @@ -740,8 +749,8 @@ async def test_acompletion_caching_on_router_caching_groups(): {"role": "user", "content": f"write a one sentence poem {time.time()}?"} ] start_time = time.time() - router = Router(model_list=model_list, - cache_responses=True, + router = Router(model_list=model_list, + cache_responses=True, caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]) response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1) print(f"response1: {response1}") @@ -768,9 +777,9 @@ Step 1. Router Setup from litellm import Router model_list = [ - { # list of model deployments - "model_name": "gpt-4-preview", # model alias - "litellm_params": { # params for litellm completion/embedding call + { # list of model deployments + "model_name": "gpt-4-preview", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -779,11 +788,11 @@ model_list = [ "model_info": { "base_model": "azure/gpt-4-1106-preview" # azure/gpt-4-1106-preview will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json } - }, + }, { - "model_name": "gpt-4-32k", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-4-32k", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -804,8 +813,8 @@ Step 2. Access `response_cost` in the custom callback, **litellm calculates the import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") response_cost = kwargs.get("response_cost") print("response_cost=", response_cost) @@ -815,7 +824,7 @@ litellm.callbacks = [customHandler] # router completion call response = router.completion( - model="gpt-4-32k", + model="gpt-4-32k", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` @@ -823,28 +832,28 @@ response = router.completion( #### Default litellm.completion/embedding params -You can also set default params for litellm completion/embedding calls. Here's how to do that: +You can also set default params for litellm completion/embedding calls. 
Here's how to do that: -```python +```python from litellm import Router fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} -router = Router(model_list=model_list, +router = Router(model_list=model_list, default_litellm_params={"context_window_fallback_dict": fallback_dict}) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -## Custom Callbacks - Track API Key, API Endpoint, Model Used +## Custom Callbacks - Track API Key, API Endpoint, Model Used -If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) +If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) ### Usage @@ -852,8 +861,8 @@ If you need to track the api_key, api endpoint, model, custom_llm_provider used import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") print("kwargs=", kwargs) litellm_params= kwargs.get("litellm_params") @@ -868,7 +877,7 @@ class MyCustomHandler(CustomLogger): print("custom_llm_provider=", custom_llm_provider) print("response_cost=", response_cost) - def log_failure_event(self, kwargs, response_obj, start_time, end_time): + def log_failure_event(self, kwargs, response_obj, start_time, end_time): print(f"On Failure") print("kwargs=") @@ -881,12 +890,12 @@ router = Router(model_list=model_list, routing_strategy="simple-shuffle") # router completion call response = router.completion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` -## Deploy Router +## Deploy Router If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model) @@ -896,7 +905,7 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python def __init__( model_list: Optional[list] = None, - + ## CACHING ## redis_url: Optional[str] = None, redis_host: Optional[str] = None, From 6e20bb13b26089728dfd164c5840a38658851ce0 Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:27:38 -0700 Subject: [PATCH 2/3] Revert "doc pre_call_check: enables router rate limits for concurrent calls" This reverts commit 886c859519c8bd8eddcdeb51c9946036a4042043. 
--- docs/my-website/docs/routing.md | 327 ++++++++++++++++---------------- 1 file changed, 159 insertions(+), 168 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 3fda19094..404c72e44 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -28,40 +28,40 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python from litellm import Router -model_list = [{ # list of model deployments +model_list = [{ # list of model deployments "model_name": "gpt-3.5-turbo", # model alias -> loadbalance between models with same `model_name` - "litellm_params": { # params for litellm completion/embedding call + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/gpt-4", "api_key": os.getenv("AZURE_API_KEY"), "api_base": os.getenv("AZURE_API_BASE"), "api_version": os.getenv("AZURE_API_VERSION"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY"), } }, @@ -72,14 +72,14 @@ router = Router(model_list=model_list) # openai.ChatCompletion.create replacement # requests with model="gpt-3.5-turbo" will pick a deployment where model_name="gpt-3.5-turbo" -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) # openai.ChatCompletion.create replacement # requests with model="gpt-4" will pick a deployment where model_name="gpt-4" -response = await router.acompletion(model="gpt-4", +response = await router.acompletion(model="gpt-4", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) @@ -98,7 +98,7 @@ print(response) ### Advanced #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based -Router provides 4 strategies for routing your calls across multiple deployments: +Router provides 4 strategies for routing your calls across multiple deployments: @@ -111,16 +111,13 @@ It caches, and updates the response times for deployments based on when a reques [**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py) 
```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ ... }] # init router -router = Router(model_list=model_list, - routing_strategy="latency-based-routing",# 👈 set routing strategy - enable_pre_call_check=True, # enables router rate limits for concurrent calls - ) +router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy ## CALL 1+2 tasks = [] @@ -131,7 +128,7 @@ for _ in range(2): response = await asyncio.gather(*tasks) if response is not None: - ## CALL 3 + ## CALL 3 await asyncio.sleep(1) # let the cache update happen picked_deployment = router.lowestlatency_logger.get_available_deployments( model_group=model, healthy_deployments=router.healthy_deployments @@ -145,12 +142,12 @@ if response is not None: ) ``` -### Set Time Window +### Set Time Window -Set time window for how far back to consider when averaging latency for a deployment. +Set time window for how far back to consider when averaging latency for a deployment. **In Router** -```python +```python router = Router(..., routing_strategy_args={"ttl": 10}) ``` @@ -169,12 +166,12 @@ router_settings: If `rpm` or `tpm` is not provided, it randomly picks a deployment ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -182,18 +179,18 @@ model_list = [{ # list of model deployments "rpm": 900, # requests per minute for this API } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), "rpm": 10, } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), "rpm": 10, } @@ -203,7 +200,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="simple-shuffle") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -214,33 +211,33 @@ asyncio.run(router_acompletion()) -This will route to the deployment with the lowest TPM usage for that minute. +This will route to the deployment with the lowest TPM usage for that minute. -In production, we use Redis to track usage (TPM/RPM) across multiple deployments. +In production, we use Redis to track usage (TPM/RPM) across multiple deployments. -If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. 
+If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. -For Azure, your RPM = TPM/6. +For Azure, your RPM = TPM/6. ```python -from litellm import Router +from litellm import Router -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") - }, + }, "tpm": 100000, "rpm": 10000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -248,24 +245,22 @@ model_list = [{ # list of model deployments "tpm": 100000, "rpm": 1000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 100000, "rpm": 1000, }] -router = Router(model_list=model_list, - redis_host=os.environ["REDIS_HOST"], - redis_password=os.environ["REDIS_PASSWORD"], - redis_port=os.environ["REDIS_PORT"], - routing_strategy="usage-based-routing", - enable_pre_call_check=True, # enables router rate limits for concurrent calls - ) +router = Router(model_list=model_list, + redis_host=os.environ["REDIS_HOST"], + redis_password=os.environ["REDIS_PASSWORD"], + redis_port=os.environ["REDIS_PORT"], + routing_strategy="usage-based-routing") -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] print(response) @@ -281,29 +276,29 @@ Picks a deployment with the least number of ongoing calls, it's handling. 
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }] @@ -312,7 +307,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -327,18 +322,18 @@ asyncio.run(router_acompletion()) ## Basic Reliability -### Timeouts +### Timeouts -The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. +The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. **Global Timeouts** ```python -from litellm import Router +from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - timeout=30) # raise timeout error if call takes > 30s +router = Router(model_list=model_list, + timeout=30) # raise timeout error if call takes > 30s print(response) ``` @@ -346,7 +341,7 @@ print(response) **Timeouts per model** ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ @@ -365,7 +360,7 @@ model_list = [{ router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -375,20 +370,20 @@ asyncio.run(router_acompletion()) ``` ### Cooldowns -Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. +Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. ```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - allowed_fails=1) # cooldown model if it fails > 1 call in a minute. +router = Router(model_list=model_list, + allowed_fails=1) # cooldown model if it fails > 1 call in a minute. user_message = "Hello, whats the weather in San Francisco??" 
messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") @@ -397,55 +392,55 @@ print(f"response: {response}") ### Retries -For both async + sync functions, we support retrying failed requests. +For both async + sync functions, we support retrying failed requests. -For RateLimitError we implement exponential backoffs +For RateLimitError we implement exponential backoffs -For generic errors, we retry immediately +For generic errors, we retry immediately -Here's a quick look at how we can set `num_retries = 3`: +Here's a quick look at how we can set `num_retries = 3`: -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. +We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3, retry_after=5) # waits min 5s before retrying request user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -### Fallbacks +### Fallbacks -If a call fails after num_retries, fall back to another model group. +If a call fails after num_retries, fall back to another model group. -If the error is a context window exceeded error, fall back to a larger model group (if given). +If the error is a context window exceeded error, fall back to a larger model group (if given). Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. 
@@ -453,52 +448,52 @@ Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'g from litellm import Router model_list = [ - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + }, + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, "rpm": 9000 }, { - "model_name": "gpt-3.5-turbo-16k", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo-16k", + "model_name": "gpt-3.5-turbo-16k", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo-16k", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, @@ -507,8 +502,8 @@ model_list = [ ] -router = Router(model_list=model_list, - fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], +router = Router(model_list=model_list, + fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}], set_verbose=True) @@ -516,7 +511,7 @@ router = Router(model_list=model_list, user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal fallback call +# normal fallback call response = router.completion(model="azure/gpt-3.5-turbo", messages=messages) # context window fallback call @@ -527,12 +522,12 @@ print(f"response: {response}") ### Caching -In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. +In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. 
**In-memory Cache** ```python -router = Router(model_list=model_list, +router = Router(model_list=model_list, cache_responses=True) print(response) @@ -540,19 +535,19 @@ print(response) **Redis Cache** ```python -router = Router(model_list=model_list, - redis_host=os.getenv("REDIS_HOST"), - redis_password=os.getenv("REDIS_PASSWORD"), +router = Router(model_list=model_list, + redis_host=os.getenv("REDIS_HOST"), + redis_password=os.getenv("REDIS_PASSWORD"), redis_port=os.getenv("REDIS_PORT"), cache_responses=True) print(response) ``` -**Pass in Redis URL, additional kwargs** -```python +**Pass in Redis URL, additional kwargs** +```python router = Router(model_list: Optional[list] = None, - ## CACHING ## + ## CACHING ## redis_url=os.getenv("REDIS_URL")", cache_kwargs= {}, # additional kwargs to pass to RedisCache (see caching.py) cache_responses=True) @@ -560,18 +555,14 @@ router = Router(model_list: Optional[list] = None, ## Pre-Call Checks (Context Window) -Enable pre-call checks to filter out: -1. deployments with context window limit < messages for a call. -2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[ - router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages - ])`) +Enable pre-call checks to filter out deployments with context window limit < messages for a call. **1. Enable pre-call checks** -```python -from litellm import Router +```python +from litellm import Router # ... router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True ``` @@ -579,7 +570,7 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t **2. Set Model List** -For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. +For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. @@ -607,7 +598,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) ``` @@ -636,7 +627,7 @@ model_list = [ }, }, { - "model_name": "claude-opus", + "model_name": "claude-opus", "litellm_params": { call "model": "claude-3-opus-20240229", "api_key": os.getenv("ANTHROPIC_API_KEY"), @@ -644,7 +635,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) +router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) ``` @@ -673,7 +664,7 @@ model_list = [ "api_base": os.getenv("AZURE_API_BASE"), }, "model_info": { - "base_model": "azure/gpt-35-turbo", + "base_model": "azure/gpt-35-turbo", } }, { @@ -685,7 +676,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) text = "What is the meaning of 42?" * 5000 @@ -710,11 +701,11 @@ Go [here](./proxy/reliability.md#advanced---context-window-fallbacks) for how to ## Caching across model groups -If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
+If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. ```python import litellm, asyncio, time -from litellm import Router +from litellm import Router # set os env os.environ["OPENAI_API_KEY"] = "" @@ -722,8 +713,8 @@ os.environ["AZURE_API_KEY"] = "" os.environ["AZURE_API_BASE"] = "" os.environ["AZURE_API_VERSION"] = "" -async def test_acompletion_caching_on_router_caching_groups(): - # tests acompletion + caching on router +async def test_acompletion_caching_on_router_caching_groups(): + # tests acompletion + caching on router try: litellm.set_verbose = True model_list = [ @@ -749,8 +740,8 @@ async def test_acompletion_caching_on_router_caching_groups(): {"role": "user", "content": f"write a one sentence poem {time.time()}?"} ] start_time = time.time() - router = Router(model_list=model_list, - cache_responses=True, + router = Router(model_list=model_list, + cache_responses=True, caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]) response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1) print(f"response1: {response1}") @@ -777,9 +768,9 @@ Step 1. Router Setup from litellm import Router model_list = [ - { # list of model deployments - "model_name": "gpt-4-preview", # model alias - "litellm_params": { # params for litellm completion/embedding call + { # list of model deployments + "model_name": "gpt-4-preview", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -788,11 +779,11 @@ model_list = [ "model_info": { "base_model": "azure/gpt-4-1106-preview" # azure/gpt-4-1106-preview will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json } - }, + }, { - "model_name": "gpt-4-32k", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-4-32k", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -813,8 +804,8 @@ Step 2. Access `response_cost` in the custom callback, **litellm calculates the import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") response_cost = kwargs.get("response_cost") print("response_cost=", response_cost) @@ -824,7 +815,7 @@ litellm.callbacks = [customHandler] # router completion call response = router.completion( - model="gpt-4-32k", + model="gpt-4-32k", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` @@ -832,28 +823,28 @@ response = router.completion( #### Default litellm.completion/embedding params -You can also set default params for litellm completion/embedding calls. Here's how to do that: +You can also set default params for litellm completion/embedding calls. 
Here's how to do that: -```python +```python from litellm import Router fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} -router = Router(model_list=model_list, +router = Router(model_list=model_list, default_litellm_params={"context_window_fallback_dict": fallback_dict}) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -## Custom Callbacks - Track API Key, API Endpoint, Model Used +## Custom Callbacks - Track API Key, API Endpoint, Model Used -If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) +If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) ### Usage @@ -861,8 +852,8 @@ If you need to track the api_key, api endpoint, model, custom_llm_provider used import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") print("kwargs=", kwargs) litellm_params= kwargs.get("litellm_params") @@ -877,7 +868,7 @@ class MyCustomHandler(CustomLogger): print("custom_llm_provider=", custom_llm_provider) print("response_cost=", response_cost) - def log_failure_event(self, kwargs, response_obj, start_time, end_time): + def log_failure_event(self, kwargs, response_obj, start_time, end_time): print(f"On Failure") print("kwargs=") @@ -890,12 +881,12 @@ router = Router(model_list=model_list, routing_strategy="simple-shuffle") # router completion call response = router.completion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` -## Deploy Router +## Deploy Router If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model) @@ -905,7 +896,7 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python def __init__( model_list: Optional[list] = None, - + ## CACHING ## redis_url: Optional[str] = None, redis_host: Optional[str] = None, From e96d97d9e58dc26568d2b53afbe2abaa97a2913d Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:31:21 -0700 Subject: [PATCH 3/3] remove formating changes --- docs/my-website/docs/routing.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 404c72e44..85649eb6b 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -117,7 +117,10 @@ import asyncio model_list = [{ ... 
 }] # init router
-router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
+router = Router(model_list=model_list, 
+                routing_strategy="latency-based-routing", # 👈 set routing strategy
+                enable_pre_call_checks=True, # enables router rate limits for concurrent calls
+                )
 
 ## CALL 1+2
 tasks = []
@@ -257,8 +260,9 @@ router = Router(model_list=model_list, 
                redis_host=os.environ["REDIS_HOST"], 
                redis_password=os.environ["REDIS_PASSWORD"], 
                redis_port=os.environ["REDIS_PORT"], 
-               routing_strategy="usage-based-routing")
-                
+               routing_strategy="usage-based-routing",
+               enable_pre_call_checks=True, # enables router rate limits for concurrent calls
+               )
 
 response = await router.acompletion(model="gpt-3.5-turbo", 
 					messages=[{"role": "user", "content": "Hey, how's it going?"}]
@@ -555,7 +559,33 @@ router = Router(model_list: Optional[list] = None, 
 
 ## Pre-Call Checks (Context Window)
 
-Enable pre-call checks to filter out deployments with context window limit < messages for a call.
+Enable pre-call checks to filter out:
+1. deployments whose context window is too small for the messages in a call.
+2. deployments that have exceeded their rate limits when making concurrent calls (e.g. `asyncio.gather(*[
+        router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
+    ])`) - see the example below.
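+
+For example, here's a rough sketch of fanning out concurrent calls once pre-call checks are enabled (set up `model_list` as shown in the steps below):
+
+```python
+import asyncio
+from litellm import Router
+
+model_list = [{...}] # same format as the model_list examples below
+router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
+
+async def main():
+    list_of_messages = [
+        [{"role": "user", "content": f"Hey, how's it going? ({i})"}] for i in range(3)
+    ]
+    # deployments over their tpm/rpm limits are filtered out before each call
+    responses = await asyncio.gather(*[
+        router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
+    ])
+    print(responses)
+
+asyncio.run(main())
+```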