From 2cc6acdeec281d4bcdf1523f6c3f7ba2715c0a13 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 23 Nov 2023 17:17:11 -0800
Subject: [PATCH] docs(routing.md): add docs on fallbacks, caching, retries, timeouts for router

---
 docs/my-website/docs/routing.md | 147 +++++++++++++++++++++++++++-----
 1 file changed, 124 insertions(+), 23 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index b1f549a584..a8f7b16c1c 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -12,6 +12,13 @@ LiteLLM manages:
 
 In production, litellm supports using Redis to track cooldown servers and usage (for managing tpm/rpm limits).
 
+:::info
+
+If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
+
+:::
+
+
 ## Load Balancing
 (s/o [@paulpierre](https://www.linkedin.com/in/paulpierre/) for his contribution to this implementation)
 [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
@@ -169,41 +176,30 @@ print(response)
 ```
 
 
-#### Caching + Request Timeouts
+## Basic Reliability
 
-In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
+### Timeouts
 
-**In-memory Cache + Timeouts**
+The timeout set on the router applies to the entire call, and is passed down to the underlying completion() call as well.
 
 ```python
+from litellm import Router
+
+model_list = [{...}]
+
 router = Router(model_list=model_list, 
-               cache_responses=True, 
                timeout=30) # timeout set to 30s
 
 print(response)
 ```
 
-**Redis Cache + Timeouts**
-```python
-router = Router(model_list=model_list, 
-               redis_host=os.getenv("REDIS_HOST"), 
-               redis_password=os.getenv("REDIS_PASSWORD"), 
-               redis_port=os.getenv("REDIS_PORT"),
-               cache_responses=True, 
-               timeout=30)
-
-print(response)
-```
-
-#### Retry failed requests
+### Retries
 
 For both async + sync functions, we support retrying failed requests.
 
-If it's a RateLimitError we implement exponential backoffs
+For a RateLimitError, we implement exponential backoff.
 
-If it's a generic OpenAI API Error, we retry immediately
-
-For any other exception types, we don't retry
+For generic errors, we retry immediately.
 
 Here's a quick look at how we can set `num_retries = 3`:
 
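The `num_retries` snippet itself is unchanged by this patch, so it falls outside the hunks shown here. As a rough sketch of what setting `num_retries = 3` on the router looks like (assuming a single OpenAI deployment, reused from the fallbacks example below):

```python
import os
from litellm import Router

# one OpenAI deployment, as in the fallbacks example below
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

router = Router(model_list=model_list,
                num_retries=3)  # retry a failed request up to 3 times

messages = [{"role": "user", "content": "Hello, what's the weather in San Francisco?"}]
response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```

With this in place, a RateLimitError is retried with exponential backoff, while generic errors are retried immediately, as described above.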
@@ -224,6 +220,111 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
 
 print(f"response: {response}")
 ```
 
+### Fallbacks
+
+If a call fails after `num_retries`, fall back to another model group.
+
+If the error is a context window exceeded error, fall back to a larger model group (if given).
+
+```python
+import os
+from litellm import Router
+
+model_list = [
+    { # list of model deployments
+        "model_name": "azure/gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": "bad-key", # deliberately bad key, so this deployment fails and the fallback is used
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    },
+    { # list of model deployments
+        "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": "bad-key",
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    },
+    {
+        "model_name": "azure/gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-functioncalling",
+            "api_key": "bad-key",
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    },
+    {
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+        "tpm": 1000000,
+        "rpm": 9000
+    },
+    {
+        "model_name": "gpt-3.5-turbo-16k", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo-16k",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+        "tpm": 1000000,
+        "rpm": 9000
+    }
+]
+
+
+router = Router(model_list=model_list,
+                fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
+                context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
+                set_verbose=True)
+
+
+user_message = "Hello, what's the weather in San Francisco?"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal fallback call
+response = router.completion(model="azure/gpt-3.5-turbo", messages=messages)
+
+# context window fallback call
+response = router.completion(model="azure/gpt-3.5-turbo-context-fallback", messages=messages)
+
+print(f"response: {response}")
+```
+
+### Caching
+
+In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
+
+**In-memory Cache**
+
+```python
+router = Router(model_list=model_list,
+                cache_responses=True)
+
+print(response)
+```
+
+**Redis Cache**
+```python
+router = Router(model_list=model_list,
+                redis_host=os.getenv("REDIS_HOST"),
+                redis_password=os.getenv("REDIS_PASSWORD"),
+                redis_port=os.getenv("REDIS_PORT"),
+                cache_responses=True)
+
+print(response)
+```
+
 #### Default litellm.completion/embedding params
 
 You can also set default params for litellm completion/embedding calls. Here's how to do that:
@@ -246,9 +347,9 @@ print(f"response: {response}")
 ```
 
-#### Deploy Router
+## Deploy Router
 
-If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)
+If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
 
 
 ## Queuing (Beta)
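Taken together, the reliability features documented in this patch compose on a single `Router`. A minimal sketch, assuming the keyword arguments shown individually above (`timeout`, `num_retries`, `fallbacks`, `context_window_fallbacks`, and the Redis cache settings) can be passed together, and reusing the deployment list from the fallbacks example:

```python
import os
from litellm import Router

model_list = [{...}]  # the deployment list from the fallbacks example above

router = Router(
    model_list=model_list,
    timeout=30,      # whole-call timeout, in seconds
    num_retries=3,   # retry failed requests before falling back
    fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
    context_window_fallbacks=[{"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
    redis_host=os.getenv("REDIS_HOST"),  # Redis for response caching + usage tracking
    redis_password=os.getenv("REDIS_PASSWORD"),
    redis_port=os.getenv("REDIS_PORT"),
    cache_responses=True,
)

messages = [{"role": "user", "content": "Hello, what's the weather in San Francisco?"}]
response = router.completion(model="azure/gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```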