docs(routing.md): updating docs for managing multiple deployments

Krrish Dholakia 2023-11-18 19:02:50 -08:00
parent d8079907b9
commit 9e88014ff4
2 changed files with 203 additions and 16 deletions


@@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Manage Multiple Deployments
@@ -17,15 +20,15 @@ In production, [Router connects to a Redis Cache](#redis-queue) to track usage a
from litellm import Router
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # openai model name
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo", # openai model name
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
@@ -33,7 +36,7 @@ model_list = [{ # list of model deployments
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo", # openai model name
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
@@ -43,28 +46,212 @@ model_list = [{ # list of model deployments
router = Router(model_list=model_list)
# openai.ChatCompletion.create replacement
- response = router.completion(model="gpt-3.5-turbo",
+ response = await router.acompletion(model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}])
print(response)
```
### Available Endpoints
- `router.completion()` - chat completions endpoint to call 100+ LLMs
- `router.acompletion()` - async chat completion calls
- `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `router.aembeddings()` - async embeddings endpoint
- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
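Here's a minimal sketch of calling the sync and async endpoints together (assuming the `model_list` from the Quick Start above; the async call is wrapped in `asyncio.run` since top-level `await` only works inside an event loop, and the embeddings call assumes you've added an embedding deployment to your `model_list`):
```python
import asyncio
from litellm import Router

router = Router(model_list=model_list) # model_list from the Quick Start above

# sync chat completion
response = router.completion(model="gpt-3.5-turbo",
                             messages=[{"role": "user", "content": "Hello!"}])

# async chat completion
async def main():
    return await router.acompletion(model="gpt-3.5-turbo",
                                    messages=[{"role": "user", "content": "Hello!"}])

print(asyncio.run(main()))

# embeddings - assumes an embedding deployment (e.g. "text-embedding-ada-002")
# is present in model_list
embedding_response = router.embeddings(model="text-embedding-ada-002",
                                       input=["good morning from litellm"])
```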
### Routing Strategies
Router provides 2 strategies for routing your calls across multiple deployments:
<Tabs>
<TabItem value="simple-shuffle" label="Simple Shuffle">
**Default.** Randomly picks a deployment to route the call to.
```python
from litellm import Router
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
}
}]
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
response = await router.acompletion(model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}])
print(response)
```
</TabItem>
<TabItem value="usage-based" label="Rate-Limit Aware">
This routes to the deployment with the lowest TPM usage for the current minute.
In production, we use Redis to track usage (TPM/RPM) across multiple deployments.
If you pass in each deployment's tpm/rpm limits, the router also checks against those and filters out any deployment whose limits would be exceeded.
For Azure, your RPM = TPM/6 (e.g., a deployment with a 60,000 TPM limit has an implied 10,000 RPM limit).
```python
from litellm import Router
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 100000,
"rpm": 10000,
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 100000,
"rpm": 1000,
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 100000,
"rpm": 1000,
}]
router = Router(model_list=model_list,
redis_host=os.environ["REDIS_HOST"],
redis_password=os.environ["REDIS_PASSWORD"],
redis_port=os.environ["REDIS_PORT"],
routing_strategy="usage-based-routing")
response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}])
print(response)
```
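Conceptually, the per-request selection works something like the sketch below (hypothetical helper, not litellm's actual implementation): look up each deployment's usage for the current minute, drop any deployment whose TPM/RPM limits the call would exceed, then route to the lowest-usage deployment that remains.
```python
# Illustrative sketch of rate-limit-aware selection (not litellm's actual code).
# `usage` maps a deployment's model name to its (tpm_used, rpm_used) counters
# for the current minute; in production these counters live in Redis.
def pick_deployment(deployments, usage, tokens_needed):
    candidates = []
    for d in deployments:
        tpm_used, rpm_used = usage.get(d["litellm_params"]["model"], (0, 0))
        # filter out deployments whose limits this call would exceed
        if tpm_used + tokens_needed > d["tpm"] or rpm_used + 1 > d["rpm"]:
            continue
        candidates.append((tpm_used, d))
    if not candidates:
        raise RuntimeError("all deployments are over their limits")
    # route to the deployment with the lowest TPM usage this minute
    return min(candidates, key=lambda c: c[0])[1]
```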
### Redis Queue
In production, we use Redis to track usage across multiple Azure deployments.
</TabItem>
</Tabs>
### Caching + Request Timeouts
In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
**In-memory Cache + Timeouts**
```python
router = Router(model_list=model_list,
                cache_responses=True,
                timeout=30) # timeout set to 30s
```
**Redis Cache + Timeouts**
```python
router = Router(model_list=model_list,
redis_host=os.getenv("REDIS_HOST"),
redis_password=os.getenv("REDIS_PASSWORD"),
redis_port=os.getenv("REDIS_PORT"),
cache_responses=True,
timeout=30)
```
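With `cache_responses=True`, repeating an identical request should be served from the cache instead of triggering a second API call (a rough sketch; the exact cache-key behavior is an assumption):
```python
messages = [{"role": "user", "content": "Hey, how's it going?"}]

response_1 = router.completion(model="gpt-3.5-turbo", messages=messages) # hits the API
response_2 = router.completion(model="gpt-3.5-turbo", messages=messages) # same request - served from cache
```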
### Retry failed requests
For both async and sync functions, we support retrying failed requests:
- If it's a `RateLimitError`, we back off exponentially before retrying
- If it's a generic OpenAI API error, we retry immediately
- For any other exception type, we don't retry
Here's a quick look at how we can set `num_retries = 3`:
```python
from litellm import Router
router = Router(model_list=model_list,
cache_responses=True,
timeout=30,
num_retries=3)
user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]
# normal call
response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
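The same retry behavior applies on the async path; a minimal sketch (assuming `acompletion` honors `num_retries` the same way):
```python
import asyncio

async def main():
    # async call with the same retry behavior as the sync example above
    return await router.acompletion(model="gpt-3.5-turbo", messages=messages)

response = asyncio.run(main())
print(f"response: {response}")
```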
### Default litellm.completion/embedding params
You can also set default params for litellm completion/embedding calls. Here's how to do that:
```python
from litellm import Router
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
router = Router(model_list=model_list,
default_litellm_params={"context_window_fallback_dict": fallback_dict})
user_message = "Hello, what's the weather in San Francisco?"
messages = [{"content": user_message, "role": "user"}]
# normal call
response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
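With the fallback dict above, a prompt that overflows gpt-3.5-turbo's context window should get retried on gpt-3.5-turbo-16k. A sketch (the oversized prompt is just for illustration):
```python
# a deliberately long prompt, sized to overflow gpt-3.5-turbo's context window
long_message = "how does a court case get to the Supreme Court? " * 500
response = router.completion(model="gpt-3.5-turbo",
                             messages=[{"content": long_message, "role": "user"}])
# expected: retried on gpt-3.5-turbo-16k per context_window_fallback_dict
```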
### Deploy Router
- If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md)
+ If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)
## litellm.completion()
If you're calling litellm.completion(), here are the different reliability options you can enable.
## Retry failed requests
@@ -103,7 +290,7 @@ from litellm import completion
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
- completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+ completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```
### Fallbacks - Switch Models/API Keys/API Bases


@@ -11,7 +11,7 @@ class Router:
Example usage:
from litellm import Router
model_list = [{
"model_name": "gpt-3.5-turbo", # openai model name
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/<your-deployment-name>",
"api_key": <your-api-key>,
@@ -47,9 +47,9 @@ class Router:
self.chat = litellm.Chat(params=default_litellm_params)
-        self.default_litellm_params = {
-            "timeout": timeout
-        }
+        self.default_litellm_params = default_litellm_params
+        self.default_litellm_params["timeout"] = timeout
self.routing_strategy = routing_strategy
### HEALTH CHECK THREAD ###
if self.routing_strategy == "least-busy":