docs(routing.md): make title more clear

Krrish Dholakia 2023-10-18 16:39:05 -07:00
parent 07b6b2f44e
commit 2e5db47ad0
2 changed files with 69 additions and 2 deletions

@@ -58,4 +58,71 @@ response = litellm.completion(
| gpt-3.5-turbo-0301 | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0613 | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-16k | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-16k-0613 | `completion('azure/<your deployment name>', messages)` |
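
Each deployment in the table is called the same way. A minimal sketch (the deployment name here is hypothetical; it assumes your Azure credentials are set via the `AZURE_API_KEY`, `AZURE_API_VERSION`, and `AZURE_API_BASE` environment variables, as in the Quick Start below):

```python
from litellm import completion

# 'my-gpt35-deployment' is a placeholder - use your own Azure deployment name
response = completion(
    model="azure/my-gpt35-deployment",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response)
```
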
## Azure API Load-Balancing

Use this if you're trying to load-balance across multiple Azure/OpenAI deployments.

`Router` prevents failed requests by picking the deployment that is below its rate limit and has used the fewest tokens.

In production, [Router connects to a Redis Cache](#redis-queue) to track usage across multiple deployments.

### Quick Start
```shell
pip install litellm
```

```python
import os
from litellm import Router

model_list = [{ # list of model deployments
    "model_name": "gpt-3.5-turbo", # openai model name
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-v-2",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 240000,
    "rpm": 1800
}, {
    "model_name": "gpt-3.5-turbo", # openai model name
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-functioncalling",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 240000,
    "rpm": 1800
}, {
    "model_name": "gpt-3.5-turbo", # openai model name
    "litellm_params": { # params for litellm completion/embedding call
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
    "tpm": 1000000,
    "rpm": 9000
}]

router = Router(model_list=model_list)

# openai.ChatCompletion.create replacement
response = router.completion(model="gpt-3.5-turbo",
                             messages=[{"role": "user", "content": "Hey, how's it going?"}])

print(response)
```
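
All three deployments above share the `model_name` "gpt-3.5-turbo", so repeated calls are spread across them. A quick way to see this in action (a sketch reusing `router.completion` from above; it assumes the response follows the OpenAI format with a top-level `model` field):

```python
# each call is routed to whichever deployment is under its rate limit
for _ in range(3):
    response = router.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(response["model"])  # shows which underlying model served the request
```
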
### Redis Queue

```python
router = Router(model_list=model_list,
                redis_host=os.getenv("REDIS_HOST"),
                redis_password=os.getenv("REDIS_PASSWORD"),
                redis_port=os.getenv("REDIS_PORT"))
```
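
Requests are then made exactly as in the Quick Start; the only difference is that deployment usage (TPM/RPM) is tracked in Redis, so multiple `Router` instances can share rate-limit state.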

@@ -1,4 +1,4 @@
-# LLM API Load-Balancing
+# Azure API Load-Balancing
 Use this if you're trying to load-balance across multiple Azure/OpenAI deployments.