From d0d08b4dce58aa3946a0f11a2fc4f5887cbddcc2 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 1 Jan 2024 08:36:40 +0530
Subject: [PATCH] docs(routing.md): adding latency-based routing to docs

---
 docs/my-website/docs/routing.md | 112 ++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 8c58c10e6..a0e30cf16 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -65,13 +65,16 @@ print(response)
 - `router.completion()` - chat completions endpoint to call 100+ LLMs
 - `router.acompletion()` - async chat completion calls
 - `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
-- `router.aembeddings()` - async embeddings endpoint
+- `router.aembeddings()` - async embeddings calls
 - `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
+- `router.atext_completion()` - async text completion calls
+- `router.image_generation()` - image generation calls in OpenAI `/v1/images/generations` endpoint format
+- `router.aimage_generation()` - async image generation calls
 
 ### Advanced
-#### Routing Strategies - Weighted Pick, Rate Limit Aware
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
 
-Router provides 2 strategies for routing your calls across multiple deployments:
+Router provides 4 strategies for routing your calls across multiple deployments:
 
@@ -172,7 +175,7 @@ router = Router(model_list=model_list,
                 redis_host=os.environ["REDIS_HOST"],
                 redis_password=os.environ["REDIS_PASSWORD"],
                 redis_port=os.environ["REDIS_PORT"],
-                routing_strategy="simple-shuffle")
+                routing_strategy="usage-based-routing")
 
 
 response = await router.acompletion(model="gpt-3.5-turbo",
@@ -182,6 +185,107 @@ print(response)
 ```
 
+
+
+
+
+Picks the deployment with the least number of ongoing calls it's currently handling.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="least-busy")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+
+
+
+
+Picks the deployment with the lowest response time.
+
+It caches and updates response times for each deployment, based on when a request was sent to and a response received from that deployment.
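+
+A minimal sketch of the idea behind this bookkeeping (illustrative only; `record_latency` and `pick_fastest` are hypothetical helpers, not LiteLLM's internals):
+
+```python
+from collections import defaultdict
+
+latency_cache = defaultdict(list)  # hypothetical store: deployment id -> recent response times (seconds)
+
+def record_latency(deployment_id: str, sent_at: float, received_at: float):
+    # keep a small rolling window of observed response times per deployment
+    latency_cache[deployment_id].append(received_at - sent_at)
+    del latency_cache[deployment_id][:-10]
+
+def pick_fastest(deployment_ids: list) -> str:
+    # deployments with no history average to 0, so they get tried first
+    return min(deployment_ids, key=lambda d: sum(latency_cache[d]) / max(len(latency_cache[d]), 1))
+```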
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+
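+The `least-busy` strategy above can be pictured the same way; a minimal sketch of the in-flight call tracking it implies (illustrative only; `acquire` and `release` are hypothetical helpers, not LiteLLM's internals):
+
+```python
+from collections import defaultdict
+
+inflight = defaultdict(int)  # hypothetical counter: deployment id -> number of ongoing calls
+
+def acquire(deployment_ids: list) -> str:
+    # pick the deployment currently handling the fewest calls
+    chosen = min(deployment_ids, key=lambda d: inflight[d])
+    inflight[chosen] += 1
+    return chosen
+
+def release(deployment_id: str):
+    # decrement once a response (or an error) comes back
+    inflight[deployment_id] -= 1
+```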