docs(routing.md): adding latency-based routing to docs
parent d71f89aac3
commit d0d08b4dce
1 changed file with 108 additions and 4 deletions
````diff
@@ -65,13 +65,16 @@ print(response)
 - `router.completion()` - chat completions endpoint to call 100+ LLMs
 - `router.acompletion()` - async chat completion calls
 - `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
-- `router.aembeddings()` - async embeddings endpoint
+- `router.aembeddings()` - async embeddings calls
 - `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
+- `router.atext_completion()` - async text completion calls
+- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
+- `router.aimage_generation()` - async image generation calls

 ### Advanced
-#### Routing Strategies - Weighted Pick, Rate Limit Aware
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based

-Router provides 2 strategies for routing your calls across multiple deployments:
+Router provides 4 strategies for routing your calls across multiple deployments:

 <Tabs>
 <TabItem value="simple-shuffle" label="Weighted Pick">
````
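The endpoint bullets this hunk extends correspond to paired sync/async methods on `Router`. A minimal sketch of calling two of them (the single OpenAI deployment and environment-based key here are illustrative assumptions, not part of the diff):

```python
import asyncio
import os

from litellm import Router

# one OpenAI deployment behind the "gpt-3.5-turbo" alias (illustrative)
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

router = Router(model_list=model_list)

# sync chat completion
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response)

# async chat completion
async def main():
    return await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello!"}],
    )

print(asyncio.run(main()))
```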
````diff
@@ -172,7 +175,7 @@ router = Router(model_list=model_list,
              redis_host=os.environ["REDIS_HOST"],
              redis_password=os.environ["REDIS_PASSWORD"],
              redis_port=os.environ["REDIS_PORT"],
-             routing_strategy="simple-shuffle")
+             routing_strategy="usage-based-routing")


 response = await router.acompletion(model="gpt-3.5-turbo",
````
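Assembled, the constructor this hunk edits would look roughly like the sketch below. The Redis parameters come straight from the hunk; `model_list` is stubbed here, since the full multi-deployment list appears earlier in the doc:

```python
import os

from litellm import Router

# stub; the doc defines a multi-deployment list earlier
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

# redis params as shown in the diff above; used to share usage state
router = Router(
    model_list=model_list,
    redis_host=os.environ["REDIS_HOST"],
    redis_password=os.environ["REDIS_PASSWORD"],
    redis_port=os.environ["REDIS_PORT"],
    routing_strategy="usage-based-routing",
)
```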
````diff
@@ -182,6 +185,107 @@ print(response)
 ```

+</TabItem>
+<TabItem value="least-busy" label="Least-Busy">
+
+Picks the deployment with the least number of ongoing calls it's currently handling.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="least-busy")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+</TabItem>
+<TabItem value="latency-based" label="Latency-Based">
+
+Picks the deployment with the lowest response time.
+
+It caches and updates response times for each deployment, based on when a request was sent to and received back from that deployment.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
 </TabItem>
 </Tabs>
````
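The "Least-Busy" tab added above states its selection rule only informally. A toy illustration of that rule, not litellm's actual implementation (deployment names and counts here are made up):

```python
# in-flight request counts per deployment (made-up numbers)
ongoing_calls = {
    "azure/chatgpt-v-2": 3,
    "azure/chatgpt-functioncalling": 1,
    "gpt-3.5-turbo": 2,
}

def pick_least_busy(counts: dict) -> str:
    # route to the deployment currently handling the fewest calls
    return min(counts, key=counts.get)

print(pick_least_busy(ongoing_calls))  # azure/chatgpt-functioncalling
```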
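Likewise, a toy sketch of the latency-based rule described in the new tab: cache a response time per deployment as requests complete, then route to the fastest. This illustrates the idea only, not litellm's internals (timings are made up):

```python
import time

latencies: dict = {}  # deployment -> last observed response time (seconds)

def record(deployment: str, sent_at: float, received_at: float) -> None:
    # a production router would average or window these samples
    latencies[deployment] = received_at - sent_at

def pick_fastest() -> str:
    return min(latencies, key=latencies.get)

now = time.time()
record("azure/chatgpt-v-2", now, now + 0.8)  # made-up timings
record("gpt-3.5-turbo", now, now + 0.3)
print(pick_fastest())  # gpt-3.5-turbo
```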