From d0d08b4dce58aa3946a0f11a2fc4f5887cbddcc2 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 1 Jan 2024 08:36:40 +0530
Subject: [PATCH] docs(routing.md): adding latency-based routing to docs

---
 docs/my-website/docs/routing.md | 112 ++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 8c58c10e6..a0e30cf16 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -65,13 +65,16 @@ print(response)
 - `router.completion()` - chat completions endpoint to call 100+ LLMs
 - `router.acompletion()` - async chat completion calls
 - `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
-- `router.aembeddings()` - async embeddings endpoint
+- `router.aembeddings()` - async embeddings calls
 - `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
+- `router.atext_completion()` - async text completion calls
+- `router.image_generation()` - image generation calls in OpenAI `/v1/images/generations` endpoint format
+- `router.aimage_generation()` - async image generation calls
 
 ### Advanced
-#### Routing Strategies - Weighted Pick, Rate Limit Aware
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
 
-Router provides 2 strategies for routing your calls across multiple deployments:
+Router provides 4 strategies for routing your calls across multiple deployments:
 
@@ -172,7 +175,7 @@ router = Router(model_list=model_list,
                 redis_host=os.environ["REDIS_HOST"],
                 redis_password=os.environ["REDIS_PASSWORD"],
                 redis_port=os.environ["REDIS_PORT"],
-                routing_strategy="simple-shuffle")
+                routing_strategy="usage-based-routing")
 
 
 response = await router.acompletion(model="gpt-3.5-turbo",
@@ -182,6 +185,107 @@ print(response)
 ```
 
+
+
+
+
+Picks the deployment with the least number of ongoing calls it's currently handling.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="least-busy")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+
+
+
+
+Picks the deployment with the lowest response time.
+
+It caches and updates response times for each deployment, based on when a request was sent to and a response received from that deployment.
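+
+A minimal sketch of the idea behind this bookkeeping (illustrative only; `record_latency` and `pick_fastest` are hypothetical helpers, not LiteLLM's internals):
+
+```python
+from collections import defaultdict
+
+latency_cache = defaultdict(list)  # hypothetical store: deployment id -> recent response times (seconds)
+
+def record_latency(deployment_id: str, sent_at: float, received_at: float):
+    # keep a small rolling window of observed response times per deployment
+    latency_cache[deployment_id].append(received_at - sent_at)
+    del latency_cache[deployment_id][:-10]
+
+def pick_fastest(deployment_ids: list) -> str:
+    # deployments with no history average to 0, so they get tried first
+    return min(deployment_ids, key=lambda d: sum(latency_cache[d]) / max(len(latency_cache[d]), 1))
+```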
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+
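+The `least-busy` strategy above can be pictured the same way; a minimal sketch of the in-flight call tracking it implies (illustrative only; `acquire` and `release` are hypothetical helpers, not LiteLLM's internals):
+
+```python
+from collections import defaultdict
+
+inflight = defaultdict(int)  # hypothetical counter: deployment id -> number of ongoing calls
+
+def acquire(deployment_ids: list) -> str:
+    # pick the deployment currently handling the fewest calls
+    chosen = min(deployment_ids, key=lambda d: inflight[d])
+    inflight[chosen] += 1
+    return chosen
+
+def release(deployment_id: str):
+    # decrement once a response (or an error) comes back
+    inflight[deployment_id] -= 1
+```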