diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index cb3722c229..c60371fdc7 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -1,4 +1,7 @@
 import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 
 # Manage Multiple Deployments
 
@@ -17,15 +20,15 @@ In production, [Router connects to a Redis Cache](#redis-queue) to track usage a
 from litellm import Router
 
 model_list = [{ # list of model deployments
-    "model_name": "gpt-3.5-turbo", # openai model name
+    "model_name": "gpt-3.5-turbo", # model alias
     "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-v-2",
+        "model": "azure/chatgpt-v-2", # actual model name
         "api_key": os.getenv("AZURE_API_KEY"),
         "api_version": os.getenv("AZURE_API_VERSION"),
         "api_base": os.getenv("AZURE_API_BASE")
     }
 }, {
-    "model_name": "gpt-3.5-turbo", # openai model name
+    "model_name": "gpt-3.5-turbo",
     "litellm_params": { # params for litellm completion/embedding call
         "model": "azure/chatgpt-functioncalling",
         "api_key": os.getenv("AZURE_API_KEY"),
@@ -33,7 +36,7 @@ model_list = [{ # list of model deployments
         "api_base": os.getenv("AZURE_API_BASE")
     }
 }, {
-    "model_name": "gpt-3.5-turbo", # openai model name
+    "model_name": "gpt-3.5-turbo",
     "litellm_params": { # params for litellm completion/embedding call
         "model": "gpt-3.5-turbo",
         "api_key": os.getenv("OPENAI_API_KEY"),
@@ -43,28 +46,212 @@ model_list = [{ # list of model deployments
 router = Router(model_list=model_list)
 
 # openai.ChatCompletion.create replacement
-response = router.completion(model="gpt-3.5-turbo",
+response = await router.acompletion(model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+### Available Endpoints
+- `router.completion()` - chat completions endpoint to call 100+ LLMs
+- `router.acompletion()` - async chat completion calls
+- `router.embeddings()` - embedding calls for Azure, OpenAI, and Huggingface models (see the sketch below)
+- `router.aembeddings()` - async embedding calls
+- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
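+
+For example, here's a minimal embedding sketch. It assumes your `model_list` also contains an embedding deployment, registered under the hypothetical alias `text-embedding-ada-002`:
+
+```python
+# assumes model_list includes a deployment with the alias "text-embedding-ada-002"
+response = router.embeddings(
+    model="text-embedding-ada-002",
+    input=["Hello, how's it going?"]
+)
+print(response)
+```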
+
+### Routing Strategies
+
+Router provides 2 strategies for routing your calls across multiple deployments:
+
+<Tabs>
+<TabItem value="simple-shuffle" label="Simple Shuffle">
+
+**Default** Randomly picks a deployment to route a call to.
+
+```python
+from litellm import Router
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+
+router = Router(model_list=model_list, routing_strategy="simple-shuffle")
+
+
+response = await router.acompletion(model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="usage-based-routing" label="Usage-Based">
+
+This will route to the deployment with the lowest TPM usage for that minute.
+
+In production, we use Redis to track usage (TPM/RPM) across multiple deployments.
+
+If you pass in the deployment's tpm/rpm limits, this will also check against those, and filter out any deployment whose limits would be exceeded.
+
+For Azure, your RPM = TPM/6 (e.g., a deployment with 60,000 TPM gets a 10,000 RPM limit).
+
+
+```python
+from litellm import Router
+
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    },
+    "tpm": 100000,
+    "rpm": 10000,
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    },
+    "tpm": 100000,
+    "rpm": 1000,
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    },
+    "tpm": 100000,
+    "rpm": 1000,
+}]
+router = Router(model_list=model_list,
+                redis_host=os.environ["REDIS_HOST"],
+                redis_password=os.environ["REDIS_PASSWORD"],
+                redis_port=os.environ["REDIS_PORT"],
+                routing_strategy="usage-based-routing")
+
+
+response = await router.acompletion(model="gpt-3.5-turbo",
                 messages=[{"role": "user", "content": "Hey, how's it going?"}])
 
 print(response)
 ```
 
-### Redis Queue
-In production, we use Redis to track usage across multiple Azure deployments.
+
+</TabItem>
+</Tabs>
+
+### Caching + Request Timeouts
+
+In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
+
+**In-memory Cache + Timeouts**
 
 ```python
 router = Router(model_list=model_list,
-               redis_host=os.getenv("REDIS_HOST"),
-               redis_password=os.getenv("REDIS_PASSWORD"),
-               redis_port=os.getenv("REDIS_PORT"))
+                cache_responses=True,
+                timeout=30) # timeout requests after 30s
 
+response = router.completion(model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}])
 print(response)
 ```
 
+**Redis Cache + Timeouts**
+```python
+router = Router(model_list=model_list,
+                redis_host=os.getenv("REDIS_HOST"),
+                redis_password=os.getenv("REDIS_PASSWORD"),
+                redis_port=os.getenv("REDIS_PORT"),
+                cache_responses=True,
+                timeout=30)
+
+response = router.completion(model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}])
+print(response)
+```
+
+### Retry failed requests
+
+For both async + sync functions, we support retrying failed requests.
+
+If it's a RateLimitError, we implement exponential backoff.
+
+If it's a generic OpenAI API error, we retry immediately.
+
+For any other exception type, we don't retry.
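+
+That policy roughly corresponds to the sketch below. This is for intuition only, assuming `RateLimitError` and `APIError` are importable from `litellm.exceptions`; it is not litellm's actual implementation:
+
+```python
+import time
+from litellm.exceptions import RateLimitError, APIError
+
+def call_with_retries(router, messages, num_retries=3):
+    # rough sketch of the retry policy described above
+    for attempt in range(num_retries + 1):
+        try:
+            return router.completion(model="gpt-3.5-turbo", messages=messages)
+        except RateLimitError:
+            if attempt == num_retries:
+                raise
+            time.sleep(2 ** attempt)  # rate limited -> exponential backoff
+        except APIError:
+            if attempt == num_retries:
+                raise
+            # generic API error -> retry immediately
+        except Exception:
+            raise  # any other exception type -> don't retry
+```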
+
+Here's a quick look at how we can set `num_retries = 3`:
+
+```python
+from litellm import Router
+
+router = Router(model_list=model_list,
+                cache_responses=True,
+                timeout=30,
+                num_retries=3)
+
+user_message = "Hello, what's the weather in San Francisco?"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal call
+response = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+print(f"response: {response}")
+```
+
+### Default litellm.completion/embedding params
+
+You can also set default params for litellm completion/embedding calls. Here's how to do that:
+
+```python
+from litellm import Router
+
+fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
+
+router = Router(model_list=model_list,
+                default_litellm_params={"context_window_fallback_dict": fallback_dict})
+
+user_message = "Hello, what's the weather in San Francisco?"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal call
+response = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+print(f"response: {response}")
+```
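+
+Since these defaults are forwarded to every litellm call the Router makes, other completion kwargs should work the same way. A minimal sketch, where the choice of `temperature` is just an illustration:
+
+```python
+# apply a default temperature to every call routed through this Router (illustrative)
+router = Router(model_list=model_list,
+                default_litellm_params={"temperature": 0.5})
+```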
+
+
 ### Deploy Router
 
-If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md)
+If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)
+
+
+## litellm.completion()
+
+If you're calling litellm.completion(), here are the different reliability options you can enable.
 
 ## Retry failed requests
 
@@ -103,7 +290,7 @@ from litellm import completion
 
 fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
 messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
 
-completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
 ```
 
 ### Fallbacks - Switch Models/API Keys/API Bases
diff --git a/litellm/router.py b/litellm/router.py
index 02096d81ba..a8897ed537 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -11,7 +11,7 @@ class Router:
     Example usage:
         from litellm import Router
         model_list = [{
-            "model_name": "gpt-3.5-turbo", # openai model name
+            "model_name": "gpt-3.5-turbo", # model alias
             "litellm_params": { # params for litellm completion/embedding call
                 "model": "azure/<your-deployment-name>",
                 "api_key": <your-azure-api-key>,
@@ -47,9 +47,9 @@ class Router:
 
         self.chat = litellm.Chat(params=default_litellm_params)
 
-        self.default_litellm_params = {
-            "timeout": timeout
-        }
+        self.default_litellm_params = default_litellm_params
+        self.default_litellm_params["timeout"] = timeout
+        self.routing_strategy = routing_strategy
 
         ### HEALTH CHECK THREAD ###
         if self.routing_strategy == "least-busy":