diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index cb3722c229..c60371fdc7 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
# Manage Multiple Deployments
@@ -17,15 +20,15 @@ In production, [Router connects to a Redis Cache](#redis-queue) to track usage a
from litellm import Router
model_list = [{ # list of model deployments
- "model_name": "gpt-3.5-turbo", # openai model name
+ "model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
- "model": "azure/chatgpt-v-2",
+ "model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
- "model_name": "gpt-3.5-turbo", # openai model name
+ "model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
@@ -33,7 +36,7 @@ model_list = [{ # list of model deployments
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
- "model_name": "gpt-3.5-turbo", # openai model name
+ "model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
@@ -43,28 +46,212 @@ model_list = [{ # list of model deployments
router = Router(model_list=model_list)
# openai.ChatCompletion.create replacement
-response = router.completion(model="gpt-3.5-turbo",
+response = await router.acompletion(model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+### Available Endpoints
+- `router.completion()` - chat completions endpoint to call 100+ LLMs
+- `router.acompletion()` - async chat completion calls
+- `router.embeddings()` - embedding endpoint for Azure, OpenAI, and Huggingface deployments
+- `router.aembeddings()` - async embeddings endpoint
+- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
+
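+The async endpoints need to run inside an event loop. Here's a minimal sketch (reusing the `router` and deployments from the quickstart above) showing the sync call next to the async one:
+
+```python
+import asyncio
+
+# sync call
+response = router.completion(model="gpt-3.5-turbo",
+                             messages=[{"role": "user", "content": "Hey, how's it going?"}])
+print(response)
+
+# async call - run it inside an event loop if you aren't already in one
+async def main():
+    return await router.acompletion(model="gpt-3.5-turbo",
+                                    messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(asyncio.run(main()))
+```
+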
+### Routing Strategies
+
+Router provides 2 strategies for routing your calls across multiple deployments:
+
+
+<Tabs>
+<TabItem value="simple-shuffle" label="simple-shuffle">
+
+**Default** Randomly picks a deployment to route a call to.
+
+```python
+from litellm import Router
+
+model_list = [{ # list of model deployments
+ "model_name": "gpt-3.5-turbo", # model alias
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "azure/chatgpt-v-2", # actual model name
+ "api_key": os.getenv("AZURE_API_KEY"),
+ "api_version": os.getenv("AZURE_API_VERSION"),
+ "api_base": os.getenv("AZURE_API_BASE")
+ }
+}, {
+ "model_name": "gpt-3.5-turbo",
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "azure/chatgpt-functioncalling",
+ "api_key": os.getenv("AZURE_API_KEY"),
+ "api_version": os.getenv("AZURE_API_VERSION"),
+ "api_base": os.getenv("AZURE_API_BASE")
+ }
+}, {
+ "model_name": "gpt-3.5-turbo",
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "gpt-3.5-turbo",
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ }
+}]
+
+
+router = Router(model_list=model_list, routing_strategy="simple-shuffle")
+
+
+response = await router.acompletion(model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="usage-based-routing" label="usage-based-routing">
+
+This routes to the deployment with the lowest TPM usage for that minute.
+
+In production, we use Redis to track usage (TPM/RPM) across multiple deployments.
+
+If you pass in the deployment's TPM/RPM limits, it will also check against those and filter out any deployment whose limits would be exceeded.
+
+For Azure, your RPM = TPM/6.
+
+
+```python
+from litellm import Router
+
+
+model_list = [{ # list of model deployments
+ "model_name": "gpt-3.5-turbo", # model alias
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "azure/chatgpt-v-2", # actual model name
+ "api_key": os.getenv("AZURE_API_KEY"),
+ "api_version": os.getenv("AZURE_API_VERSION"),
+ "api_base": os.getenv("AZURE_API_BASE")
+ },
+ "tpm": 100000,
+ "rpm": 10000,
+}, {
+ "model_name": "gpt-3.5-turbo",
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "azure/chatgpt-functioncalling",
+ "api_key": os.getenv("AZURE_API_KEY"),
+ "api_version": os.getenv("AZURE_API_VERSION"),
+ "api_base": os.getenv("AZURE_API_BASE")
+ },
+ "tpm": 100000,
+ "rpm": 1000,
+}, {
+ "model_name": "gpt-3.5-turbo",
+ "litellm_params": { # params for litellm completion/embedding call
+ "model": "gpt-3.5-turbo",
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ },
+ "tpm": 100000,
+ "rpm": 1000,
+}]
+router = Router(model_list=model_list,
+ redis_host=os.environ["REDIS_HOST"],
+ redis_password=os.environ["REDIS_PASSWORD"],
+ redis_port=os.environ["REDIS_PORT"],
+ routing_strategy="usage-based-routing")
+
+
+response = await router.acompletion(model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": "Hey, how's it going?"}])
print(response)
```
-### Redis Queue
-In production, we use Redis to track usage across multiple Azure deployments.
+
+</TabItem>
+</Tabs>
+
+### Caching + Request Timeouts
+
+In production, we recommend using a Redis cache. For quick local testing, we also support simple in-memory caching.
+
+**In-memory Cache + Timeouts**
```python
router = Router(model_list=model_list,
- redis_host=os.getenv("REDIS_HOST"),
- redis_password=os.getenv("REDIS_PASSWORD"),
- redis_port=os.getenv("REDIS_PORT"))
+ cache_responses=True,
+ timeout=30) # timeout set to 30s
```
+**Redis Cache + Timeouts**
+```python
+router = Router(model_list=model_list,
+ redis_host=os.getenv("REDIS_HOST"),
+ redis_password=os.getenv("REDIS_PASSWORD"),
+ redis_port=os.getenv("REDIS_PORT"),
+ cache_responses=True,
+ timeout=30)
+```
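+
+With `cache_responses=True`, repeating an identical request should be answered from the cache instead of hitting the deployment again. A minimal sketch, reusing the `router` configured above:
+
+```python
+messages = [{"role": "user", "content": "Hey, how's it going?"}]
+
+# first call hits the deployment
+response_1 = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+# an identical call should now be served from the cache
+response_2 = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+print(response_1)
+print(response_2)
+```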
+
+### Retry failed requests
+
+For both async + sync functions, we support retrying failed requests.
+
+If it's a `RateLimitError`, we retry with exponential backoff.
+
+If it's a generic OpenAI API error, we retry immediately.
+
+For any other exception type, we don't retry.
+
+Here's a quick look at how we can set `num_retries = 3`:
+
+```python
+from litellm import Router
+
+router = Router(model_list=model_list,
+ cache_responses=True,
+ timeout=30,
+ num_retries=3)
+
+user_message = "Hello, whats the weather in San Francisco??"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal call
+response = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+print(f"response: {response}")
+```
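+
+The same `num_retries` setting applies to the async functions. A quick sketch, reusing `router` and `messages` from above:
+
+```python
+import asyncio
+
+async def main():
+    # async call - failed requests are retried here as well
+    response = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
+    print(f"async response: {response}")
+
+asyncio.run(main())
+```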
+
+### Default litellm.completion/embedding params
+
+You can also set default params for litellm completion/embedding calls. Here's how to do that:
+
+```python
+from litellm import Router
+
+fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
+
+router = Router(model_list=model_list,
+ default_litellm_params={"context_window_fallback_dict": fallback_dict})
+
+user_message = "Hello, whats the weather in San Francisco??"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal call
+response = router.completion(model="gpt-3.5-turbo", messages=messages)
+
+print(f"response: {response}")
+```
+
+
### Deploy Router
-If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md)
+If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)
+
+
+## litellm.completion()
+
+If you're calling `litellm.completion()` directly, here are the different reliability options you can enable.
## Retry failed requests
@@ -103,7 +290,7 @@ from litellm import completion
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
-completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```
### Fallbacks - Switch Models/API Keys/API Bases
diff --git a/litellm/router.py b/litellm/router.py
index 02096d81ba..a8897ed537 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -11,7 +11,7 @@ class Router:
Example usage:
from litellm import Router
model_list = [{
- "model_name": "gpt-3.5-turbo", # openai model name
+ "model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/",
"api_key": ,
@@ -47,9 +47,9 @@ class Router:
self.chat = litellm.Chat(params=default_litellm_params)
- self.default_litellm_params = {
- "timeout": timeout
- }
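+        # start from the caller-provided default params, and always set the router-level timeout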
+ self.default_litellm_params = default_litellm_params
+ self.default_litellm_params["timeout"] = timeout
+
self.routing_strategy = routing_strategy
### HEALTH CHECK THREAD ###
if self.routing_strategy == "least-busy":