From f1dbdb58bbb40e3bbe12051e8b0e3cce24a54498 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Fri, 24 Nov 2023 13:37:16 -0800 Subject: [PATCH] docs(simple_proxy.md): add cooldown to docs --- docs/my-website/docs/routing.md | 30 ++++++++++++++++++++++++---- docs/my-website/docs/simple_proxy.md | 5 ++++- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index a8f7b16c1c..03d95e9ef2 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -188,11 +188,33 @@ from litellm import Router model_list = [{...}] router = Router(model_list=model_list, - timeout=30) # timeout set to 30s + timeout=30) # raise timeout error if call takes > 30s print(response) ``` +### Cooldowns + +Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. + +```python +from litellm import Router + +model_list = [{...}] + +router = Router(model_list=model_list, + allowed_fails=1) # cooldown model if it fails > 1 call in a minute. + +user_message = "Hello, whats the weather in San Francisco??" +messages = [{"content": user_message, "role": "user"}] + +# normal call +response = router.completion(model="gpt-3.5-turbo", messages=messages) + +print(f"response: {response}") + +``` + ### Retries For both async + sync functions, we support retrying failed requests. @@ -206,9 +228,9 @@ Here's a quick look at how we can set `num_retries = 3`: ```python from litellm import Router -router = Router(model_list=model_list, - cache_responses=True, - timeout=30, +model_list = [{...}] + +router = Router(model_list=model_list, num_retries=3) user_message = "Hello, whats the weather in San Francisco??" 
diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md index 718b712171..fd0884420b 100644 --- a/docs/my-website/docs/simple_proxy.md +++ b/docs/my-website/docs/simple_proxy.md @@ -853,12 +853,14 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ ' ``` -### Fallbacks + Retries + Timeouts +### Fallbacks + Cooldowns + Retries + Timeouts If a call fails after num_retries, fall back to another model group. If the error is a context window exceeded error, fall back to a larger model group (if given). +[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py) + ```yaml model_list: - model_name: zephyr-beta @@ -887,6 +889,7 @@ litellm_settings: request_timeout: 10 # raise Timeout error if call takes longer than 10s fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error + allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute. ``` ### Set Custom Prompt Templates