From 12dbdc4c15787a7d20596aee91917dd034abb1c6 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Fri, 24 Nov 2023 12:20:38 -0800
Subject: [PATCH] docs(simple_proxy.md): add tutorial for doing fallbacks +
 retries + timeouts on the proxy

---
 docs/my-website/docs/simple_proxy.md | 36 ++++++++++++++++++++++++++++
 litellm/__init__.py                  |  9 ++++---
 litellm/router.py                    | 12 +++++-----
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md
index a6c5d159d4..718b712171 100644
--- a/docs/my-website/docs/simple_proxy.md
+++ b/docs/my-website/docs/simple_proxy.md
@@ -853,6 +853,42 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
 '
 ```
 
+### Fallbacks + Retries + Timeouts
+
+If a call fails after num_retries, fall back to another model group.
+
+If the error is a context window exceeded error, fall back to a larger model group (if given).
+
+```yaml
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: 
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: 
+
+litellm_settings:
+  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
+  request_timeout: 10 # raise Timeout error if call takes longer than 10s
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails after num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on a context window error
+```
+
 ### Set Custom Prompt Templates
 LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in it's tokenizer_config.json).
 However, you can also set a custom prompt template on your proxy in the `config.yaml`:
diff --git a/litellm/__init__.py b/litellm/__init__.py
index fe6ceede75..e7ad788a01 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -21,7 +21,6 @@ telemetry = True
 max_tokens = 256 # OpenAI Defaults
 drop_params = False
 retry = True
-request_timeout: Optional[float] = 6000
 api_key: Optional[str] = None
 openai_key: Optional[str] = None
 azure_key: Optional[str] = None
@@ -50,10 +49,14 @@ error_logs: Dict = {}
 add_function_to_prompt: bool = False # if function calling not supported by api, append function call details to system prompt
 client_session: Optional[httpx.Client] = None
 aclient_session: Optional[httpx.AsyncClient] = None
-model_fallbacks: Optional[List] = None
+model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
 model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
-num_retries: Optional[int] = None
 suppress_debug_info = False
+#### RELIABILITY ####
+request_timeout: Optional[float] = 6000
+num_retries: Optional[int] = None
+fallbacks: Optional[List] = None
+context_window_fallbacks: Optional[List] = None
 #############################################
 
 def get_model_cost_map(url: str):
diff --git a/litellm/router.py b/litellm/router.py
index 9516940e5c..6c0ddbcb8a 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -64,7 +64,7 @@ class Router:
                  redis_password: Optional[str] = None,
                  cache_responses: bool = False,
                  num_retries: int = 0,
-                 timeout: float = 600,
+                 timeout: Optional[float] = None,
                  default_litellm_params = {}, # default params for Router.chat.completion.create
                  set_verbose: bool = False,
                  fallbacks: List = [],
@@ -79,12 +79,12 @@ class Router:
             for m in model_list:
                 self.deployment_latency_map[m["litellm_params"]["model"]] = 0
 
-        self.num_retries = num_retries
-        self.set_verbose = set_verbose
-        self.timeout = timeout
+        self.num_retries = num_retries or litellm.num_retries
+        self.set_verbose = set_verbose
+        self.timeout = timeout or litellm.request_timeout
         self.routing_strategy = routing_strategy
-        self.fallbacks = fallbacks
-        self.context_window_fallbacks = context_window_fallbacks
+        self.fallbacks = fallbacks or litellm.fallbacks
+        self.context_window_fallbacks = context_window_fallbacks or litellm.context_window_fallbacks
 
         # make Router.chat.completions.create compatible for openai.chat.completions.create
         self.chat = litellm.Chat(params=default_litellm_params)
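
For reviewers, a minimal sketch (not part of the patch) of how the new module-level settings feed the `Router` defaults through the `or litellm.<setting>` wiring above. It assumes `litellm.Router` is constructed directly with a `model_list` shaped like the YAML config in the docs section; the proxy would do the equivalent after reading `litellm_settings`:

```python
import litellm
from litellm import Router

# What the proxy's `litellm_settings` block would set at module level
litellm.num_retries = 3
litellm.request_timeout = 10
litellm.fallbacks = [{"zephyr-beta": ["gpt-3.5-turbo"]}]
litellm.context_window_fallbacks = [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}]

# Same shape as the config.yaml `model_list` entries
model_list = [
    {
        "model_name": "zephyr-beta",
        "litellm_params": {
            "model": "huggingface/HuggingFaceH4/zephyr-7b-beta",
            "api_base": "http://0.0.0.0:8001",
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo"},
    },
]

# num_retries / timeout / fallbacks are not passed explicitly, so Router.__init__
# falls back to the litellm.* values set above (the `or litellm.<setting>` changes)
router = Router(model_list=model_list)

response = router.completion(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(response)
```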
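
To exercise the fallback config end-to-end, one option (assuming the proxy from the docs section is running on http://0.0.0.0:8000, the endpoint the existing curl examples use) is a plain OpenAI-format request against the model group; retries, timeouts, and fallbacks then apply server-side:

```python
import requests

# `zephyr-beta` is the model group name from config.yaml, not a single deployment
resp = requests.post(
    "http://0.0.0.0:8000/chat/completions",
    json={
        "model": "zephyr-beta",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
)
print(resp.json())
```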