docs(simple_proxy.md): add tutorial for doing fallbacks + retries + timeouts on the proxy

Krrish Dholakia 2023-11-24 12:20:38 -08:00
parent 3dcbf6197a
commit 12dbdc4c15
3 changed files with 48 additions and 9 deletions


@@ -853,6 +853,42 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
'
```
### Fallbacks + Retries + Timeouts
If a call still fails after `num_retries` retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if one is given).
```yaml
model_list:
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8001
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8002
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8003
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: <my-openai-key>
- model_name: gpt-3.5-turbo-16k
litellm_params:
model: gpt-3.5-turbo-16k
api_key: <my-openai-key>
litellm_settings:
num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
request_timeout: 10 # raise Timeout error if call takes longer than 10s
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if the call still fails after num_retries
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
```
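To see this end-to-end, send a request to the proxy for one of the load-balanced model groups. A minimal sketch, reusing the `curl` style from the examples above (assumes the proxy is running on port 8000):
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "zephyr-beta",
    "messages": [{"role": "user", "content": "what llm are you"}]
}'
```
With the config above, the proxy retries a failing `zephyr-beta` call up to 3 times, raises a Timeout error past 10s, and falls back to `gpt-3.5-turbo` once the retries are exhausted.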
### Set Custom Prompt Templates
LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in its tokenizer_config.json). However, you can also set a custom prompt template on your proxy in the `config.yaml`:
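For example, a ChatML-style template could look like the sketch below. The `initial_prompt_value`, `roles`, and `final_prompt_value` keys mirror LiteLLM's prompt-template parameters; the model name and delimiters here are illustrative assumptions:
```yaml
model_list:
  - model_name: mistral-7b
    litellm_params:
      model: huggingface/mistralai/Mistral-7B-Instruct-v0.1
      api_base: <your-api-base>
      initial_prompt_value: "\n"
      roles:
        system: {pre_message: "<|im_start|>system\n", post_message: "<|im_end|>\n"}
        user: {pre_message: "<|im_start|>user\n", post_message: "<|im_end|>\n"}
        assistant: {pre_message: "<|im_start|>assistant\n", post_message: "<|im_end|>\n"}
      final_prompt_value: "\n"
```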


@@ -21,7 +21,6 @@ telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
retry = True
request_timeout: Optional[float] = 6000
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None
@@ -50,10 +49,14 @@ error_logs: Dict = {}
add_function_to_prompt: bool = False # if function calling not supported by api, append function call details to system prompt
client_session: Optional[httpx.Client] = None
aclient_session: Optional[httpx.AsyncClient] = None
model_fallbacks: Optional[List] = None
model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
num_retries: Optional[int] = None
suppress_debug_info = False
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
#############################################
def get_model_cost_map(url: str):
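These module-level attributes act as global defaults. A minimal sketch of setting them directly in Python, with hypothetical values (the Router picks them up when no per-instance value is passed, as shown in the next diff):

```python
import litellm

# Global reliability defaults; used by the Router when the
# corresponding constructor arguments are not provided.
litellm.request_timeout = 10   # seconds
litellm.num_retries = 3
litellm.fallbacks = [{"zephyr-beta": ["gpt-3.5-turbo"]}]
litellm.context_window_fallbacks = [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}]
```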


@@ -64,7 +64,7 @@ class Router:
redis_password: Optional[str] = None,
cache_responses: bool = False,
num_retries: int = 0,
timeout: float = 600,
timeout: Optional[float] = None,
default_litellm_params = {}, # default params for Router.chat.completion.create
set_verbose: bool = False,
fallbacks: List = [],
@@ -79,12 +79,12 @@
for m in model_list:
self.deployment_latency_map[m["litellm_params"]["model"]] = 0
self.num_retries = num_retries
self.set_verbose = set_verbose
self.timeout = timeout
self.num_retries = num_retries or litellm.num_retries
self.set_verbose = set_verbose
self.timeout = timeout or litellm.request_timeout
self.routing_strategy = routing_strategy
self.fallbacks = fallbacks
self.context_window_fallbacks = context_window_fallbacks
self.fallbacks = fallbacks or litellm.fallbacks
self.context_window_fallbacks = context_window_fallbacks or litellm.context_window_fallbacks
# make Router.chat.completions.create compatible for openai.chat.completions.create
self.chat = litellm.Chat(params=default_litellm_params)
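
For reference, a sketch of constructing the Router directly with the same reliability settings; the model list shape follows the docs example above, and the values are illustrative:

```python
from litellm import Router

model_list = [
    {
        "model_name": "zephyr-beta",
        "litellm_params": {
            "model": "huggingface/HuggingFaceH4/zephyr-7b-beta",
            "api_base": "http://0.0.0.0:8001",
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "<my-openai-key>"},
    },
]

router = Router(
    model_list=model_list,
    num_retries=3,  # if omitted, falls back to litellm.num_retries
    timeout=10,     # if omitted, falls back to litellm.request_timeout
    fallbacks=[{"zephyr-beta": ["gpt-3.5-turbo"]}],
)

response = router.completion(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "what llm are you"}],
)
```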