docs(routing.md): adding context window fallback dict and num retries

Krrish Dholakia 2023-11-01 13:52:12 -07:00
parent 270fc02578
commit fb4be198ee
5 changed files with 237 additions and 45 deletions


@@ -84,8 +84,11 @@ def completion(
    api_base: Optional[str] = None,
    api_version: Optional[str] = None,
    api_key: Optional[str] = None,
    num_retries: Optional[int] = None, # set to retry a model if an APIError, TimeoutError, or ServiceUnavailableError occurs
    context_window_fallback_dict: Optional[dict] = None, # mapping of model to use if call fails due to context window error
    fallbacks: Optional[list] = None, # pass in a list of api_base, keys, etc.
    metadata: Optional[dict] = None, # additional call metadata, passed to logging integrations / custom callbacks
    **kwargs,
) -> ModelResponse:
@@ -143,10 +146,16 @@ def completion(
- `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds)

#### litellm-specific params
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
- `api_version`: *string (optional)* - (Azure-specific) the api version for the call
- `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs
- `context_window_fallback_dict`: *dict (optional)* - A mapping of the model to fall back to if the call fails due to a context window error
- `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails
- `metadata`: *dict (optional)* - Any additional data you want to be logged when the call is made (sent to logging integrations, e.g. promptlayer, and accessible via a custom callback function)
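
Taken together, the two new params can be combined in a single call. A minimal sketch (the prompt and the fallback mapping below are illustrative placeholders):

```python
from litellm import completion

messages = [{"content": "long prompt goes here...", "role": "user"}]

# retry transient errors up to 2 times; if the prompt overflows
# gpt-3.5-turbo's context window, fall back to gpt-3.5-turbo-16k
response = completion(
    model="gpt-3.5-turbo",
    messages=messages,
    num_retries=2,
    context_window_fallback_dict={"gpt-3.5-turbo": "gpt-3.5-turbo-16k"},
)
```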


@@ -1,62 +1,53 @@
# Reliability

LiteLLM helps prevent failed requests in 2 ways:
- Retries
- Fallbacks: Context Window + General

## Helper utils
LiteLLM supports the following functions for reliability:
* `litellm.longer_context_model_fallback_dict`: Dictionary mapping models to their larger-context equivalents
* `num_retries`: use tenacity retries
* `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
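
You can inspect the helper dictionary directly. A quick illustration (the exact entries depend on your litellm version):

```python
import litellm

# look up the larger-context equivalent of a model, if one exists
larger_model = litellm.longer_context_model_fallback_dict.get("gpt-3.5-turbo")
print(larger_model)  # e.g. "gpt-3.5-turbo-16k"
```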
## Context Window Errors
You can also handle context window errors manually, using the helper dict to swap in a larger-context model:
```python
from litellm import completion, longer_context_model_fallback_dict, ContextWindowExceededError

user_message = "how does a court case get to the Supreme Court?" * 1000
messages = [{"content": user_message, "role": "user"}]
model = "gpt-3.5-turbo"

try:
    # try the original model
    response = completion(model=model, messages=messages)
# catch the context window error
except ContextWindowExceededError as e:
    if model in longer_context_model_fallback_dict:
        # switch to the equivalent larger model -> gpt-3.5-turbo-16k
        new_model = longer_context_model_fallback_dict[model]
        response = completion(model=new_model, messages=messages)

print(response)
```
The `context_window_fallback_dict` param shown below automates this pattern.
## Retry failed requests
Call it in `completion()` like this: `completion(..., num_retries=2)`.

Here's a quick look at how you can use it:
```python
from litellm import completion

user_message = "Hello, what's the weather in San Francisco?"
messages = [{"content": user_message, "role": "user"}]

# normal call
response = completion(
    model="gpt-3.5-turbo",
    messages=messages,
    num_retries=2
)
```
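
If every retry fails, the call still raises, so you can wrap it as usual. A minimal sketch:

```python
from litellm import completion

try:
    response = completion(
        model="gpt-3.5-turbo",
        messages=[{"content": "Hello!", "role": "user"}],
        num_retries=2,
    )
    print(response)
except Exception as e:
    # reached only once the retries are exhausted
    print(f"Error occurred: {e}")
```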
## Fallbacks
### Context Window Fallbacks
If a call fails due to a context window error, `context_window_fallback_dict` retries it with the mapped model:
```python
from litellm import completion

fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]

completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```
### Fallbacks - Switch Models/API Keys/API Bases
LLM APIs can be unstable; `completion()` with fallbacks ensures you'll always get a response from your calls.

#### Usage
To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter.

The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response.
@@ -76,6 +67,11 @@ response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
            fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
```

[Check out this section for implementation details](#fallbacks-1)

## Implementation Details
### Fallbacks
#### Output from calls
```
Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model'
...
```
@@ -112,7 +108,7 @@ completion call gpt-3.5-turbo
When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable.

#### Key components of Model Fallbacks implementation:
* Looping through `fallbacks`
* Cool-Downs for rate-limited models


@@ -1,17 +1,78 @@
# Reliability - Fallbacks, Azure Deployments, etc.

LiteLLM helps prevent failed requests in 3 ways:
- Retries
- Fallbacks: Context Window + General
- RateLimitManager

## Helper utils
LiteLLM supports the following functions for reliability:
* `litellm.longer_context_model_fallback_dict`: Dictionary mapping models to their larger-context equivalents
* `num_retries`: use tenacity retries
* `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
* `router()`: An abstraction on top of completion + embeddings to route the request to a deployment with capacity (available tpm/rpm); see the sketch after this list.
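
A rough sketch of what routing looks like in code (illustrative only; the deployment names, keys, and bases below are placeholders, and the full setup is covered under "Manage Multiple Deployments"):

```python
from litellm import Router

# two deployments that can serve the same model_name
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/<your-azure-deployment>",
            "api_key": "<your-azure-api-key>",
            "api_base": "<your-azure-api-base>",
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "<your-openai-api-key>",
        },
    },
]

router = Router(model_list=model_list)

# the router picks a deployment with available tpm/rpm capacity
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```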
## Retry failed requests
Call it in `completion()` like this: `completion(..., num_retries=2)`.

Here's a quick look at how you can use it:
```python
from litellm import completion

user_message = "Hello, what's the weather in San Francisco?"
messages = [{"content": user_message, "role": "user"}]

# normal call
response = completion(
    model="gpt-3.5-turbo",
    messages=messages,
    num_retries=2
)
```
## Fallbacks
### Context Window Fallbacks
If a call fails due to a context window error, `context_window_fallback_dict` retries it with the mapped model:
```python
from litellm import completion

fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]

completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```
### Fallbacks - Switch Models/API Keys/API Bases
LLM APIs can be unstable; `completion()` with fallbacks ensures you'll always get a response from your calls.
#### Usage
To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter.
The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response.
#### switch models
```python
response = completion(model="bad-model", messages=messages,
            fallbacks=["gpt-3.5-turbo", "command-nightly"])
```
#### switch api keys/bases (e.g. azure deployment)
Switch between different keys for the same azure deployment, or use another deployment as well.
```python
api_key="bad-key"
response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
```
[Check out this section for implementation details](#fallbacks-1)
## Manage Multiple Deployments

Use this if you're trying to load-balance across multiple deployments (e.g. Azure/OpenAI).
@@ -109,4 +170,131 @@ curl 'http://0.0.0.0:8000/router/completions' \
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hey"}]
}'
```
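
For reference, the same request from Python, assuming the router server from the curl example is running locally on port 8000 (a sketch, not from the original docs):

```python
import requests

# POST the same payload the curl example sends
resp = requests.post(
    "http://0.0.0.0:8000/router/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey"}],
    },
)
print(resp.json())
```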
## Implementation Details
### Fallbacks
#### Output from calls
```
Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model'
completion call gpt-3.5-turbo
{
"id": "chatcmpl-7qTmVRuO3m3gIBg4aTmAumV1TmQhB",
"object": "chat.completion",
"created": 1692741891,
"model": "gpt-3.5-turbo-0613",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "I apologize, but as an AI, I do not have the capability to provide real-time weather updates. However, you can easily check the current weather in San Francisco by using a search engine or checking a weather website or app."
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 16,
"completion_tokens": 46,
"total_tokens": 62
}
}
```
#### How do fallbacks work
When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable.
#### Key components of Model Fallbacks implementation:
* Looping through `fallbacks`
* Cool-Downs for rate-limited models
#### Looping through `fallbacks`
Allow `45` seconds for each request. Within that window, this function first calls the primary model set as `model`; if it fails, it loops through the backup `fallbacks` models, attempting to get a response within the allocated `45s` set here:
```python
while response == None and time.time() - start_time < 45:
for model in fallbacks:
```
#### Cool-Downs for rate-limited models
If a model API call leads to an error, allow it to cool down for `60s`:
```python
except Exception as e:
print(f"got exception {e} for model {model}")
rate_limited_models.add(model)
model_expiration_times[model] = (
time.time() + 60
) # cool down this selected model
pass
```
Before making an LLM API call, we check if the selected model is in `rate_limited_models`; if so, we skip making the API call:
```python
if (
model in rate_limited_models
): # check if model is currently cooling down
if (
model_expiration_times.get(model)
and time.time() >= model_expiration_times[model]
):
rate_limited_models.remove(
model
) # check if it's been 60s of cool down and remove model
else:
continue # skip model
```
#### Full code of `completion_with_fallbacks()`
```python
response = None
rate_limited_models = set()
model_expiration_times = {}
start_time = time.time()
fallbacks = [kwargs["model"]] + kwargs["fallbacks"]
del kwargs["fallbacks"] # remove fallbacks so it's not recursive
while response == None and time.time() - start_time < 45:
for model in fallbacks:
# loop thru all models
try:
if (
model in rate_limited_models
): # check if model is currently cooling down
if (
model_expiration_times.get(model)
and time.time() >= model_expiration_times[model]
):
rate_limited_models.remove(
model
) # check if it's been 60s of cool down and remove model
else:
continue # skip model
# delete model from kwargs if it exists
if kwargs.get("model"):
del kwargs["model"]
print("making completion call", model)
response = litellm.completion(**kwargs, model=model)
if response != None:
return response
except Exception as e:
print(f"got exception {e} for model {model}")
rate_limited_models.add(model)
model_expiration_times[model] = (
time.time() + 60
) # cool down this selected model
pass
return response
```
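
Two design points worth noting in the code above: the `fallbacks` list is prepended with the primary `model`, so the primary model gets the first attempt on every pass of the loop, and because `rate_limited_models` and `model_expiration_times` are local variables, the cooldown state resets on each top-level call.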


@@ -36,7 +36,6 @@ const sidebars = {
      "completion/message_trimming",
      "completion/function_call",
      "completion/model_alias",
      "completion/config",
      "completion/batching",
      "completion/mock_requests",


@@ -1149,7 +1149,7 @@ def test_completion_with_fallbacks():
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

test_completion_with_fallbacks()

def test_completion_anyscale_api():
    try:
        # litellm.set_verbose=True