diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md
index 082b82d4f..6f6cfb1e2 100644
--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@@ -84,8 +84,11 @@ def completion(
     api_base: Optional[str] = None,
     api_version: Optional[str] = None,
     api_key: Optional[str] = None,
+    num_retries: Optional[int] = None, # set to retry a model if an APIError, TimeoutError, or ServiceUnavailableError occurs
+    context_window_fallback_dict: Optional[dict] = None, # mapping of model to fallback model, used if the call fails due to a context window error
     fallbacks: Optional[list] = None, # pass in a list of api_base,keys, etc.
     metadata: Optional[dict] = None # additional call metadata, passed to logging integrations / custom callbacks
+    **kwargs,
 ) -> ModelResponse:
@@ -143,10 +146,16 @@ def completion(
 - `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds)
 
+#### litellm-specific params
+
 - `api_base`: *string (optional)* - The api endpoint you want to call the model with
 
 - `api_version`: *string (optional)* - (Azure-specific) the api version for the call
 
+- `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError, or ServiceUnavailableError occurs
+
+- `context_window_fallback_dict`: *dict (optional)* - A mapping of model to fallback model, used if the call fails due to a context window error
+
 - `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails
 
 - `metadata`: *dict (optional)* - Any additional data you want to be logged when the call is made (sent to logging integrations, eg. promptlayer and accessible via custom callback function)
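+
+Here's a minimal sketch of these litellm-specific params used together (the model names and the `metadata` value are placeholders, not recommendations):
+
+```python
+from litellm import completion
+
+messages = [{"content": "Hey, how's it going?", "role": "user"}]
+
+response = completion(
+    model="gpt-3.5-turbo",
+    messages=messages,
+    num_retries=2,  # retry on APIError, TimeoutError, ServiceUnavailableError
+    context_window_fallback_dict={"gpt-3.5-turbo": "gpt-3.5-turbo-16k"},  # swap to the larger model on context window errors
+    metadata={"run_id": "docs-example"},  # hypothetical value, forwarded to logging integrations
+)
+```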
diff --git a/docs/my-website/docs/completion/reliable_completions.md b/docs/my-website/docs/completion/reliable_completions.md
index 2b340a5d0..eb64eaf91 100644
--- a/docs/my-website/docs/completion/reliable_completions.md
+++ b/docs/my-website/docs/completion/reliable_completions.md
@@ -1,62 +1,53 @@
 # Reliability
+
+LiteLLM helps prevent failed requests in 2 ways:
+- Retries
+- Fallbacks: Context Window + General
+
 ## Helper utils
 LiteLLM supports the following functions for reliability:
 * `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents
-* `completion_with_retries`: use tenacity retries
+* `num_retries`: retry failed requests using tenacity
 * `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
 
-## Context Window Errors
-
-```python
-from litellm import longer_context_model_fallback_dict, ContextWindowExceededError
-
-sample_text = "how does a court case get to the Supreme Court?" * 1000
-messages = [{"content": user_message, "role": "user"}]
-model = "gpt-3.5-turbo"
-try:
-    # try the original model
-    response = completion(model=model, messages=messages)
-# catch the context window error
-except ContextWindowExceededError as e:
-    if model in longer_context_model_fallback_dict:
-        # switch to the equivalent larger model -> gpt.3.5-turbo-16k
-        new_model = longer_context_model_fallback_dict[model]
-        response = completion(new_model, messages)
-
-print(response)
-```
-
-
 ## Retry failed requests
-You can use this as a drop-in replacement for the `completion()` function to use tenacity retries - by default we retry the call 3 times.
+Call it in `completion` like this: `completion(..., num_retries=2)`.
+
+Here's a quick look at how you can use it:
 
 ```python
-from litellm import completion_with_retries
+from litellm import completion
 
 user_message = "Hello, whats the weather in San Francisco??"
 messages = [{"content": user_message, "role": "user"}]
 
 # normal call
-def test_completion_custom_provider_model_name():
-    try:
-        response = completion_with_retries(
+response = completion(
             model="gpt-3.5-turbo",
             messages=messages,
+            num_retries=2
         )
-        # Add any assertions here to check the response
-        print(response)
-    except Exception as e:
-        printf"Error occurred: {e}")
 ```
-## Switch Models/API Keys/API Bases
+## Fallbacks
+
+### Context Window Fallbacks
+```python
+from litellm import completion
+
+ctx_window_fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
+messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
+
+completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+```
+
+### Fallbacks - Switch Models/API Keys/API Bases
 
 LLM APIs can be unstable, completion() with fallbacks ensures you'll always get a response from your calls
 
-### Usage
+#### Usage
 To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter.
 
 The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response.
@@ -76,6 +67,11 @@
 response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
             fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
 ```
+[Check out this section for implementation details](#fallbacks-1)
+
+## Implementation Details
+
+### Fallbacks
 #### Output from calls
 ```
 Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model'
@@ -112,7 +108,7 @@ completion call gpt-3.5-turbo
 
 When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable.
 
-### Key components of Model Fallbacks implementation:
+#### Key components of Model Fallbacks implementation:
 * Looping through `fallbacks`
 * Cool-Downs for rate-limited models
diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index d91009986..391b20b2f 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -1,17 +1,78 @@
-# Reliability - Fallbacks, Multiple Deployments
+# Reliability - Fallbacks, Azure Deployments, etc.
 
-## Model Fallbacks
-Never fail a request using LiteLLM, LiteLLM allows you to define fallback models for completion requests
-```python
+LiteLLM helps prevent failed requests in 3 ways:
+- Retries
+- Fallbacks: Context Window + General
+- RateLimitManager
+
+## Helper utils
+LiteLLM supports the following functions for reliability:
+* `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents
+* `num_retries`: retry failed requests using tenacity
+* `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
+* `router()`: An abstraction on top of completion + embeddings to route the request to a deployment with capacity (available tpm/rpm) - see the sketch below.
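+
+A minimal sketch of that routing flow (the deployment names and keys here are placeholders - the full setup is covered in [Manage Multiple Deployments](#manage-multiple-deployments) below):
+
+```python
+from litellm import Router
+
+# two hypothetical deployments that can serve the same model group
+model_list = [{
+    "model_name": "gpt-3.5-turbo",  # alias used at request time
+    "litellm_params": {
+        "model": "azure/chatgpt-v-2",  # placeholder azure deployment name
+        "api_key": "your-azure-api-key",
+        "api_base": "your-azure-api-base",
+    },
+    "tpm": 240000,
+    "rpm": 1800,
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "your-openai-api-key"},
+    "tpm": 1000000,
+    "rpm": 9000,
+}]
+
+router = Router(model_list=model_list)
+
+# the router picks whichever deployment has available tpm/rpm capacity
+response = router.completion(model="gpt-3.5-turbo",
+                             messages=[{"role": "user", "content": "Hey, how's it going?"}])
+print(response)
+```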
+
+## Retry failed requests
+
+Call it in `completion` like this: `completion(..., num_retries=2)`.
+
+Here's a quick look at how you can use it:
+
+```python
 from litellm import completion
-# if gpt-4 fails, retry the request with gpt-3.5-turbo->command-nightly->claude-instant-1
-response = completion(model="gpt-4",messages=messages, fallbacks=["gpt-3.5-turbo" "command-nightly", "claude-instant-1"])
-# if azure/gpt-4 fails, retry the request with fallback api_keys/api_base
-response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
+
+user_message = "Hello, whats the weather in San Francisco??"
+messages = [{"content": user_message, "role": "user"}]
+
+# normal call
+response = completion(
+    model="gpt-3.5-turbo",
+    messages=messages,
+    num_retries=2
+)
 ```
+
+## Fallbacks
+
+### Context Window Fallbacks
+```python
+from litellm import completion
+
+ctx_window_fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
+messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
+
+completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+```
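+
+If you'd rather handle this case yourself, here's a sketch of the manual equivalent using `litellm.longer_context_model_fallback_dict` (it mirrors what `context_window_fallback_dict` does for you automatically):
+
+```python
+from litellm import completion, longer_context_model_fallback_dict, ContextWindowExceededError
+
+messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
+model = "gpt-3.5-turbo"
+
+try:
+    # try the original model first
+    response = completion(model=model, messages=messages)
+except ContextWindowExceededError:
+    # look up the equivalent larger model, e.g. gpt-3.5-turbo -> gpt-3.5-turbo-16k
+    if model in longer_context_model_fallback_dict:
+        new_model = longer_context_model_fallback_dict[model]
+        response = completion(model=new_model, messages=messages)
+
+print(response)
+```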
+ }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 16, + "completion_tokens": 46, + "total_tokens": 62 + } +} + +``` + +#### How does fallbacks work + +When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable. + + +#### Key components of Model Fallbacks implementation: +* Looping through `fallbacks` +* Cool-Downs for rate-limited models + +#### Looping through `fallbacks` +Allow `45seconds` for each request. In the 45s this function tries calling the primary model set as `model`. If model fails it loops through the backup `fallbacks` models and attempts to get a response in the allocated `45s` time set here: +```python +while response == None and time.time() - start_time < 45: + for model in fallbacks: +``` + +#### Cool-Downs for rate-limited models +If a model API call leads to an error - allow it to cooldown for `60s` +```python +except Exception as e: + print(f"got exception {e} for model {model}") + rate_limited_models.add(model) + model_expiration_times[model] = ( + time.time() + 60 + ) # cool down this selected model + pass +``` + +Before making an LLM API call we check if the selected model is in `rate_limited_models`, if so skip making the API call +```python +if ( + model in rate_limited_models +): # check if model is currently cooling down + if ( + model_expiration_times.get(model) + and time.time() >= model_expiration_times[model] + ): + rate_limited_models.remove( + model + ) # check if it's been 60s of cool down and remove model + else: + continue # skip model + +``` + +#### Full code of completion with fallbacks() +```python + + response = None + rate_limited_models = set() + model_expiration_times = {} + start_time = time.time() + fallbacks = [kwargs["model"]] + kwargs["fallbacks"] + del kwargs["fallbacks"] # remove fallbacks so it's not recursive + + while response == None and time.time() - start_time < 45: + for model in fallbacks: + # loop thru all models + try: + if ( + model in rate_limited_models + ): # check if model is currently cooling down + if ( + model_expiration_times.get(model) + and time.time() >= model_expiration_times[model] + ): + rate_limited_models.remove( + model + ) # check if it's been 60s of cool down and remove model + else: + continue # skip model + + # delete model from kwargs if it exists + if kwargs.get("model"): + del kwargs["model"] + + print("making completion call", model) + response = litellm.completion(**kwargs, model=model) + + if response != None: + return response + + except Exception as e: + print(f"got exception {e} for model {model}") + rate_limited_models.add(model) + model_expiration_times[model] = ( + time.time() + 60 + ) # cool down this selected model + pass + return response ``` \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 5fb55c6b5..c5404f2c4 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -36,7 +36,6 @@ const sidebars = { "completion/message_trimming", "completion/function_call", "completion/model_alias", - "completion/reliable_completions", "completion/config", "completion/batching", "completion/mock_requests", diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 5c2b96704..3fbaa79d9 100644 --- 
+
+#### Full code of `completion()` with fallbacks
+```python
+
+    response = None
+    rate_limited_models = set()
+    model_expiration_times = {}
+    start_time = time.time()
+    fallbacks = [kwargs["model"]] + kwargs["fallbacks"]
+    del kwargs["fallbacks"]  # remove fallbacks so it's not recursive
+
+    while response == None and time.time() - start_time < 45:
+        for model in fallbacks:
+            # loop thru all models
+            try:
+                if (
+                    model in rate_limited_models
+                ):  # check if model is currently cooling down
+                    if (
+                        model_expiration_times.get(model)
+                        and time.time() >= model_expiration_times[model]
+                    ):
+                        rate_limited_models.remove(
+                            model
+                        )  # check if it's been 60s of cool down and remove model
+                    else:
+                        continue  # skip model
+
+                # delete model from kwargs if it exists
+                if kwargs.get("model"):
+                    del kwargs["model"]
+
+                print("making completion call", model)
+                response = litellm.completion(**kwargs, model=model)
+
+                if response != None:
+                    return response
+
+            except Exception as e:
+                print(f"got exception {e} for model {model}")
+                rate_limited_models.add(model)
+                model_expiration_times[model] = (
+                    time.time() + 60
+                )  # cool down this selected model
+                pass
+    return response
+```
\ No newline at end of file
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index 5fb55c6b5..c5404f2c4 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -36,7 +36,6 @@ const sidebars = {
       "completion/message_trimming",
       "completion/function_call",
       "completion/model_alias",
-      "completion/reliable_completions",
      "completion/config",
       "completion/batching",
       "completion/mock_requests",
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 5c2b96704..3fbaa79d9 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1149,7 +1149,7 @@ def test_completion_with_fallbacks():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-# test_completion_with_fallbacks()
+test_completion_with_fallbacks()
 
 def test_completion_anyscale_api():
     try:
         # litellm.set_verbose=True