diff --git a/docs/my-website/docs/proxy/customer_routing.md b/docs/my-website/docs/proxy/customer_routing.md index cf4105c2f2..9bba5e7235 100644 --- a/docs/my-website/docs/proxy/customer_routing.md +++ b/docs/my-website/docs/proxy/customer_routing.md @@ -1,4 +1,11 @@ -# Region-based Routing +# [DEPRECATED] Region-based Routing + +:::info + +This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead + +::: + Route specific customers to eu-only models. diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md index dc57240666..fd95b57c1b 100644 --- a/docs/my-website/docs/proxy/load_balancing.md +++ b/docs/my-website/docs/proxy/load_balancing.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Proxy - Load Balancing Load balance multiple instances of the same model @@ -10,6 +13,159 @@ For more details on routing strategies / params, see [Routing](../routing.md) ::: +## Quick Start - Load Balancing +#### Step 1 - Set deployments on config + +**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo` +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/ + api_base: + api_key: + rpm: 6 # Rate limit for this deployment: in requests per minute (rpm) + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-ca + api_base: https://my-endpoint-canada-berri992.openai.azure.com/ + api_key: + rpm: 6 + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-large + api_base: https://openai-france-1234.openai.azure.com/ + api_key: + rpm: 1440 + +router_settings: + routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` + num_retries: 2 + timeout: 30 # 30 seconds + redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis + redis_password: + redis_port: 1992 +``` + +:::info +Detailed information about [routing strategies can be found here](../routing) +::: + +#### Step 2: Start Proxy with config + +```shell +$ litellm --config /path/to/config.yaml +``` + +### Test - Simple Call + +Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo + +👉 Key Change: `model="gpt-3.5-turbo"` + +**Check the `model_id` in Response Headers to make sure the requests are being load balanced** + + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ] +) + +print(response) +``` + + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] +}' +``` + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.schema import HumanMessage, SystemMessage +import os + +os.environ["OPENAI_API_KEY"] = "anything" + +chat = ChatOpenAI( + 
openai_api_base="http://0.0.0.0:4000", + model="gpt-3.5-turbo", +) + +messages = [ + SystemMessage( + content="You are a helpful assistant that im using to make a test request to." + ), + HumanMessage( + content="test from litellm. tell me why it's amazing in 1 sentence" + ), +] +response = chat(messages) + +print(response) +``` + + + + + + +### Test - Loadbalancing + +In this request, the following will occur: +1. A rate limit exception will be raised +2. LiteLLM proxy will retry the request on the model group (default is 3). + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "Hi there!"} + ], + "mock_testing_rate_limit_error": true +}' +``` + +[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535) + + ## Load Balancing using multiple litellm instances (Kubernetes, Auto Scaling) LiteLLM Proxy supports sharing rpm/tpm shared across multiple litellm instances, pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage ) diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index 9f1c1c8bb9..489f4e2ef1 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -2,15 +2,61 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Proxy - Fallbacks, Retries +# Fallbacks -- Quick Start [load balancing](#test---load-balancing) -- Quick Start [client side fallbacks](#test---client-side-fallbacks) +If a call fails after num_retries, fallback to another model group. + +- Quick Start [load balancing](./load_balancing.md) +- Quick Start [client side fallbacks](#client-side-fallbacks) + + +Fallbacks are typically done from one `model_name` to another `model_name`. + +## Quick Start + +### 1. Setup fallbacks + +Key change: + +```python +fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}] +``` + + + + +```python +from litellm import Router +router = Router( + model_list=[ + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "azure/", + "api_base": "", + "api_key": "", + "rpm": 6 + } + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "azure/gpt-4-ca", + "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/", + "api_key": "", + "rpm": 6 + } + } + ], + fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}] # 👈 KEY CHANGE +) + +``` + + + -## Quick Start - Load Balancing -#### Step 1 - Set deployments on config -**Example config below**. 
Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo` ```yaml model_list: - model_name: gpt-3.5-turbo @@ -19,147 +65,93 @@ model_list: api_base: api_key: rpm: 6 # Rate limit for this deployment: in requests per minute (rpm) - - model_name: gpt-3.5-turbo + - model_name: gpt-4 litellm_params: - model: azure/gpt-turbo-small-ca + model: azure/gpt-4-ca api_base: https://my-endpoint-canada-berri992.openai.azure.com/ api_key: rpm: 6 - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/gpt-turbo-large - api_base: https://openai-france-1234.openai.azure.com/ - api_key: - rpm: 1440 router_settings: - routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` - num_retries: 2 - timeout: 30 # 30 seconds - redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis - redis_password: - redis_port: 1992 + fallbacks: [{"gpt-3.5-turbo": ["gpt-4"]}] ``` -:::info -Detailed information about [routing strategies can be found here](../routing) -::: - -#### Step 2: Start Proxy with config - -```shell -$ litellm --config /path/to/config.yaml -``` - -### Test - Simple Call - -Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo - -👉 Key Change: `model="gpt-3.5-turbo"` - -**Check the `model_id` in Response Headers to make sure the requests are being load balanced** - - - - - -```python -import openai -client = openai.OpenAI( - api_key="anything", - base_url="http://0.0.0.0:4000" -) - -response = client.chat.completions.create( - model="gpt-3.5-turbo", - messages = [ - { - "role": "user", - "content": "this is a test request, write a short poem" - } - ] -) - -print(response) -``` - - - - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "gpt-3.5-turbo", - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ] -}' -``` - - - -```python -from langchain.chat_models import ChatOpenAI -from langchain.prompts.chat import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, - SystemMessagePromptTemplate, -) -from langchain.schema import HumanMessage, SystemMessage -import os - -os.environ["OPENAI_API_KEY"] = "anything" - -chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:4000", - model="gpt-3.5-turbo", -) - -messages = [ - SystemMessage( - content="You are a helpful assistant that im using to make a test request to." - ), - HumanMessage( - content="test from litellm. tell me why it's amazing in 1 sentence" - ), -] -response = chat(messages) - -print(response) -``` - -### Test - Loadbalancing +### 2. Start Proxy -In this request, the following will occur: -1. A rate limit exception will be raised -2. LiteLLM proxy will retry the request on the model group (default is 3). +```bash +litellm --config /path/to/config.yaml +``` + +### 3. Test Fallbacks + +Pass `mock_testing_fallbacks=true` in request body, to trigger fallbacks. + + + + + +```python + +from litellm import Router + +model_list = [{..}, {..}] # defined in Step 1. 
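+# "bad-model" / "my-good-model" are illustrative model group names for this sketch
+# mock_testing_fallbacks=True below forces the primary call to fail so the fallback path runs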
+ +router = Router(model_list=model_list, fallbacks=[{"bad-model": ["my-good-model"]}]) + +response = router.completion( + model="bad-model", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_testing_fallbacks=True, +) +``` + + + ```bash curl -X POST 'http://0.0.0.0:4000/chat/completions' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ --d '{ - "model": "gpt-3.5-turbo", +-D '{ + "model": "my-bad-model", "messages": [ - {"role": "user", "content": "Hi there!"} - ], - "mock_testing_rate_limit_error": true -}' + { + "role": "user", + "content": "ping" + } + ], + "mock_testing_fallbacks": true # 👈 KEY CHANGE +} +' ``` -[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535) + + + + + + +### Explanation + +Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. + +You can also set [`default_fallbacks`](#default-fallbacks), in case a specific model group is misconfigured / bad. + +There are 3 types of fallbacks: +- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54) +- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469) +- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError + + +## Client Side Fallbacks + +Set fallbacks in the `.completion()` call for SDK and client-side for proxy. -### Test - Client Side Fallbacks In this request the following will occur: 1. The request to `model="zephyr-beta"` will fail 2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]` @@ -168,7 +160,32 @@ In this request the following will occur: 👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]` + +```python +from litellm import Router + +router = Router(model_list=[..]) # defined in Step 1. + +resp = router.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_testing_fallbacks=True, # 👈 trigger fallbacks + fallbacks=[ + { + "model": "claude-3-haiku", + "messages": [{"role": "user", "content": "What is LiteLLM?"}], + } + ], +) + +print(resp) +``` + + + + + ```python @@ -197,8 +214,6 @@ print(response) -Pass `metadata` as part of the request body - ```shell curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ @@ -252,24 +267,282 @@ print(response) + + +### Control Fallback Prompts - +curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hi, how are you ?" 
+ } + ] + } + ], + "fallbacks": [{ + "model": "claude-3-haiku", + "messages": [{"role": "user", "content": "What is LiteLLM?"}] + }], + "mock_testing_fallbacks": true +}' +``` + + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.schema import HumanMessage, SystemMessage +import os + +os.environ["OPENAI_API_KEY"] = "anything" + +chat = ChatOpenAI( + openai_api_base="http://0.0.0.0:4000", + model="zephyr-beta", + extra_body={ + "fallbacks": [{ + "model": "claude-3-haiku", + "messages": [{"role": "user", "content": "What is LiteLLM?"}] + }] + } +) + +messages = [ + SystemMessage( + content="You are a helpful assistant that im using to make a test request to." + ), + HumanMessage( + content="test from litellm. tell me why it's amazing in 1 sentence" + ), +] +response = chat(messages) + +print(response) +``` + + + + + + + + +## Content Policy Violation Fallback + +Key change: + +```python +content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] +``` + + + + +```python +from litellm import Router + +router = Router( + model_list=[ + { + "model_name": "claude-2", + "litellm_params": { + "model": "claude-2", + "api_key": "", + "mock_response": Exception("content filtering policy"), + }, + }, + { + "model_name": "my-fallback-model", + "litellm_params": { + "model": "claude-2", + "api_key": "", + "mock_response": "This works!", + }, + }, + ], + content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE + # fallbacks=[..], # [OPTIONAL] + # context_window_fallbacks=[..], # [OPTIONAL] +) + +response = router.completion( + model="claude-2", + messages=[{"role": "user", "content": "Hey, how's it going?"}], +) +``` + + + +In your proxy config.yaml just add this line 👇 + +```yaml +router_settings: + content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] +``` + +Start proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + + +## Context Window Exceeded Fallback + +Key change: + +```python +context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}] +``` + + + + +```python +from litellm import Router + +router = Router( + model_list=[ + { + "model_name": "claude-2", + "litellm_params": { + "model": "claude-2", + "api_key": "", + "mock_response": Exception("prompt is too long"), + }, + }, + { + "model_name": "my-fallback-model", + "litellm_params": { + "model": "claude-2", + "api_key": "", + "mock_response": "This works!", + }, + }, + ], + context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE + # fallbacks=[..], # [OPTIONAL] + # content_policy_fallbacks=[..], # [OPTIONAL] +) + +response = router.completion( + model="claude-2", + messages=[{"role": "user", "content": "Hey, how's it going?"}], +) +``` + + + +In your proxy config.yaml just add this line 👇 + +```yaml +router_settings: + context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}] +``` + +Start proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + ## Advanced ### Fallbacks + Retries + Timeouts + Cooldowns @@ -684,81 +957,6 @@ print(response) print(f"response.headers.get('x-litellm-model-api-base')") ``` -### Custom Timeouts, Stream Timeouts - Per Model -For each model you can set `timeout` & `stream_timeout` under `litellm_params` -```yaml -model_list: - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/gpt-turbo-small-eu - 
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ - api_key: - timeout: 0.1 # timeout in (seconds) - stream_timeout: 0.01 # timeout for stream requests (seconds) - max_retries: 5 - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/gpt-turbo-small-ca - api_base: https://my-endpoint-canada-berri992.openai.azure.com/ - api_key: - timeout: 0.1 # timeout in (seconds) - stream_timeout: 0.01 # timeout for stream requests (seconds) - max_retries: 5 - -``` - -#### Start Proxy -```shell -$ litellm --config /path/to/config.yaml -``` - - -### Setting Dynamic Timeouts - Per Request - -LiteLLM Proxy supports setting a `timeout` per request - -**Example Usage** - - - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "model": "gpt-3.5-turbo", - "messages": [ - {"role": "user", "content": "what color is red"} - ], - "logit_bias": {12481: 100}, - "timeout": 1 - }' -``` - - - -```python -import openai - - -client = openai.OpenAI( - api_key="anything", - base_url="http://0.0.0.0:4000" -) - -response = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "user", "content": "what color is red"} - ], - logit_bias={12481: 100}, - timeout=1 -) - -print(response) -``` - - - ### Setting Fallbacks for Wildcard Models You can set fallbacks for wildcard models (e.g. `azure/*`) in your config file. diff --git a/docs/my-website/docs/proxy/team_based_routing.md b/docs/my-website/docs/proxy/team_based_routing.md index bda286f4ad..4230134dd6 100644 --- a/docs/my-website/docs/proxy/team_based_routing.md +++ b/docs/my-website/docs/proxy/team_based_routing.md @@ -1,4 +1,11 @@ -# Team-based Routing +# [DEPRECATED] Team-based Routing + +:::info + +This is deprecated, please use [Tag Based Routing](./tag_routing.md) instead + +::: + ## Routing Route calls to different model groups based on the team-id diff --git a/docs/my-website/docs/proxy/timeout.md b/docs/my-website/docs/proxy/timeout.md new file mode 100644 index 0000000000..bf9a35af0e --- /dev/null +++ b/docs/my-website/docs/proxy/timeout.md @@ -0,0 +1,178 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Timeouts + +The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. 
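+
+When that limit is hit, the call surfaces a `litellm.Timeout` error, which retries / fallbacks can then act on. A minimal sketch of catching it directly (assumes a valid `OPENAI_API_KEY`; the unrealistically low timeout is only there to force the error):
+
+```python
+import os
+import litellm
+from litellm import Router
+
+router = Router(
+    model_list=[
+        {
+            "model_name": "gpt-3.5-turbo",
+            "litellm_params": {
+                "model": "gpt-3.5-turbo",
+                "api_key": os.getenv("OPENAI_API_KEY"),
+                "timeout": 0.001,  # unrealistically low, just to force a timeout
+            },
+        }
+    ]
+)
+
+try:
+    router.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "ping"}],
+    )
+except litellm.Timeout:
+    # the timeout covers the entire call, so a slow request lands here
+    print("request timed out")
+```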
+ +### Global Timeouts + + + + +```python +from litellm import Router + +model_list = [{...}] + +router = Router(model_list=model_list, + timeout=30) # raise timeout error if call takes > 30s + +print(response) +``` + + + + +```yaml +router_settings: + timeout: 30 # sets a 30s timeout for the entire call +``` + +**Start Proxy** + +```shell +$ litellm --config /path/to/config.yaml +``` + + + + +### Custom Timeouts, Stream Timeouts - Per Model +For each model you can set `timeout` & `stream_timeout` under `litellm_params` + + + + +```python +from litellm import Router +import asyncio + +model_list = [{ + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "azure/chatgpt-v-2", + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "timeout": 300 # sets a 5 minute timeout + "stream_timeout": 30 # sets a 30s timeout for streaming calls + } +}] + +# init router +router = Router(model_list=model_list, routing_strategy="least-busy") +async def router_acompletion(): + response = await router.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}] + ) + print(response) + return response + +asyncio.run(router_acompletion()) +``` + + + + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ + api_key: + timeout: 0.1 # timeout in (seconds) + stream_timeout: 0.01 # timeout for stream requests (seconds) + max_retries: 5 + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-ca + api_base: https://my-endpoint-canada-berri992.openai.azure.com/ + api_key: + timeout: 0.1 # timeout in (seconds) + stream_timeout: 0.01 # timeout for stream requests (seconds) + max_retries: 5 + +``` + + +**Start Proxy** + +```shell +$ litellm --config /path/to/config.yaml +``` + + + + + + +### Setting Dynamic Timeouts - Per Request + +LiteLLM supports setting a `timeout` per request + +**Example Usage** + + + +```python +from litellm import Router + +model_list = [{...}] +router = Router(model_list=model_list) + +response = router.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "what color is red"}], + timeout=1 +) +``` + + + + + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "what color is red"} + ], + "logit_bias": {12481: 100}, + "timeout": 1 + }' +``` + + + +```python +import openai + + +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "what color is red"} + ], + logit_bias={12481: 100}, + timeout=1 +) + +print(response) +``` + + + + + \ No newline at end of file diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index c4b633a976..308b850e45 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -3,7 +3,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Router - Load Balancing, Fallbacks +# Router - Load Balancing LiteLLM manages: - Load-balance across multiple deployments (e.g. 
Azure/OpenAI) @@ -855,52 +855,6 @@ router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 [**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605) -### Timeouts - -The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. - -**Global Timeouts** -```python -from litellm import Router - -model_list = [{...}] - -router = Router(model_list=model_list, - timeout=30) # raise timeout error if call takes > 30s - -print(response) -``` - -**Timeouts per model** - -```python -from litellm import Router -import asyncio - -model_list = [{ - "model_name": "gpt-3.5-turbo", - "litellm_params": { - "model": "azure/chatgpt-v-2", - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - "timeout": 300 # sets a 5 minute timeout - "stream_timeout": 30 # sets a 30s timeout for streaming calls - } -}] - -# init router -router = Router(model_list=model_list, routing_strategy="least-busy") -async def router_acompletion(): - response = await router.acompletion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hey, how's it going?"}] - ) - print(response) - return response - -asyncio.run(router_acompletion()) -``` ### Cooldowns Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. @@ -1125,248 +1079,6 @@ router_settings: - -### Fallbacks - -If a call fails after num_retries, fall back to another model group. - -#### Quick Start - -```python -from litellm import Router -router = Router( - model_list=[ - { # bad model - "model_name": "bad-model", - "litellm_params": { - "model": "openai/my-bad-model", - "api_key": "my-bad-api-key", - "mock_response": "Bad call" - }, - }, - { # good model - "model_name": "my-good-model", - "litellm_params": { - "model": "gpt-4o", - "api_key": os.getenv("OPENAI_API_KEY"), - "mock_response": "Good call" - }, - }, - ], - fallbacks=[{"bad-model": ["my-good-model"]}] # 👈 KEY CHANGE -) - -response = router.completion( - model="bad-model", - messages=[{"role": "user", "content": "Hey, how's it going?"}], - mock_testing_fallbacks=True, -) -``` - -If the error is a context window exceeded error, fall back to a larger model group (if given). - -Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. - -You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad. - -There are 3 types of fallbacks: -- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54) -- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469) -- `fallbacks`: For all remaining errors - e.g. 
litellm.RateLimitError - -**Content Policy Violation Fallback** - -Key change: - -```python -content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - - - - -```python -from litellm import Router - -router = Router( - model_list=[ - { - "model_name": "claude-2", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": Exception("content filtering policy"), - }, - }, - { - "model_name": "my-fallback-model", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": "This works!", - }, - }, - ], - content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE - # fallbacks=[..], # [OPTIONAL] - # context_window_fallbacks=[..], # [OPTIONAL] -) - -response = router.completion( - model="claude-2", - messages=[{"role": "user", "content": "Hey, how's it going?"}], -) -``` - - - -In your proxy config.yaml just add this line 👇 - -```yaml -router_settings: - content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - -Start proxy - -```bash -litellm --config /path/to/config.yaml - -# RUNNING on http://0.0.0.0:4000 -``` - - - - -**Context Window Exceeded Fallback** - -Key change: - -```python -context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - - - - -```python -from litellm import Router - -router = Router( - model_list=[ - { - "model_name": "claude-2", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": Exception("prompt is too long"), - }, - }, - { - "model_name": "my-fallback-model", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": "This works!", - }, - }, - ], - context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE - # fallbacks=[..], # [OPTIONAL] - # content_policy_fallbacks=[..], # [OPTIONAL] -) - -response = router.completion( - model="claude-2", - messages=[{"role": "user", "content": "Hey, how's it going?"}], -) -``` - - - -In your proxy config.yaml just add this line 👇 - -```yaml -router_settings: - context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - -Start proxy - -```bash -litellm --config /path/to/config.yaml - -# RUNNING on http://0.0.0.0:4000 -``` - - - - -**Regular Fallbacks** - -Key change: - -```python -fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - - - - -```python -from litellm import Router - -router = Router( - model_list=[ - { - "model_name": "claude-2", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": Exception("this is a rate limit error"), - }, - }, - { - "model_name": "my-fallback-model", - "litellm_params": { - "model": "claude-2", - "api_key": "", - "mock_response": "This works!", - }, - }, - ], - fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE - # context_window_fallbacks=[..], # [OPTIONAL] - # content_policy_fallbacks=[..], # [OPTIONAL] -) - -response = router.completion( - model="claude-2", - messages=[{"role": "user", "content": "Hey, how's it going?"}], -) -``` - - - -In your proxy config.yaml just add this line 👇 - -```yaml -router_settings: - fallbacks=[{"claude-2": ["my-fallback-model"]}] -``` - -Start proxy - -```bash -litellm --config /path/to/config.yaml - -# RUNNING on http://0.0.0.0:4000 -``` - - - - - ### Caching In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. 
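+
+As a rough sketch of the two modes, using `cache_responses` plus the Router's `redis_host` / `redis_port` / `redis_password` kwargs (the connection values below are illustrative env-var placeholders):
+
+```python
+import os
+from litellm import Router
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo",
+        "litellm_params": {
+            "model": "gpt-3.5-turbo",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+    }
+]
+
+# quick local testing: in-memory response cache
+router = Router(model_list=model_list, cache_responses=True)
+
+# production: shared Redis cache, so every router / proxy instance sees the same entries
+router = Router(
+    model_list=model_list,
+    redis_host=os.getenv("REDIS_HOST"),
+    redis_port=int(os.getenv("REDIS_PORT", "6379")),
+    redis_password=os.getenv("REDIS_PASSWORD"),
+    cache_responses=True,
+)
+```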
@@ -1808,48 +1520,6 @@ response = router.completion( If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model) -## Init Params for the litellm.Router - -```python -def __init__( - model_list: Optional[list] = None, - - ## CACHING ## - redis_url: Optional[str] = None, - redis_host: Optional[str] = None, - redis_port: Optional[int] = None, - redis_password: Optional[str] = None, - cache_responses: Optional[bool] = False, - cache_kwargs: dict = {}, # additional kwargs to pass to RedisCache (see caching.py) - caching_groups: Optional[ - List[tuple] - ] = None, # if you want to cache across model groups - client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds - - ## RELIABILITY ## - num_retries: int = 0, - timeout: Optional[float] = None, - default_litellm_params={}, # default params for Router.chat.completion.create - fallbacks: Optional[List] = None, - default_fallbacks: Optional[List] = None - allowed_fails: Optional[int] = None, # Number of times a deployment can failbefore being added to cooldown - cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure - context_window_fallbacks: Optional[List] = None, - model_group_alias: Optional[dict] = {}, - retry_after: int = 0, # (min) time to wait before retrying a failed request - routing_strategy: Literal[ - "simple-shuffle", - "least-busy", - "usage-based-routing", - "latency-based-routing", - "cost-based-routing", - ] = "simple-shuffle", - - ## DEBUGGING ## - set_verbose: bool = False, # set this to True for seeing logs - debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging -): -``` ## Debugging Router ### Basic Debugging diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 34701fa324..e4547a4634 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -290,7 +290,7 @@ const sidebars = { description: "Learn how to load balance, route, and set fallbacks for your LLM requests", slug: "/routing-load-balancing", }, - items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"], + items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/timeout", "proxy/tag_routing", "proxy/provider_budget_routing", "wildcard_routing"], }, { type: "category", @@ -395,6 +395,8 @@ const sidebars = { "proxy/pii_masking", "extras/code_quality", "rules", + "proxy/team_based_routing", + "proxy/customer_routing", "proxy_server", { type: "category", diff --git a/litellm/router.py b/litellm/router.py index 541135a882..a0337f2bb7 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -69,6 +69,7 @@ from litellm.router_utils.cooldown_handlers import ( _set_cooldown_deployments, ) from litellm.router_utils.fallback_event_handlers import ( + _check_non_standard_fallback_format, get_fallback_model_group, run_async_fallback, ) @@ -2647,6 +2648,27 @@ class Router: try: verbose_router_logger.info("Trying to fallback b/w models") + + # check if client-side fallbacks are used (e.g. 
fallbacks = ["gpt-3.5-turbo", "claude-3-haiku"] or fallbacks=[{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}] + is_non_standard_fallback_format = _check_non_standard_fallback_format( + fallbacks=fallbacks + ) + + if is_non_standard_fallback_format: + input_kwargs.update( + { + "fallback_model_group": fallbacks, + "original_model_group": original_model_group, + } + ) + + response = await run_async_fallback( + *args, + **input_kwargs, + ) + + return response + if isinstance(e, litellm.ContextWindowExceededError): if context_window_fallbacks is not None: fallback_model_group: Optional[List[str]] = ( @@ -2722,7 +2744,7 @@ class Router: verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}") fallback_model_group, generic_fallback_idx = ( get_fallback_model_group( - fallbacks=fallbacks, + fallbacks=fallbacks, # if fallbacks = [{"gpt-3.5-turbo": ["claude-3-haiku"]}] model_group=cast(str, model_group), ) ) diff --git a/litellm/router_utils/fallback_event_handlers.py b/litellm/router_utils/fallback_event_handlers.py index 84c3d76285..5bc5b82e0a 100644 --- a/litellm/router_utils/fallback_event_handlers.py +++ b/litellm/router_utils/fallback_event_handlers.py @@ -1,9 +1,10 @@ from enum import Enum -from typing import TYPE_CHECKING, Any, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import litellm from litellm._logging import verbose_router_logger from litellm.integrations.custom_logger import CustomLogger +from litellm.types.router import LiteLLMParamsTypedDict if TYPE_CHECKING: from litellm.router import Router as _Router @@ -67,7 +68,7 @@ def get_fallback_model_group( elif list(item.keys())[0] == "*": # check generic fallback generic_fallback_idx = idx elif isinstance(item, str): - fallback_model_group = [fallbacks.pop(idx)] + fallback_model_group = [fallbacks.pop(idx)] # returns single-item list ## if none, check for generic fallback if fallback_model_group is None: if stripped_model_fallback is not None: @@ -122,9 +123,12 @@ async def run_async_fallback( # LOGGING kwargs = litellm_router.log_retry(kwargs=kwargs, e=original_exception) verbose_router_logger.info(f"Falling back to model_group = {mg}") - kwargs["model"] = mg + if isinstance(mg, str): + kwargs["model"] = mg + elif isinstance(mg, dict): + kwargs.update(mg) kwargs.setdefault("metadata", {}).update( - {"model_group": mg} + {"model_group": kwargs.get("model", None)} ) # update model_group used, if fallbacks are done kwargs["fallback_depth"] = fallback_depth + 1 kwargs["max_fallbacks"] = max_fallbacks @@ -310,3 +314,31 @@ async def log_failure_fallback_event( verbose_router_logger.error( f"Error in log_failure_fallback_event: {str(e)}" ) + + +def _check_non_standard_fallback_format(fallbacks: Optional[List[Any]]) -> bool: + """ + Checks if the fallbacks list is a list of strings or a list of dictionaries. + + If + - List[str]: e.g. ["claude-3-haiku", "openai/o-1"] + - List[Dict[, Any]]: e.g. [{"model": "claude-3-haiku", "messages": [{"role": "user", "content": "Hey, how's it going?"}]}] + + If [{"gpt-3.5-turbo": ["claude-3-haiku"]}] then standard format. 
+ """ + if fallbacks is None or not isinstance(fallbacks, list) or len(fallbacks) == 0: + return False + if all(isinstance(item, str) for item in fallbacks): + return True + elif all(isinstance(item, dict) for item in fallbacks): + for key in LiteLLMParamsTypedDict.__annotations__.keys(): + if key in fallbacks[0].keys(): + return True + + return False + + +def run_non_standard_fallback_format( + fallbacks: Union[List[str], List[Dict[str, Any]]], model_group: str +): + pass diff --git a/tests/local_testing/test_audio_speech.py b/tests/local_testing/test_audio_speech.py index cef9d354d5..9e60add17a 100644 --- a/tests/local_testing/test_audio_speech.py +++ b/tests/local_testing/test_audio_speech.py @@ -281,6 +281,7 @@ async def test_speech_litellm_vertex_async_with_voice_ssml(): } +@pytest.mark.skip(reason="causes openai rate limit errors") def test_audio_speech_cost_calc(): from litellm.integrations.custom_logger import CustomLogger diff --git a/tests/local_testing/test_router_fallbacks.py b/tests/local_testing/test_router_fallbacks.py index 41b2b9c9cd..09bb32365f 100644 --- a/tests/local_testing/test_router_fallbacks.py +++ b/tests/local_testing/test_router_fallbacks.py @@ -1567,3 +1567,38 @@ def test_get_fallback_model_group(): } fallback_model_group, _ = get_fallback_model_group(**args) assert fallback_model_group == ["claude-3-haiku"] + + +def test_fallbacks_with_different_messages(): + router = Router( + model_list=[ + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "gpt-3.5-turbo", + "api_key": os.getenv("OPENAI_API_KEY"), + }, + }, + { + "model_name": "claude-3-haiku", + "litellm_params": { + "model": "claude-3-haiku-20240307", + "api_key": os.getenv("ANTHROPIC_API_KEY"), + }, + }, + ], + ) + + resp = router.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_testing_fallbacks=True, + fallbacks=[ + { + "model": "claude-3-haiku", + "messages": [{"role": "user", "content": "Hey, how's it going?"}], + } + ], + ) + + print(resp) diff --git a/tests/local_testing/test_streaming.py b/tests/local_testing/test_streaming.py index 67a0400283..eaf62cd78b 100644 --- a/tests/local_testing/test_streaming.py +++ b/tests/local_testing/test_streaming.py @@ -3922,7 +3922,7 @@ def test_unit_test_perplexity_citations_chunk(): ], ) @pytest.mark.flaky(retries=3, delay=1) -def test_streaming_tool_calls_valid_json_str(model): +def test_aastreaming_tool_calls_valid_json_str(model): if "vertex_ai" in model: from test_amazing_vertex_completion import ( load_vertex_ai_credentials,