diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index bd04216dd..e39a6765f 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -151,7 +151,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ }' ``` -## Advanced - Context Window Fallbacks +## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks) **Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**. @@ -232,16 +232,16 @@ model_list: - model_name: gpt-3.5-turbo-small litellm_params: model: azure/chatgpt-v-2 - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: "2023-07-01-preview" - model_info: - base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + model_info: + base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL - model_name: gpt-3.5-turbo-large litellm_params: - model: gpt-3.5-turbo-1106 - api_key: os.environ/OPENAI_API_KEY + model: gpt-3.5-turbo-1106 + api_key: os.environ/OPENAI_API_KEY - model_name: claude-opus litellm_params: @@ -287,6 +287,69 @@ print(response) +## Advanced - EU-Region Filtering (Pre-Call Checks) + +**Before call is made** filter out deployments outside of the EU-region with **`enable_pre_call_checks: true`**. + +Set 'region_name' of deployment. + +**Note:** LiteLLM can automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`. + +**1. Set Config** + +```yaml +router_settings: + enable_pre_call_checks: true # 1. 
Enable pre-call checks + +model_list: +- model_name: gpt-3.5-turbo + litellm_params: + model: azure/chatgpt-v-2 + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + region_name: "eu" # 👈 SET EU-REGION + +- model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo-1106 + api_key: os.environ/OPENAI_API_KEY + +- model_name: gemini-pro + litellm_params: + model: vertex_ai/gemini-pro-1.5 + vertex_project: adroit-crow-1234 + vertex_location: us-east1 # 👈 AUTOMATICALLY INFERS 'region_name' +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +**3. Test it!** + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.with_raw_response.create( + model="gpt-3.5-turbo", + messages = [{"role": "user", "content": "Who was Alexander?"}] +) + +print(response) + +print(response.headers.get('x-litellm-model-api-base')) +``` + ## Advanced - Custom Timeouts, Stream Timeouts - Per Model For each model you can set `timeout` & `stream_timeout` under `litellm_params` ```yaml diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 0b0c7713c..b1afad2fb 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -879,13 +879,11 @@ router = Router(model_list: Optional[list] = None, cache_responses=True) ``` -## Pre-Call Checks (Context Window) +## Pre-Call Checks (Context Window, EU-Regions) Enable pre-call checks to filter out: 1. deployments with context window limit < messages for a call. -2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[ - router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages - ])`) +2. 
deployments outside of eu-region @@ -900,10 +898,14 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t **2. Set Model List** -For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. +For context window checks on azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. - - +For 'eu-region' filtering, set 'region_name' of deployment. + +**Note:** We automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`. + + +[**See Code**](https://github.com/BerriAI/litellm/blob/d33e49411d6503cb634f9652873160cd534dec96/litellm/router.py#L2958) ```python model_list = [ @@ -914,10 +916,9 @@ model_list = [ "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), - }, - "model_info": { + "region_name": "eu", # 👈 SET 'EU' REGION NAME "base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL - } + }, }, { "model_name": "gpt-3.5-turbo", # model group name @@ -926,54 +927,26 @@ model_list = [ "api_key": os.getenv("OPENAI_API_KEY"), }, }, + { + "model_name": "gemini-pro", + "litellm_params": { + "model": "vertex_ai/gemini-pro-1.5", + "vertex_project": "adroit-crow-1234", + "vertex_location": "us-east1" # 👈 AUTOMATICALLY INFERS 'region_name' + } + } ] router = Router(model_list=model_list, enable_pre_call_checks=True) ``` - - - - -```python -model_list = [ - { - "model_name": "gpt-3.5-turbo-small", # model group name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": 
os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - }, - "model_info": { - "base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL - } - }, - { - "model_name": "gpt-3.5-turbo-large", # model group name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo-1106", - "api_key": os.getenv("OPENAI_API_KEY"), - }, - }, - { - "model_name": "claude-opus", - "litellm_params": { call - "model": "claude-3-opus-20240229", - "api_key": os.getenv("ANTHROPIC_API_KEY"), - }, - }, - ] - -router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) -``` - - - - **3. Test it!** + + + + ```python """ - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k) @@ -983,7 +956,6 @@ router = Router(model_list=model_list, enable_pre_call_checks=True, context_wind from litellm import Router import os -try: model_list = [ { "model_name": "gpt-3.5-turbo", # model group name @@ -992,6 +964,7 @@ model_list = [ "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), + "base_model": "azure/gpt-35-turbo", }, "model_info": { "base_model": "azure/gpt-35-turbo", @@ -1021,6 +994,59 @@ response = router.completion( print(f"response: {response}") ``` + + +```python +""" +- Give 2 gpt-3.5-turbo deployments, in eu + non-eu regions +- Make a call +- Assert it picks the eu-region model +""" + +from litellm import Router +import os + +model_list = [ + { + "model_name": "gpt-3.5-turbo", # model group name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "region_name": "eu" + }, + "model_info": { + "id": "1" + } + }, + { + "model_name": "gpt-3.5-turbo", # model 
group name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo-1106", + "api_key": os.getenv("OPENAI_API_KEY"), + }, + "model_info": { + "id": "2" + } + }, +] + +router = Router(model_list=model_list, enable_pre_call_checks=True) + +response = router.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Who was Alexander?"}], +) + +print(f"response: {response}") + +print(f"response id: {response._hidden_params['model_id']}") +``` + + + + :::info