diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 9735b539e..0022794c8 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -551,6 +551,95 @@ router = Router(model_list: Optional[list] = None,
              cache_responses=True)
 ```
 
+## Pre-Call Checks (Context Window)
+
+Enable pre-call checks to filter out deployments whose context window is smaller than the messages for a given call.
+
+**1. Enable pre-call checks**
+```python
+from litellm import Router
+# ...
+router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
+```
+
+**2. (Azure-only) Set base model**
+
+For Azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json); all the Azure models start with `azure/`.
+
+```python
+import os
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo", # model group name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "model_info": {
+            "base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
+        }
+    },
+    {
+        "model_name": "gpt-3.5-turbo", # model group name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo-1106",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+    },
+]
+```
+
+**3. Test it!**
+
+```python
+"""
+- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
+- Send a 5k prompt
+- Assert it works
+"""
+from litellm import Router
+import os
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo", # model group name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "model_info": {
+            "base_model": "azure/gpt-35-turbo",
+        }
+    },
+    {
+        "model_name": "gpt-3.5-turbo", # model group name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo-1106",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+    },
+]
+
+router = Router(model_list=model_list, enable_pre_call_checks=True)
+
+text = "What is the meaning of 42?" * 5000
+
+response = router.completion(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "system", "content": text},
+        {"role": "user", "content": "Who was Alexander?"},
+    ],
+)
+
+print(f"response: {response}")
+```
+
 ## Caching across model groups
 
 If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups.
diff --git a/litellm/router.py b/litellm/router.py
index 56a4894bf..b39b67a09 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -2181,9 +2181,11 @@ class Router:
         for idx, deployment in enumerate(_returned_deployments):
             # see if we have the info for this model
            try:
-                base_model = deployment.get("litellm_params", {}).get(
-                    "base_model", None
-                )
+                base_model = deployment.get("model_info", {}).get("base_model", None)
+                if base_model is None:
+                    base_model = deployment.get("litellm_params", {}).get(
+                        "base_model", None
+                    )
                 model = base_model or deployment.get("litellm_params", {}).get(
                     "model", None
                 )
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index 40fa52b32..82580236a 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -301,7 +301,7 @@ def test_router_azure_acompletion():
 def test_router_context_window_check():
     """
     - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
-    - Send a 10k prompt
+    - Send a 5k prompt
     - Assert it works
     """
     from large_text import text
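
Note on the `router.py` hunk: the pre-call check now resolves `base_model` from a deployment's `model_info` first and only falls back to `litellm_params`, which is why the docs above tell Azure users to set `base_model` under `model_info`. Below is a minimal standalone sketch of that resolution order; `resolve_model_name` is a hypothetical helper written for illustration only, not part of litellm's API.

```python
from typing import Optional


def resolve_model_name(deployment: dict) -> Optional[str]:
    """Sketch of the lookup order used by the pre-call check (illustrative only)."""
    # 1. Prefer an explicitly configured base model under "model_info".
    base_model = deployment.get("model_info", {}).get("base_model", None)
    # 2. Fall back to a "base_model" set inside "litellm_params" (previous behavior).
    if base_model is None:
        base_model = deployment.get("litellm_params", {}).get("base_model", None)
    # 3. Otherwise use the deployment's own "model" name.
    return base_model or deployment.get("litellm_params", {}).get("model", None)


if __name__ == "__main__":
    deployment = {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "azure/chatgpt-v-2"},
        "model_info": {"base_model": "azure/gpt-35-turbo"},
    }
    # Prints "azure/gpt-35-turbo", so the context window of the base model
    # is what gets compared against the prompt size.
    print(resolve_model_name(deployment))
```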