docs(routing.md): add pre-call checks to docs
parent 292cdd81e4
commit 8c6402b02d
3 changed files with 94 additions and 4 deletions
@@ -551,6 +551,94 @@ router = Router(model_list: Optional[list] = None,

## Pre-Call Checks (Context Window)

Enable pre-call checks to filter out deployments whose context window is smaller than the messages in a call.
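Concretely, the router compares the request's token count against each deployment's context window. A rough sketch of the number being compared, using litellm's public token counter (illustrative only, not the router's internal code):

```python
import litellm

messages = [
    {"role": "system", "content": "What is the meaning of 42?" * 5000},
    {"role": "user", "content": "Who was Alexander?"},
]

# estimate the prompt size the way litellm would count it
prompt_tokens = litellm.token_counter(model="gpt-3.5-turbo", messages=messages)
print(f"prompt tokens: {prompt_tokens}")  # deployments with a smaller context window get filtered out
```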
**1. Enable pre-call checks**

```python
from litellm import Router
# ...
router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
```
**2. (Azure-only) Set base model**

For Azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json); all the Azure models start with `azure/`.

```python
import os

model_list = [
    {
        "model_name": "gpt-3.5-turbo", # model group name
        "litellm_params": { # params for litellm completion/embedding call
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "model_info": {
            "base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
        }
    },
    {
        "model_name": "gpt-3.5-turbo", # model group name
        "litellm_params": { # params for litellm completion/embedding call
            "model": "gpt-3.5-turbo-1106",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
]
```
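To check which context window litellm has on record for a candidate base model, you can look it up in `litellm.model_cost`, which is built from that same JSON file. A small sketch (the exact keys vary by entry, so both `max_input_tokens` and `max_tokens` are tried):

```python
import litellm

# litellm.model_cost is loaded from model_prices_and_context_window.json
info = litellm.model_cost["azure/gpt-35-turbo"]

# newer entries expose "max_input_tokens"; older ones may only have "max_tokens"
context_window = info.get("max_input_tokens") or info.get("max_tokens")
print(f"azure/gpt-35-turbo context window: {context_window} tokens")
```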
**3. Test it!**

```python
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
- Send a 5k prompt
- Assert it works
"""
from litellm import Router
import os

try:
    model_list = [
        {
            "model_name": "gpt-3.5-turbo", # model group name
            "litellm_params": { # params for litellm completion/embedding call
                "model": "azure/chatgpt-v-2",
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_version": os.getenv("AZURE_API_VERSION"),
                "api_base": os.getenv("AZURE_API_BASE"),
            },
            "model_info": {
                "base_model": "azure/gpt-35-turbo",
            }
        },
        {
            "model_name": "gpt-3.5-turbo", # model group name
            "litellm_params": { # params for litellm completion/embedding call
                "model": "gpt-3.5-turbo-1106",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
    ]

    router = Router(model_list=model_list, enable_pre_call_checks=True)

    text = "What is the meaning of 42?" * 5000

    response = router.completion(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ],
    )

    print(f"response: {response}")
except Exception as e:
    # surface any unexpected router error
    print(f"Got unexpected exception on router! - {str(e)}")
```
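One way to sanity-check that the larger-context deployment handled the oversized prompt is to look at the model reported on the response; a minimal sketch, assuming the OpenAI-style `model` field is populated on litellm's response object:

```python
# run right after the router.completion(...) call above
print(f"served by: {response.model}")  # expect the 16k gpt-3.5-turbo-1106 deployment, not the 4k Azure one
```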

## Caching across model groups

If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups.
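For context, a minimal sketch of what a caching group might look like, assuming the Router accepts a `caching_groups` list of tuples naming the model groups that should share a cache (check the caching docs for the exact form):

```python
from litellm import Router

# hypothetical model group names; pair the groups whose responses should be cached together
router = Router(
    model_list=model_list,
    cache_responses=True,
    caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],  # assumed parameter form
)
```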
```diff
@@ -2181,9 +2181,11 @@ class Router:
         for idx, deployment in enumerate(_returned_deployments):
             # see if we have the info for this model
             try:
-                base_model = deployment.get("litellm_params", {}).get(
-                    "base_model", None
-                )
+                base_model = deployment.get("model_info", {}).get("base_model", None)
+                if base_model is None:
+                    base_model = deployment.get("litellm_params", {}).get(
+                        "base_model", None
+                    )
                 model = base_model or deployment.get("litellm_params", {}).get(
                     "model", None
                 )
```
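The change above makes the context-window check look for `base_model` under a deployment's `model_info` first, then fall back to `litellm_params["base_model"]`, and finally to the deployment's `model`. A minimal standalone sketch of that lookup order (the helper name is invented for illustration):

```python
from typing import Optional

def resolve_context_check_model(deployment: dict) -> Optional[str]:
    # hypothetical helper mirroring the lookup order in the diff above
    base_model = deployment.get("model_info", {}).get("base_model", None)
    if base_model is None:
        base_model = deployment.get("litellm_params", {}).get("base_model", None)
    return base_model or deployment.get("litellm_params", {}).get("model", None)

# for the Azure deployment defined earlier, this resolves to "azure/gpt-35-turbo"
```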
@ -301,7 +301,7 @@ def test_router_azure_acompletion():
|
||||||
def test_router_context_window_check():
|
def test_router_context_window_check():
|
||||||
"""
|
"""
|
||||||
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
|
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
|
||||||
- Send a 10k prompt
|
- Send a 5k prompt
|
||||||
- Assert it works
|
- Assert it works
|
||||||
"""
|
"""
|
||||||
from large_text import text
|
from large_text import text
|
||||||
|
|