docs(routing.md): add pre-call checks to docs

Krrish Dholakia 2024-03-23 19:10:34 -07:00
parent b7321ae4ee
commit e8e7964025
3 changed files with 94 additions and 4 deletions

@@ -551,6 +551,94 @@ router = Router(model_list: Optional[list] = None,
cache_responses=True)
```
## Pre-Call Checks (Context Window)
Enable pre-call checks to filter out deployments whose context window limit is smaller than the token count of a call's messages.
**1. Enable pre-call checks**
```python
from litellm import Router
# ...
router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
```
**2. (Azure-only) Set base model**
For Azure deployments, set the base model. Azure deployment names are custom, so the underlying model (and its context window) can't be inferred from them. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json); all the Azure models start with `azure/`.
```python
import os

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # model group name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "model_info": {
            "base_model": "azure/gpt-35-turbo",  # 👈 SET BASE MODEL
        },
    },
    {
        "model_name": "gpt-3.5-turbo",  # model group name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "gpt-3.5-turbo-1106",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
]
```
**3. Test it!**
```python
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
- Send a 5k prompt
- Assert it works
"""
from litellm import Router
import os
try:
    model_list = [
        {
            "model_name": "gpt-3.5-turbo",  # model group name
            "litellm_params": {  # params for litellm completion/embedding call
                "model": "azure/chatgpt-v-2",
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_version": os.getenv("AZURE_API_VERSION"),
                "api_base": os.getenv("AZURE_API_BASE"),
            },
            "model_info": {
                "base_model": "azure/gpt-35-turbo",
            },
        },
        {
            "model_name": "gpt-3.5-turbo",  # model group name
            "litellm_params": {  # params for litellm completion/embedding call
                "model": "gpt-3.5-turbo-1106",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
    ]

    router = Router(model_list=model_list, enable_pre_call_checks=True)

    text = "What is the meaning of 42?" * 5000

    response = router.completion(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ],
    )

    print(f"response: {response}")
except Exception as e:
    print(f"Got an exception: {str(e)}")
```
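
With pre-call checks enabled, the router filters out the 4k-context Azure deployment for this oversized prompt and routes the call to the 16k-context `gpt-3.5-turbo-1106` deployment.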
## Caching across model groups
If you want to cache across 2 different model groups (e.g. Azure deployments and OpenAI), use caching groups.
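
A minimal sketch, assuming the Router's `caching_groups` parameter (a list of tuples of model group names that should share cache hits):

```python
import os

from litellm import Router

model_list = [
    {
        "model_name": "openai-gpt-3.5-turbo",  # model group 1
        "litellm_params": {
            "model": "gpt-3.5-turbo-1106",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
    {
        "model_name": "azure-gpt-3.5-turbo",  # model group 2
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
]

router = Router(
    model_list=model_list,
    cache_responses=True,
    # 👈 responses cached for one group can be served to the other
    caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],
)
```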

@@ -2181,6 +2181,8 @@ class Router:
for idx, deployment in enumerate(_returned_deployments):
    # see if we have the info for this model
    try:
        base_model = deployment.get("model_info", {}).get("base_model", None)
        if base_model is None:
            base_model = deployment.get("litellm_params", {}).get(
                "base_model", None
            )
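
A minimal sketch of the lookup order above, with a hypothetical deployment dict: `model_info.base_model` takes precedence, and `litellm_params.base_model` is the fallback.

```python
# hypothetical deployment dict, for illustration only
deployment = {
    "model_info": {},  # no base_model here...
    "litellm_params": {"base_model": "azure/gpt-35-turbo"},  # ...so this wins
}

base_model = deployment.get("model_info", {}).get("base_model", None)
if base_model is None:
    base_model = deployment.get("litellm_params", {}).get("base_model", None)

print(base_model)  # azure/gpt-35-turbo
```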

@@ -301,7 +301,7 @@ def test_router_azure_acompletion():
def test_router_context_window_check():
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
-    - Send a 10k prompt
+    - Send a 5k prompt
- Assert it works
"""
from large_text import text