Merge remote-tracking branch 'upstream/main' into fix-pip-install-extra-proxy

2024-03-24 08:53:54 +02:00 · 2024-03-24 08:53:54 +02:00 · 4c2274125e
commit 4c2274125e
parent c807a21442 63f6a9deff
26 changed files with 3888 additions and 572 deletions
--- a/4
+++ b/4
@ -70,5 +70,5 @@ EXPOSE 4000/tcp
 ENTRYPOINT ["litellm"]

 # Append "--detailed_debug" to the end of CMD to view detailed debug logs 
-# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
+# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
--- a/Dockerfile.database
+++ b/Dockerfile.database
@ -72,5 +72,5 @@ EXPOSE 4000/tcp
 ENTRYPOINT ["litellm"]

 # Append "--detailed_debug" to the end of CMD to view detailed debug logs 
-# CMD ["--port", "4000","--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--run_gunicorn"]
+# CMD ["--port", "4000", "--detailed_debug"]
+CMD ["--port", "4000"]
--- a/deploy/kubernetes/kub.yaml
+++ b/deploy/kubernetes/kub.yaml
@ -3,7 +3,7 @@ kind: Deployment
 metadata:
  name: litellm-deployment
 spec:
-  replicas: 5
+  replicas: 3
  selector:
    matchLabels:
      app: litellm
@ -17,13 +17,13 @@ spec:
          image: ghcr.io/berriai/litellm:main-latest
          env:
            - name: AZURE_API_KEY
-              value: "d699s"
+              value: "d6f****"
            - name: AZURE_API_BASE
-              value: "https://openai/"
+              value: "https://openai"
            - name: LITELLM_MASTER_KEY
              value: "sk-1234"
-          ports:
-            - containerPort: 4000
+            - name: DATABASE_URL
+              value: "postgresql://ishaan:*********"
          args:
            - "--config"
            - "/app/proxy_config.yaml"  # Update the path to mount the config file
--- a/docs/my-website/docs/observability/athina_integration.md
+++ b/docs/my-website/docs/observability/athina_integration.md
@ -41,6 +41,35 @@ response = completion(
 ) 
 ```

+## Additional information in metadata
+You can send some additional information to Athina by using the `metadata` field in completion. This can be useful for sending metadata about the request, such as the customer_id, prompt_slug, or any other information you want to track.
+
+```python
+#openai call with additional metadata
+response = completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ],
+  metadata={
+    "environment": "staging",
+    "prompt_slug": "my_prompt_slug/v1"
+  }
+)
+```
+
+Following are the allowed fields in metadata, their types, and their descriptions:
+
+* `environment: Optional[str]` - Environment your app is running in (ex: production, staging, etc). This is useful for segmenting inference calls by environment.
+* `prompt_slug: Optional[str]` - Identifier for the prompt used for inference. This is useful for segmenting inference calls by prompt.
+* `customer_id: Optional[str]` - This is your customer ID. This is useful for segmenting inference calls by customer.
+* `customer_user_id: Optional[str]` - This is the end user ID. This is useful for segmenting inference calls by the end user.
+* `session_id: Optional[str]` - This is the session or conversation ID. This is used for grouping different inferences into a conversation or chain. [Read more](https://docs.athina.ai/logging/grouping_inferences).
+* `external_reference_id: Optional[str]` - This is useful if you want to associate your own internal identifier with the inference logged to Athina.
+* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
+* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
+* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
+
 ## Support & Talk with Athina Team

 - [Schedule Demo 👋](https://cal.com/shiv-athina/30min)
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@ -62,7 +62,6 @@ model_list:

 litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
-  set_verbose: True

 general_settings: 
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@ -103,7 +103,10 @@ RUN chmod +x entrypoint.sh
 EXPOSE 4000/tcp

 # Override the CMD instruction with your desired command and arguments
-CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gunicorn"]
+# WARNING: FOR PROD DO NOT USE `--detailed_debug`; it slows down response times. Instead, use the following CMD:
+# CMD ["--port", "4000", "--config", "config.yaml"]
+
+CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
 ```

 </TabItem>
@ -232,7 +235,6 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
 | [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
 | [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |

-
 ## Deploy with Database
 ### Docker, Kubernetes, Helm Chart

@ -474,25 +476,6 @@ docker run --name litellm-proxy \
 ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
 ```

-## Best Practices for Deploying to Production
-### 1. Switch of debug logs in production 
-don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found using debug logs can add 5-10% latency per LLM API call
-
-### 2. Use `run_gunicorn` and `num_workers`
-
-Example setting `--run_gunicorn` and `--num_workers`
-```shell
-docker run ghcr.io/berriai/litellm-database:main-latest --run_gunicorn --num_workers 4
-```
-
-Why `Gunicorn`?
- Gunicorn takes care of running multiple instances of your web application
- Gunicorn is ideal for running litellm proxy on cluster of machines with Kubernetes
-
-Why `num_workers`? 
-Setting `num_workers` to the number of CPUs available ensures optimal utilization of system resources by matching the number of worker processes to the available CPU cores.
-
-
 ## Advanced Deployment Settings

 ### Customization of the server root path
@ -525,6 +508,57 @@ Provide an ssl certificate when starting litellm proxy server
 ## Platform-specific Guide

 <Tabs>
+<TabItem value="AWS EKS" label="AWS EKS - Kubernetes">
+
+### Kubernetes - Deploy on EKS
+
+Step 1. Create an EKS Cluster with the following spec
+
+```shell
+eksctl create cluster --name=litellm-cluster --region=us-west-2 --node-type=t2.small
+```
+
+Step 2. Mount litellm proxy config on kub cluster 
+
+This will mount your local file called `proxy_config.yaml` on the Kubernetes cluster
+
+```shell
+kubectl create configmap litellm-config --from-file=proxy_config.yaml
+```
+
+Step 3. Apply `kub.yaml` and `service.yaml`
+Clone the following `kub.yaml` and `service.yaml` files and apply locally
+
+- Use this `kub.yaml` file - [litellm kub.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/kub.yaml)
+
+- Use this `service.yaml` file - [litellm service.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/service.yaml)
+
+Apply `kub.yaml`
+```
+kubectl apply -f kub.yaml
+```
+
+Apply `service.yaml` - creates an AWS load balancer to expose the proxy
+```
+kubectl apply -f service.yaml
+
+# service/litellm-service created
+```
+
+Step 4. Get Proxy Base URL
+
+```shell
+kubectl get services
+
+# litellm-service   LoadBalancer   10.100.6.31   a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com   4000:30374/TCP   63m
+```
+
+Proxy Base URL =  `a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com:4000`
+
+That's it, now you can start using LiteLLM Proxy
+
+</TabItem>
+

 <TabItem value="aws-stack" label="AWS Cloud Formation Stack">

--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@ -0,0 +1,138 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# ⚡ Best Practices for Production
+
+Expected Performance in Production
+
+1 LiteLLM Uvicorn Worker on Kubernetes
+
+| Description | Value |
+|--------------|-------|
+| Avg latency | `50ms` |
+| Median latency | `51ms` |
+| `/chat/completions` Requests/second | `35` |
+| `/chat/completions` Requests/minute | `2100` |
+| `/chat/completions` Requests/hour | `126K` |
+
+
+## 1. Switch off Debug Logging
+
+Remove `set_verbose: True` from your config.yaml
+```yaml
+litellm_settings:
+  set_verbose: True
+```
+
+You should only see the following level of details in logs on the proxy server
+```shell
+# INFO:     192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK
+# INFO:     192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
+# INFO:     192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
+```
+
+## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
+
+Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
+
+(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD). 
+```shell
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
+```
+
+## 3. Switch off spend logging and resetting budgets
+
+Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
+```yaml
+general_settings:
+  disable_spend_logs: true
+  disable_reset_budget: true
+```
+
+## Machine Specifications to Deploy LiteLLM
+
+| Service | Spec | CPUs | Memory | Architecture | Version|
+| --- | --- | --- | --- | --- | --- | 
+| Server | `t2.small` | `1vCPUs` | `8GB` | `x86` | - |
+| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
+
+
+## Reference Kubernetes Deployment YAML
+
+Reference Kubernetes `deployment.yaml` that was load tested by us
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm-container
+          image: ghcr.io/berriai/litellm:main-latest
+          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:  # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config  
+
+```
+
+
+Reference Kubernetes `service.yaml` that was load tested by us
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: litellm-service
+spec:
+  selector:
+    app: litellm
+  ports:
+    - protocol: TCP
+      port: 4000
+      targetPort: 4000
+  type: LoadBalancer
+```
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -551,6 +551,156 @@ router = Router(model_list: Optional[list] = None,
 				 cache_responses=True)
 ```

+## Pre-Call Checks (Context Window)
+
+Enable pre-call checks to filter out deployments whose context window limit is smaller than the messages sent in a call.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+**1. Enable pre-call checks**
+```python 
+from litellm import Router 
+# ...
+router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
+```
+
+**2. (Azure-only) Set base model**
+
+For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. 
+
+```python
+model_list = [
+            {
+                "model_name": "gpt-3.5-turbo", # model group name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+				"model_info": {
+					"base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
+				}
+            },
+            {
+                "model_name": "gpt-3.5-turbo", # model group name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "gpt-3.5-turbo-1106",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+        ]
+```
+
+**3. Test it!**
+
+```python
+"""
+- Given a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
+- Send a 5k prompt
+- Assert it works
+"""
+from litellm import Router
+import os
+
+model_list = [
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "azure/chatgpt-v-2",
+			"api_key": os.getenv("AZURE_API_KEY"),
+			"api_version": os.getenv("AZURE_API_VERSION"),
+			"api_base": os.getenv("AZURE_API_BASE"),
+		},
+		"model_info": {
+			"base_model": "azure/gpt-35-turbo", 
+		}
+	},
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "gpt-3.5-turbo-1106",
+			"api_key": os.getenv("OPENAI_API_KEY"),
+		},
+	},
+]
+
+router = Router(model_list=model_list, enable_pre_call_checks=True) 
+
+text = "What is the meaning of 42?" * 5000
+
+response = router.completion(
+	model="gpt-3.5-turbo",
+	messages=[
+		{"role": "system", "content": text},
+		{"role": "user", "content": "Who was Alexander?"},
+	],
+)
+
+print(f"response: {response}")
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+**1. Setup config**
+
+For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with azure/.
+
+```yaml
+router_settings:
+	enable_pre_call_checks: true # 1. Enable pre-call checks
+
+model_list:
+	- model_name: gpt-3.5-turbo
+	  litellm_params:
+		model: azure/chatgpt-v-2
+		api_base: os.environ/AZURE_API_BASE
+		api_key: os.environ/AZURE_API_KEY
+		api_version: "2023-07-01-preview"
+	  model_info:
+		base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
+	
+	- model_name: gpt-3.5-turbo
+	  litellm_params:
+		model: gpt-3.5-turbo-1106
+		api_key: os.environ/OPENAI_API_KEY
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+**3. Test it!**
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+text = "What is the meaning of 42?" * 5000
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [
+        {"role": "system", "content": text},
+		{"role": "user", "content": "Who was Alexander?"},
+    ],
+)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
 ## Caching across model groups

 If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -30,6 +30,7 @@ const sidebars = {
      items: [
        "proxy/quick_start", 
        "proxy/deploy", 
+        "proxy/prod", 
        "proxy/configs",
        {
          type: 'link',
--- a/litellm/integrations/athina.py
+++ b/litellm/integrations/athina.py
@ -10,7 +10,7 @@ class AthinaLogger:
            "Content-Type": "application/json"
        }
        self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
-        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response"]
+        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]

    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
        import requests
@ -32,8 +32,6 @@ class AthinaLogger:

            if "messages" in kwargs:
                data["prompt"] = kwargs.get("messages", None)
-                if kwargs.get("messages") and len(kwargs.get("messages")) > 0:
-                    data["user_query"] = kwargs.get("messages")[0].get("content", None)

            # Directly add tools or functions if present
            optional_params = kwargs.get("optional_params", {})
--- a/litellm/llms/bedrock.py
+++ b/litellm/llms/bedrock.py
@ -129,7 +129,15 @@ class AmazonAnthropicClaude3Config:
        }

    def get_supported_openai_params(self):
-        return ["max_tokens", "tools", "tool_choice", "stream"]
+        return [
+            "max_tokens",
+            "tools",
+            "tool_choice",
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+        ]

    def map_openai_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
--- a/litellm/main.py
+++ b/litellm/main.py
@ -572,6 +572,7 @@ def completion(
        "ttl",
        "cache",
        "no-log",
+        "base_model",
    ]
    default_params = openai_params + litellm_params
    non_default_params = {
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -12,9 +12,6 @@ model_list:
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 general_settings:
  master_key: sk-1234
-router_settings:
-  set_verbose: True
-  debug_level: "DEBUG"
-litellm_settings:
-  success_callback: ["prometheus"]
+  disable_spend_logs: true
+  disable_reset_budget: true
  num_retries: 2
--- a/litellm/proxy/proxy_load_test/locustfile.py
+++ b/litellm/proxy/proxy_load_test/locustfile.py
@ -18,7 +18,10 @@ class MyUser(HttpUser):
        payload = {
            "model": "fake-openai-endpoint",
            "messages": [
-                {"role": "system", "content": "You are a chat bot."},
+                {
+                    "role": "system",
+                    "content": "this is a very sweet test message from ishaan",
+                },
                {"role": "user", "content": "Hello, how are you?"},
            ],
            # Add more data as necessary
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -6409,6 +6409,9 @@ async def add_new_model(model_params: ModelParams):
 async def model_info_v2(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
+    """
+    BETA ENDPOINT. Might change unexpectedly. Use `/v1/model/info` for now.
+    """
    global llm_model_list, general_settings, user_config_file_path, proxy_config

    # Load existing config
@ -6550,7 +6553,7 @@ async def model_info_v1(

    if len(user_api_key_dict.models) > 0:
        model_names = user_api_key_dict.models
-        all_models = [m for m in config["model_list"] if m in model_names]
+        all_models = [m for m in config["model_list"] if m["model_name"] in model_names]
    else:
        all_models = config["model_list"]
    for model in all_models:
--- a/litellm/proxy/tests/large_text.py
+++ b/litellm/proxy/tests/large_text.py
--- a/litellm/router.py
+++ b/litellm/router.py
@ -98,6 +98,7 @@ class Router:
        fallbacks: List = [],
        context_window_fallbacks: List = [],
        model_group_alias: Optional[dict] = {},
+        enable_pre_call_checks: bool = False,
        retry_after: int = 0,  # min time to wait before retrying a failed request
        allowed_fails: Optional[
            int
@ -131,6 +132,7 @@ class Router:
            debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
            fallbacks (List): List of fallback options. Defaults to [].
            context_window_fallbacks (List): List of context window fallback options. Defaults to [].
+            enable_pre_call_checks (boolean): Filter out deployments which are outside context window limits for a given prompt
            model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
            retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
            allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
@ -143,6 +145,7 @@ class Router:
        """
        self.set_verbose = set_verbose
        self.debug_level = debug_level
+        self.enable_pre_call_checks = enable_pre_call_checks
        if self.set_verbose == True:
            if debug_level == "INFO":
                verbose_router_logger.setLevel(logging.INFO)
@ -2150,6 +2153,62 @@ class Router:
                    client = self.cache.get_cache(key=cache_key)
                return client

+    def _pre_call_checks(
+        self,
+        model: str,
+        healthy_deployments: List,
+        messages: List[Dict[str, str]],
+    ):
+        """
+        Filter out model in model group, if:
+
+        - model context window < message length
+        - function call and model doesn't support function calling
+        """
+        verbose_router_logger.debug(
+            f"Starting Pre-call checks for deployments in model={model}"
+        )
+
+        _returned_deployments = copy.deepcopy(healthy_deployments)
+
+        invalid_model_indices = []
+
+        try:
+            input_tokens = litellm.token_counter(messages=messages)
+        except Exception as e:
+            return _returned_deployments
+
+        for idx, deployment in enumerate(_returned_deployments):
+            # see if we have the info for this model
+            try:
+                base_model = deployment.get("model_info", {}).get("base_model", None)
+                if base_model is None:
+                    base_model = deployment.get("litellm_params", {}).get(
+                        "base_model", None
+                    )
+                model = base_model or deployment.get("litellm_params", {}).get(
+                    "model", None
+                )
+                model_info = litellm.get_model_info(model=model)
+            except:
+                continue
+
+            if (
+                isinstance(model_info, dict)
+                and model_info.get("max_input_tokens", None) is not None
+            ):
+                if (
+                    isinstance(model_info["max_input_tokens"], int)
+                    and input_tokens > model_info["max_input_tokens"]
+                ):
+                    invalid_model_indices.append(idx)
+
+        if len(invalid_model_indices) > 0:
+            for idx in reversed(invalid_model_indices):
+                _returned_deployments.pop(idx)
+
+        return _returned_deployments
+
    def get_available_deployment(
        self,
        model: str,
@ -2209,6 +2268,12 @@ class Router:
        for deployment in deployments_to_remove:
            healthy_deployments.remove(deployment)

+        # filter pre-call checks
+        if self.enable_pre_call_checks and messages is not None:
+            healthy_deployments = self._pre_call_checks(
+                model=model, healthy_deployments=healthy_deployments, messages=messages
+            )
+
        verbose_router_logger.debug(
            f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
        )
--- a/litellm/tests/large_text.py
+++ b/litellm/tests/large_text.py
@ -0,0 +1,112 @@
+text = """
+Alexander the Great
+This article is about the ancient king of Macedonia. For other uses, see Alexander the Great (disambiguation).
+Alexander III of Macedon (Ancient Greek: Ἀλέξανδρος, romanized: Alexandros; 20/21 July 356 BC – 10/11 June 323 BC), most commonly known as Alexander the Great,[c] was a king of the ancient Greek kingdom of Macedon.[d] He succeeded his father Philip II to the throne in 336 BC at the age of 20 and spent most of his ruling years conducting a lengthy military campaign throughout Western Asia, Central Asia, parts of South Asia, and Egypt. By the age of 30, he had created one of the largest empires in history, stretching from Greece to northwestern India.[1] He was undefeated in battle and is widely considered to be one of history's greatest and most successful military commanders.[2][3]
+
+Until the age of 16, Alexander was tutored by Aristotle. In 335 BC, shortly after his assumption of kingship over Macedon, he campaigned in the Balkans and reasserted control over Thrace and parts of Illyria before marching on the city of Thebes, which was subsequently destroyed in battle. Alexander then led the League of Corinth, and used his authority to launch the pan-Hellenic project envisaged by his father, assuming leadership over all Greeks in their conquest of Persia.[4][5]
+
+In 334 BC, he invaded the Achaemenid Persian Empire and began a series of campaigns that lasted for 10 years. Following his conquest of Asia Minor, Alexander broke the power of Achaemenid Persia in a series of decisive battles, including those at Issus and Gaugamela; he subsequently overthrew Darius III and conquered the Achaemenid Empire in its entirety.[e] After the fall of Persia, the Macedonian Empire held a vast swath of territory between the Adriatic Sea and the Indus River. Alexander endeavored to reach the "ends of the world and the Great Outer Sea" and invaded India in 326 BC, achieving an important victory over Porus, an ancient Indian king of present-day Punjab, at the Battle of the Hydaspes. Due to the demand of his homesick troops, he eventually turned back at the Beas River and later died in 323 BC in Babylon, the city of Mesopotamia that he had planned to establish as his empire's capital. Alexander's death left unexecuted an additional series of planned military and mercantile campaigns that would have begun with a Greek invasion of Arabia. In the years following his death, a series of civil wars broke out across the Macedonian Empire, eventually leading to its disintegration at the hands of the Diadochi.
+
+With his death marking the start of the Hellenistic period, Alexander's legacy includes the cultural diffusion and syncretism that his conquests engendered, such as Greco-Buddhism and Hellenistic Judaism. He founded more than twenty cities, with the most prominent being the city of Alexandria in Egypt. Alexander's settlement of Greek colonists and the resulting spread of Greek culture led to the overwhelming dominance of Hellenistic civilization and influence as far east as the Indian subcontinent. The Hellenistic period developed through the Roman Empire into modern Western culture; the Greek language became the lingua franca of the region and was the predominant language of the Byzantine Empire up until its collapse in the mid-15th century AD. Alexander became legendary as a classical hero in the mould of Achilles, featuring prominently in the historical and mythical traditions of both Greek and non-Greek cultures. His military achievements and unprecedented enduring successes in battle made him the measure against which many later military leaders would compare themselves,[f] and his tactics remain a significant subject of study in military academies worldwide.[6] Legends of Alexander's exploits coalesced into the third-century Alexander Romance which, in the premodern period, went through over one hundred recensions, translations, and derivations and was translated into almost every European vernacular and every language of the Islamic world.[7] After the Bible, it was the most popular form of European literature.[8]
+
+Early life
+
+Lineage and childhood
+
+Alexander III was born in Pella, the capital of the Kingdom of Macedon,[9] on the sixth day of the ancient Greek month of Hekatombaion, which probably corresponds to 20 July 356 BC (although the exact date is uncertain).[10][11] He was the son of the erstwhile king of Macedon, Philip II, and his fourth wife, Olympias (daughter of Neoptolemus I, king of Epirus).[12][g] Although Philip had seven or eight wives, Olympias was his principal wife for some time, likely because she gave birth to Alexander.[13]
+
+Several legends surround Alexander's birth and childhood.[14] According to the ancient Greek biographer Plutarch, on the eve of the consummation of her marriage to Philip, Olympias dreamed that her womb was struck by a thunderbolt that caused a flame to spread "far and wide" before dying away. Sometime after the wedding, Philip is said to have seen himself, in a dream, securing his wife's womb with a seal engraved with a lion's image.[15] Plutarch offered a variety of interpretations for these dreams: that Olympias was pregnant before her marriage, indicated by the sealing of her womb; or that Alexander's father was Zeus. Ancient commentators were divided about whether the ambitious Olympias promulgated the story of Alexander's divine parentage, variously claiming that she had told Alexander, or that she dismissed the suggestion as impious.[15]
+
+On the day Alexander was born, Philip was preparing a siege on the city of Potidea on the peninsula of Chalcidice. That same day, Philip received news that his general Parmenion had defeated the combined Illyrian and Paeonian armies and that his horses had won at the Olympic Games. It was also said that on this day, the Temple of Artemis in Ephesus, one of the Seven Wonders of the World, burnt down. This led Hegesias of Magnesia to say that it had burnt down because Artemis was away, attending the birth of Alexander.[16] Such legends may have emerged when Alexander was king, and possibly at his instigation, to show that he was superhuman and destined for greatness from conception.[14]
+
+In his early years, Alexander was raised by a nurse, Lanike, sister of Alexander's future general Cleitus the Black. Later in his childhood, Alexander was tutored by the strict Leonidas, a relative of his mother, and by Lysimachus of Acarnania.[17] Alexander was raised in the manner of noble Macedonian youths, learning to read, play the lyre, ride, fight, and hunt.[18] When Alexander was ten years old, a trader from Thessaly brought Philip a horse, which he offered to sell for thirteen talents. The horse refused to be mounted, and Philip ordered it away. Alexander, however, detecting the horse's fear of its own shadow, asked to tame the horse, which he eventually managed.[14] Plutarch stated that Philip, overjoyed at this display of courage and ambition, kissed his son tearfully, declaring: "My boy, you must find a kingdom big enough for your ambitions. Macedon is too small for you", and bought the horse for him.[19] Alexander named it Bucephalas, meaning "ox-head". Bucephalas carried Alexander as far as India. When the animal died (because of old age, according to Plutarch, at age 30), Alexander named a city after him, Bucephala.[20]
+
+Education
+
+When Alexander was 13, Philip began to search for a tutor, and considered such academics as Isocrates and Speusippus, the latter offering to resign from his stewardship of the Academy to take up the post. In the end, Philip chose Aristotle and provided the Temple of the Nymphs at Mieza as a classroom. In return for teaching Alexander, Philip agreed to rebuild Aristotle's hometown of Stageira, which Philip had razed, and to repopulate it by buying and freeing the ex-citizens who were slaves, or pardoning those who were in exile.[21]
+
+Mieza was like a boarding school for Alexander and the children of Macedonian nobles, such as Ptolemy, Hephaistion, and Cassander. Many of these students would become his friends and future generals, and are often known as the "Companions". Aristotle taught Alexander and his companions about medicine, philosophy, morals, religion, logic, and art. Under Aristotle's tutelage, Alexander developed a passion for the works of Homer, and in particular the Iliad; Aristotle gave him an annotated copy, which Alexander later carried on his campaigns.[22] Alexander was able to quote Euripides from memory.[23]
+
+During his youth, Alexander was also acquainted with Persian exiles at the Macedonian court, who received the protection of Philip II for several years as they opposed Artaxerxes III.[24][25][26] Among them were Artabazos II and his daughter Barsine, possible future mistress of Alexander, who resided at the Macedonian court from 352 to 342 BC, as well as Amminapes, future satrap of Alexander, or a Persian nobleman named Sisines.[24][27][28][29] This gave the Macedonian court a good knowledge of Persian issues, and may even have influenced some of the innovations in the management of the Macedonian state.[27]
+
+Suda writes that Anaximenes of Lampsacus was one of Alexander's teachers, and that Anaximenes also accompanied Alexander on his campaigns.[30]
+
+Heir of Philip II
+
+Regency and ascent of Macedon
+
+Main articles: Philip II of Macedon and Rise of Macedon
+Further information: History of Macedonia (ancient kingdom)
+At the age of 16, Alexander's education under Aristotle ended. Philip II had waged war against the Thracians to the north, which left Alexander in charge as regent and heir apparent.[14] During Philip's absence, the Thracian tribe of Maedi revolted against Macedonia. Alexander responded quickly and drove them from their territory. The territory was colonized, and a city, named Alexandropolis, was founded.[31]
+
+Upon Philip's return, Alexander was dispatched with a small force to subdue the revolts in southern Thrace. Campaigning against the Greek city of Perinthus, Alexander reportedly saved his father's life. Meanwhile, the city of Amphissa began to work lands that were sacred to Apollo near Delphi, a sacrilege that gave Philip the opportunity to further intervene in Greek affairs. While Philip was occupied in Thrace, Alexander was ordered to muster an army for a campaign in southern Greece. Concerned that other Greek states might intervene, Alexander made it look as though he was preparing to attack Illyria instead. During this turmoil, the Illyrians invaded Macedonia, only to be repelled by Alexander.[32]
+
+Philip and his army joined his son in 338 BC, and they marched south through Thermopylae, taking it after stubborn resistance from its Theban garrison. They went on to occupy the city of Elatea, only a few days' march from both Athens and Thebes. The Athenians, led by Demosthenes, voted to seek alliance with Thebes against Macedonia. Both Athens and Philip sent embassies to win Thebes's favour, but Athens won the contest.[33] Philip marched on Amphissa (ostensibly acting on the request of the Amphictyonic League), capturing the mercenaries sent there by Demosthenes and accepting the city's surrender. Philip then returned to Elatea, sending a final offer of peace to Athens and Thebes, who both rejected it.[34]
+
+As Philip marched south, his opponents blocked him near Chaeronea, Boeotia. During the ensuing Battle of Chaeronea, Philip commanded the right wing and Alexander the left, accompanied by a group of Philip's trusted generals. According to the ancient sources, the two sides fought bitterly for some time. Philip deliberately commanded his troops to retreat, counting on the untested Athenian hoplites to follow, thus breaking their line. Alexander was the first to break the Theban lines, followed by Philip's generals. Having damaged the enemy's cohesion, Philip ordered his troops to press forward and quickly routed them. With the Athenians lost, the Thebans were surrounded. Left to fight alone, they were defeated.[35]
+
+After the victory at Chaeronea, Philip and Alexander marched unopposed into the Peloponnese, welcomed by all cities; however, when they reached Sparta, they were refused, but did not resort to war.[36] At Corinth, Philip established a "Hellenic Alliance" (modelled on the old anti-Persian alliance of the Greco-Persian Wars), which included most Greek city-states except Sparta. Philip was then named Hegemon (often translated as "Supreme Commander") of this league (known by modern scholars as the League of Corinth), and announced his plans to attack the Persian Empire.[37][38]
+
+Exile and return
+
+When Philip returned to Pella, he fell in love with and married Cleopatra Eurydice in 338 BC,[39] the niece of his general Attalus.[40] The marriage made Alexander's position as heir less secure, since any son of Cleopatra Eurydice would be a fully Macedonian heir, while Alexander was only half-Macedonian.[41] During the wedding banquet, a drunken Attalus publicly prayed to the gods that the union would produce a legitimate heir.[40]
+
+At the wedding of Cleopatra, whom Philip fell in love with and married, she being much too young for him, her uncle Attalus in his drink desired the Macedonians would implore the gods to give them a lawful successor to the kingdom by his niece. This so irritated Alexander, that throwing one of the cups at his head, "You villain," said he, "what, am I then a bastard?" Then Philip, taking Attalus's part, rose up and would have run his son through; but by good fortune for them both, either his over-hasty rage, or the wine he had drunk, made his foot slip, so that he fell down on the floor. At which Alexander reproachfully insulted over him: "See there," said he, "the man who makes preparations to pass out of Europe into Asia, overturned in passing from one seat to another."
+
+— Plutarch, describing the feud at Philip's wedding.[42]
+In 337 BC, Alexander fled Macedon with his mother, dropping her off with her brother, King Alexander I of Epirus in Dodona, capital of the Molossians.[43] He continued to Illyria,[43] where he sought refuge with one or more Illyrian kings, perhaps with Glaucias, and was treated as a guest, despite having defeated them in battle a few years before.[44] However, it appears Philip never intended to disown his politically and militarily trained son.[43] Accordingly, Alexander returned to Macedon after six months due to the efforts of a family friend, Demaratus, who mediated between the two parties.[45]
+
+In the following year, the Persian satrap (governor) of Caria, Pixodarus, offered his eldest daughter to Alexander's half-brother, Philip Arrhidaeus.[43] Olympias and several of Alexander's friends suggested this showed Philip intended to make Arrhidaeus his heir.[43] Alexander reacted by sending an actor, Thessalus of Corinth, to tell Pixodarus that he should not offer his daughter's hand to an illegitimate son, but instead to Alexander. When Philip heard of this, he stopped the negotiations and scolded Alexander for wishing to marry the daughter of a Carian, explaining that he wanted a better bride for him.[43] Philip exiled four of Alexander's friends, Harpalus, Nearchus, Ptolemy and Erigyius, and had the Corinthians bring Thessalus to him in chains.[46]
+
+King of Macedon
+
+Accession
+
+Further information: Government of Macedonia (ancient kingdom)
+In summer 336 BC, while at Aegae attending the wedding of his daughter Cleopatra to Olympias's brother, Alexander I of Epirus, Philip was assassinated by the captain of his bodyguards, Pausanias.[h] As Pausanias tried to escape, he tripped over a vine and was killed by his pursuers, including two of Alexander's companions, Perdiccas and Leonnatus. Alexander was proclaimed king on the spot by the nobles and army at the age of 20.[47][48][49]
+
+Consolidation of power
+
+Alexander began his reign by eliminating potential rivals to the throne. He had his cousin, the former Amyntas IV, executed.[51] He also had two Macedonian princes from the region of Lyncestis killed for having been involved in his father's assassination, but spared a third, Alexander Lyncestes. Olympias had Cleopatra Eurydice, and Europa, her daughter by Philip, burned alive. When Alexander learned about this, he was furious. Alexander also ordered the murder of Attalus,[51] who was in command of the advance guard of the army in Asia Minor and Cleopatra's uncle.[52]
+
+Attalus was at that time corresponding with Demosthenes, regarding the possibility of defecting to Athens. Attalus also had severely insulted Alexander, and following Cleopatra's murder, Alexander may have considered him too dangerous to be left alive.[52] Alexander spared Arrhidaeus, who was by all accounts mentally disabled, possibly as a result of poisoning by Olympias.[47][49][53]
+
+News of Philip's death roused many states into revolt, including Thebes, Athens, Thessaly, and the Thracian tribes north of Macedon. When news of the revolts reached Alexander, he responded quickly. Though advised to use diplomacy, Alexander mustered 3,000 Macedonian cavalry and rode south towards Thessaly. He found the Thessalian army occupying the pass between Mount Olympus and Mount Ossa, and ordered his men to ride over Mount Ossa. When the Thessalians awoke the next day, they found Alexander in their rear and promptly surrendered, adding their cavalry to Alexander's force. He then continued south towards the Peloponnese.[54]
+
+Alexander stopped at Thermopylae, where he was recognized as the leader of the Amphictyonic League before heading south to Corinth. Athens sued for peace and Alexander pardoned the rebels. The famous encounter between Alexander and Diogenes the Cynic occurred during Alexander's stay in Corinth. When Alexander asked Diogenes what he could do for him, the philosopher disdainfully asked Alexander to stand a little to the side, as he was blocking the sunlight.[55] This reply apparently delighted Alexander, who is reported to have said "But verily, if I were not Alexander, I would like to be Diogenes."[56] At Corinth, Alexander took the title of Hegemon ("leader") and, like Philip, was appointed commander for the coming war against Persia. He also received news of a Thracian uprising.[57]
+
+Balkan campaign
+
+Main article: Alexander's Balkan campaign
+Before crossing to Asia, Alexander wanted to safeguard his northern borders. In the spring of 335 BC, he advanced to suppress several revolts. Starting from Amphipolis, he travelled east into the country of the "Independent Thracians"; and at Mount Haemus, the Macedonian army attacked and defeated the Thracian forces manning the heights.[58] The Macedonians marched into the country of the Triballi, and defeated their army near the Lyginus river[59] (a tributary of the Danube). Alexander then marched for three days to the Danube, encountering the Getae tribe on the opposite shore. Crossing the river at night, he surprised them and forced their army to retreat after the first cavalry skirmish.[60]
+
+News then reached Alexander that the Illyrian chieftain Cleitus and King Glaukias of the Taulantii were in open revolt against his authority. Marching west into Illyria, Alexander defeated each in turn, forcing the two rulers to flee with their troops. With these victories, he secured his northern frontier.[61]
+
+Destruction of Thebes
+
+While Alexander campaigned north, the Thebans and Athenians rebelled once again. Alexander immediately headed south.[62] While the other cities again hesitated, Thebes decided to fight. The Theban resistance was ineffective, and Alexander razed the city and divided its territory between the other Boeotian cities. The end of Thebes cowed Athens, leaving all of Greece temporarily at peace.[62] Alexander then set out on his Asian campaign, leaving Antipater as regent.[63]
+
+Conquest of the Achaemenid Persian Empire
+
+Main articles: Wars of Alexander the Great and Chronology of the expedition of Alexander the Great into Asia
+Asia Minor
+
+Further information: Battle of the Granicus, Siege of Halicarnassus, and Siege of Miletus
+After his victory at the Battle of Chaeronea (338 BC), Philip II began the work of establishing himself as hēgemṓn (Greek: ἡγεμών) of a league which according to Diodorus was to wage a campaign against the Persians for the sundry grievances Greece suffered in 480 and free the Greek cities of the western coast and islands from Achaemenid rule. In 336 he sent Parmenion, Amyntas, Andromenes, Attalus, and an army of 10,000 men into Anatolia to make preparations for an invasion.[64][65] At first, all went well. The Greek cities on the western coast of Anatolia revolted until the news arrived that Philip had been murdered and had been succeeded by his young son Alexander. The Macedonians were demoralized by Philip's death and were subsequently defeated near Magnesia by the Achaemenids under the command of the mercenary Memnon of Rhodes.[64][65]
+
+Taking over the invasion project of Philip II, Alexander's army crossed the Hellespont in 334 BC with approximately 48,100 soldiers, 6,100 cavalry and a fleet of 120 ships with crews numbering 38,000,[62] drawn from Macedon and various Greek city-states, mercenaries, and feudally raised soldiers from Thrace, Paionia, and Illyria.[66][i] He showed his intent to conquer the entirety of the Persian Empire by throwing a spear into Asian soil and saying he accepted Asia as a gift from the gods. This also showed Alexander's eagerness to fight, in contrast to his father's preference for diplomacy.[62]
+
+After an initial victory against Persian forces at the Battle of the Granicus, Alexander accepted the surrender of the Persian provincial capital and treasury of Sardis; he then proceeded along the Ionian coast, granting autonomy and democracy to the cities. Miletus, held by Achaemenid forces, required a delicate siege operation, with Persian naval forces nearby. Further south, at Halicarnassus, in Caria, Alexander successfully waged his first large-scale siege, eventually forcing his opponents, the mercenary captain Memnon of Rhodes and the Persian satrap of Caria, Orontobates, to withdraw by sea.[67] Alexander left the government of Caria to a member of the Hecatomnid dynasty, Ada, who adopted Alexander.[68]
+
+From Halicarnassus, Alexander proceeded into mountainous Lycia and the Pamphylian plain, asserting control over all coastal cities to deny the Persians naval bases. From Pamphylia onwards the coast held no major ports and Alexander moved inland. At Termessos, Alexander humbled but did not storm the Pisidian city.[69] At the ancient Phrygian capital of Gordium, Alexander "undid" the hitherto unsolvable Gordian Knot, a feat said to await the future "king of Asia".[70] According to the story, Alexander proclaimed that it did not matter how the knot was undone and hacked it apart with his sword.[71]
+
+The Levant and Syria
+
+Further information: Battle of Issus and Siege of Tyre (332 BC)
+In spring 333 BC, Alexander crossed the Taurus into Cilicia. After a long pause due to an illness, he marched on towards Syria. Though outmanoeuvered by Darius's significantly larger army, he marched back to Cilicia, where he defeated Darius at Issus. Darius fled the battle, causing his army to collapse, and left behind his wife, his two daughters, his mother Sisygambis, and a fabulous treasure.[72] He offered a peace treaty that included the lands he had already lost, and a ransom of 10,000 talents for his family. Alexander replied that since he was now king of Asia, it was he alone who decided territorial divisions.[73] Alexander proceeded to take possession of Syria, and most of the coast of the Levant.[68] In the following year, 332 BC, he was forced to attack Tyre, which he captured after a long and difficult siege.[74][75] The men of military age were massacred and the women and children sold into slavery.[76]
+
+Egypt
+
+Further information: Siege of Gaza (332 BCE)
+When Alexander destroyed Tyre, most of the towns on the route to Egypt quickly capitulated. However, Alexander was met with resistance at Gaza. The stronghold was heavily fortified and built on a hill, requiring a siege. When "his engineers pointed out to him that because of the height of the mound it would be impossible... this encouraged Alexander all the more to make the attempt".[77] After three unsuccessful assaults, the stronghold fell, but not before Alexander had received a serious shoulder wound. As in Tyre, men of military age were put to the sword and the women and children were sold into slavery.[78]
+"""
--- a/litellm/tests/test_bedrock_completion.py
+++ b/litellm/tests/test_bedrock_completion.py
@ -214,6 +214,7 @@ def test_bedrock_claude_3():
            model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
            messages=messages,
            max_tokens=10,
+            temperature=0.78,
        )
        # Add any assertions here to check the response
        assert len(response.choices) > 0
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -102,7 +102,20 @@ def test_completion_claude_3_empty_response():

 def test_completion_claude_3():
    litellm.set_verbose = True
-    messages = [{"role": "user", "content": "Hello, world"}]
+    messages = [
+        {
+            "role": "user",
+            "content": "\nWhat is the query for `console.log` => `console.error`\n",
+        },
+        {
+            "role": "assistant",
+            "content": "\nThis is the GritQL query for the given before/after examples:\n<gritql>\n`console.log` => `console.error`\n</gritql>\n",
+        },
+        {
+            "role": "user",
+            "content": "\nWhat is the query for `console.info` => `consdole.heaven`\n",
+        },
+    ]
    try:
        # test without max tokens
        response = completion(
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@ -297,6 +297,55 @@ def test_router_azure_acompletion():

 # test_router_azure_acompletion()

+
+def test_router_context_window_check():
+    """
+    - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
+    - Send a 5k prompt
+    - Assert it works
+    """
+    from large_text import text
+    import os
+
+    litellm.set_verbose = False
+
+    print(f"len(text): {len(text)}")
+    try:
+        model_list = [
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "base_model": "azure/gpt-35-turbo",
+                },
+            },
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "gpt-3.5-turbo-1106",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+        ]
+
+        router = Router(model_list=model_list, set_verbose=True, enable_pre_call_checks=True, num_retries=0)  # type: ignore
+
+        response = router.completion(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": text},
+                {"role": "user", "content": "Who was Alexander?"},
+            ],
+        )
+
+        print(f"response: {response}")
+    except Exception as e:
+        pytest.fail(f"Got unexpected exception on router! - {str(e)}")
+
+
 ### FUNCTION CALLING


--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -3605,7 +3605,7 @@ def token_counter(
                    count_response_tokens=count_response_tokens,
                )
    else:
-        num_tokens = len(encoding.encode(text))  # type: ignore
+        num_tokens = len(encoding.encode(text, disallowed_special=()))  # type: ignore
    return num_tokens


--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.33.8"
+version = "1.34.0"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.33.8"
+version = "1.34.0"
 version_files = [
    "pyproject.toml:^version"
 ]
--- a/tests/test_keys.py
+++ b/tests/test_keys.py
@ -280,6 +280,29 @@ async def get_key_info(session, call_key, get_key=None):
        return await response.json()


+async def get_model_info(session, call_key):
+    """
+    Fetch /model/info with `call_key`; the proxy is expected to return only the models that key has access to. Raises on a non-200 status.
+    """
+    url = "http://0.0.0.0:4000/model/info"
+    headers = {
+        "Authorization": f"Bearer {call_key}",
+        "Content-Type": "application/json",
+    }
+
+    async with session.get(url, headers=headers) as response:
+        status = response.status
+        response_text = await response.text()
+        print(response_text)
+        print()
+
+        if status != 200:
+            raise Exception(
+                f"Request did not return a 200 status code: {status}. Responses {response_text}"
+            )
+        return await response.json()
+
+
@pytest.mark.asyncio
 async def test_key_info():
    """
@ -305,6 +328,25 @@ async def test_key_info():
        assert status == 403


+@pytest.mark.asyncio
+async def test_model_info():
+    """
+    Get model info for the models a key has access to: a generated key should see fewer models than the admin key, but at least one.
+    """
+    async with aiohttp.ClientSession() as session:
+        key_gen = await generate_key(session=session, i=0)
+        key = key_gen["key"]
+        # as admin #
+        admin_models = await get_model_info(session=session, call_key="sk-1234")
+        admin_models = admin_models["data"]
+        # as key itself #
+        user_models = await get_model_info(session=session, call_key=key)
+        user_models = user_models["data"]
+
+        assert len(admin_models) > len(user_models)
+        assert len(user_models) > 0
+
+
 async def get_spend_logs(session, request_id):
    url = f"http://0.0.0.0:4000/spend/logs?request_id={request_id}"
    headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}