diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..51c578971 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,47 @@ + + + +## Title + + + +## Relevant issues + + + +## Type + + + + +๐Ÿ†• New Feature +๐Ÿ› Bug Fix +๐Ÿงน Refactoring +๐Ÿ“– Documentation +๐Ÿ’ป Development Environment +๐Ÿš„ Infrastructure +โœ… Test + +## Changes + + + +## Testing + + + +## Notes + + + + + +## Pre-Submission Checklist (optional but appreciated): + +- [ ] I have included relevant documentation updates (stored in /docs/my-website) + +## OS Tests (optional but appreciated): + +- [ ] Tested on Windows +- [ ] Tested on MacOS +- [ ] Tested on Linux diff --git a/README.md b/README.md index 38a166935..9344c0f22 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ Step 2: Navigate into the project, and install dependencies: ``` cd litellm -poetry install +poetry install -E extra_proxy -E proxy ``` Step 3: Test your change: diff --git a/docs/my-website/docs/completion/token_usage.md b/docs/my-website/docs/completion/token_usage.md index 626973c57..807ccfd91 100644 --- a/docs/my-website/docs/completion/token_usage.md +++ b/docs/my-website/docs/completion/token_usage.md @@ -1,7 +1,7 @@ # Completion Token Usage & Cost By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/)) -However, we also expose 5 helper functions + **[NEW]** an API to calculate token usage across providers: +However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers: - `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode) @@ -9,17 +9,19 @@ However, we also expose 5 helper functions + **[NEW]** an API to calculate token - `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available. [**Jump to code**](#3-token_counter) -- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#4-cost_per_token) +- `create_pretrained_tokenizer` and `create_tokenizer`: LiteLLM provides default tokenizer support for OpenAI, Cohere, Anthropic, Llama2, and Llama3 models. If you are using a different model, you can create a custom tokenizer and pass it as `custom_tokenizer` to the `encode`, `decode`, and `token_counter` methods. [**Jump to code**](#4-create_pretrained_tokenizer-and-create_tokenizer) -- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#5-completion_cost) +- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#5-cost_per_token) -- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#6-get_max_tokens) +- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). 
[**Jump to code**](#6-completion_cost) -- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#7-model_cost) +- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#7-get_max_tokens) -- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#8-register_model) +- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#8-model_cost) -- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#9-apilitellmai) +- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#9-register_model) + +- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai) ๐Ÿ“ฃ This is a community maintained list. Contributions are welcome! โค๏ธ @@ -60,7 +62,24 @@ messages = [{"user": "role", "content": "Hey, how's it going"}] print(token_counter(model="gpt-3.5-turbo", messages=messages)) ``` -### 4. `cost_per_token` +### 4. `create_pretrained_tokenizer` and `create_tokenizer` + +```python +from litellm import create_pretrained_tokenizer, create_tokenizer + +# get tokenizer from huggingface repo +custom_tokenizer_1 = create_pretrained_tokenizer("Xenova/llama-3-tokenizer") + +# use tokenizer from json file +with open("tokenizer.json") as f: + json_data = json.load(f) + +json_str = json.dumps(json_data) + +custom_tokenizer_2 = create_tokenizer(json_str) +``` + +### 5. `cost_per_token` ```python from litellm import cost_per_token @@ -72,7 +91,7 @@ prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_toke print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar) ``` -### 5. `completion_cost` +### 6. `completion_cost` * Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings * Output: Returns a `float` of cost for the `completion` call @@ -99,7 +118,7 @@ cost = completion_cost(model="bedrock/anthropic.claude-v2", prompt="Hey!", compl formatted_string = f"${float(cost):.10f}" print(formatted_string) ``` -### 6. `get_max_tokens` +### 7. `get_max_tokens` Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list). Output: Returns the maximum number of tokens allowed for the given model @@ -112,7 +131,7 @@ model = "gpt-3.5-turbo" print(get_max_tokens(model)) # Output: 4097 ``` -### 7. `model_cost` +### 8. `model_cost` * Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) @@ -122,7 +141,7 @@ from litellm import model_cost print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...} ``` -### 8. `register_model` +### 9. 
`register_model` * Input: Provide EITHER a model cost dictionary or a url to a hosted json blob * Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details. @@ -157,5 +176,3 @@ export LITELLM_LOCAL_MODEL_COST_MAP="True" ``` Note: this means you will need to upgrade to get updated pricing, and newer models. - - diff --git a/docs/my-website/docs/observability/greenscale_integration.md b/docs/my-website/docs/observability/greenscale_integration.md index 8fc2b7ea3..0dd673226 100644 --- a/docs/my-website/docs/observability/greenscale_integration.md +++ b/docs/my-website/docs/observability/greenscale_integration.md @@ -1,4 +1,4 @@ -# Greenscale Tutorial +# Greenscale - Track LLM Spend and Responsible Usage [Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII). diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index ef2ddb57e..f5777d6e7 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -178,6 +178,7 @@ const sidebars = { "observability/traceloop_integration", "observability/athina_integration", "observability/lunary_integration", + "observability/greenscale_integration", "observability/helicone_integration", "observability/supabase_integration", `observability/telemetry`, diff --git a/litellm-js/spend-logs/package-lock.json b/litellm-js/spend-logs/package-lock.json index ef8cb1da0..cb4b599d3 100644 --- a/litellm-js/spend-logs/package-lock.json +++ b/litellm-js/spend-logs/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "@hono/node-server": "^1.9.0", + "@hono/node-server": "^1.10.1", "hono": "^4.2.7" }, "devDependencies": { @@ -382,9 +382,9 @@ } }, "node_modules/@hono/node-server": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz", - "integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.10.1.tgz", + "integrity": "sha512-5BKW25JH5PQKPDkTcIgv3yNUPtOAbnnjFFgWvIxxAY/B/ZNeYjjWoAeDmqhIiCgOAJ3Tauuw+0G+VainhuZRYQ==", "engines": { "node": ">=18.14.1" } diff --git a/litellm-js/spend-logs/package.json b/litellm-js/spend-logs/package.json index 92839a01b..d9543220b 100644 --- a/litellm-js/spend-logs/package.json +++ b/litellm-js/spend-logs/package.json @@ -3,7 +3,7 @@ "dev": "tsx watch src/index.ts" }, "dependencies": { - "@hono/node-server": "^1.9.0", + "@hono/node-server": "^1.10.1", "hono": "^4.2.7" }, "devDependencies": { diff --git a/litellm/__init__.py b/litellm/__init__.py index 5cc4d2316..dc640f0e9 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -542,7 +542,11 @@ models_by_provider: dict = { "together_ai": together_ai_models, "baseten": baseten_models, "openrouter": openrouter_models, - "vertex_ai": vertex_chat_models + vertex_text_models, + "vertex_ai": vertex_chat_models + + vertex_text_models + + vertex_anthropic_models + + vertex_vision_models + + vertex_language_models, "ai21": ai21_models, "bedrock": bedrock_models, "petals": petals_models, @@ -609,6 +613,8 @@ from .utils import ( get_optional_params, modify_integration, token_counter, + create_pretrained_tokenizer, + create_tokenizer, cost_per_token, completion_cost, 
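A minimal usage sketch of the tokenizer helpers these hunks export (illustrative only, not part of the patch; it assumes the `Xenova/llama-3-tokenizer` Hugging Face repo referenced elsewhere in this PR is reachable):

```python
import litellm

# Build a reusable tokenizer dict once from a Hugging Face repo's tokenizer.json.
llama3_tokenizer = litellm.create_pretrained_tokenizer("Xenova/llama-3-tokenizer")

# token_counter (and encode/decode) accept it via `custom_tokenizer` instead of a model name.
print(litellm.token_counter(custom_tokenizer=llama3_tokenizer, text="Hey, how's it going?"))
```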
supports_function_calling, diff --git a/litellm/integrations/openmeter.py b/litellm/integrations/openmeter.py index 2ed551c8d..248b83f4d 100644 --- a/litellm/integrations/openmeter.py +++ b/litellm/integrations/openmeter.py @@ -38,7 +38,7 @@ class OpenMeterLogger(CustomLogger): in the environment """ missing_keys = [] - if litellm.get_secret("OPENMETER_API_KEY", None) is None: + if os.getenv("OPENMETER_API_KEY", None) is None: missing_keys.append("OPENMETER_API_KEY") if len(missing_keys) > 0: @@ -71,15 +71,13 @@ class OpenMeterLogger(CustomLogger): } def log_success_event(self, kwargs, response_obj, start_time, end_time): - _url = litellm.get_secret( - "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud" - ) + _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud") if _url.endswith("/"): _url += "api/v1/events" else: _url += "/api/v1/events" - api_key = litellm.get_secret("OPENMETER_API_KEY") + api_key = os.getenv("OPENMETER_API_KEY") _data = self._common_logic(kwargs=kwargs, response_obj=response_obj) self.sync_http_handler.post( @@ -92,15 +90,13 @@ class OpenMeterLogger(CustomLogger): ) async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): - _url = litellm.get_secret( - "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud" - ) + _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud") if _url.endswith("/"): _url += "api/v1/events" else: _url += "/api/v1/events" - api_key = litellm.get_secret("OPENMETER_API_KEY") + api_key = os.getenv("OPENMETER_API_KEY") _data = self._common_logic(kwargs=kwargs, response_obj=response_obj) _headers = { @@ -117,7 +113,6 @@ class OpenMeterLogger(CustomLogger): response.raise_for_status() except Exception as e: - print(f"\nAn Exception Occurred - {str(e)}") if hasattr(response, "text"): - print(f"\nError Message: {response.text}") + litellm.print_verbose(f"\nError Message: {response.text}") raise e diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 8f8ce712e..a9aba2f1c 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -48,19 +48,6 @@ class SlackAlerting: self.internal_usage_cache = DualCache() self.async_http_handler = AsyncHTTPHandler() self.alert_to_webhook_url = alert_to_webhook_url - self.langfuse_logger = None - - try: - from litellm.integrations.langfuse import LangFuseLogger - - self.langfuse_logger = LangFuseLogger( - os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - flush_interval=1, - ) - except: - pass - pass def update_values( @@ -110,62 +97,8 @@ class SlackAlerting: start_time: Optional[datetime.datetime] = None, end_time: Optional[datetime.datetime] = None, ): - import uuid - - # For now: do nothing as we're debugging why this is not working as expected - if request_data is not None: - trace_id = request_data.get("metadata", {}).get( - "trace_id", None - ) # get langfuse trace id - if trace_id is None: - trace_id = "litellm-alert-trace-" + str(uuid.uuid4()) - request_data["metadata"]["trace_id"] = trace_id - elif kwargs is not None: - _litellm_params = kwargs.get("litellm_params", {}) - trace_id = _litellm_params.get("metadata", {}).get( - "trace_id", None - ) # get langfuse trace id - if trace_id is None: - trace_id = "litellm-alert-trace-" + str(uuid.uuid4()) - _litellm_params["metadata"]["trace_id"] = trace_id - - # Log hanging request as an error on langfuse - if type == "hanging_request": - if self.langfuse_logger is not None: - 
_logging_kwargs = copy.deepcopy(request_data) - if _logging_kwargs is None: - _logging_kwargs = {} - _logging_kwargs["litellm_params"] = {} - request_data = request_data or {} - _logging_kwargs["litellm_params"]["metadata"] = request_data.get( - "metadata", {} - ) - # log to langfuse in a separate thread - import threading - - threading.Thread( - target=self.langfuse_logger.log_event, - args=( - _logging_kwargs, - None, - start_time, - end_time, - None, - print, - "ERROR", - "Requests is hanging", - ), - ).start() - - _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com") - _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID") - - # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************ - - _langfuse_url = ( - f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}" - ) - request_info += f"\n๐Ÿชข Langfuse Trace: {_langfuse_url}" + # do nothing for now + pass return request_info def _response_taking_too_long_callback( @@ -242,10 +175,6 @@ class SlackAlerting: request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`" slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`" if time_difference_float > self.alerting_threshold: - if "langfuse" in litellm.success_callback: - request_info = self._add_langfuse_trace_id_to_alert( - request_info=request_info, kwargs=kwargs, type="slow_response" - ) # add deployment latencies to alert if ( kwargs is not None diff --git a/litellm/main.py b/litellm/main.py index 8fc07b9bf..98295de72 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -34,6 +34,8 @@ from litellm.utils import ( async_mock_completion_streaming_obj, convert_to_model_response_object, token_counter, + create_pretrained_tokenizer, + create_tokenizer, Usage, get_optional_params_embeddings, get_optional_params_image_gen, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index ce6f9b800..7fcd425bb 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -338,6 +338,18 @@ "output_cost_per_second": 0.0001, "litellm_provider": "azure" }, + "azure/gpt-4-turbo-2024-04-09": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "azure/gpt-4-0125-preview": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -813,6 +825,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 264 }, "claude-3-opus-20240229": { @@ -824,6 +837,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "claude-3-sonnet-20240229": { @@ -835,6 +849,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 159 }, "text-bison": { @@ -1142,7 +1157,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + 
"supports_vision": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -1152,7 +1168,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -1162,7 +1179,8 @@ "output_cost_per_token": 0.0000075, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "textembedding-gecko": { "max_tokens": 3072, @@ -1581,6 +1599,7 @@ "litellm_provider": "openrouter", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "openrouter/google/palm-2-chat-bison": { @@ -1929,7 +1948,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-haiku-20240307-v1:0": { "max_tokens": 4096, @@ -1939,7 +1959,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-opus-20240229-v1:0": { "max_tokens": 4096, @@ -1949,7 +1970,8 @@ "output_cost_per_token": 0.000075, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-v1": { "max_tokens": 8191, diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 9f2f6ec17..d90fb13fd 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -11,5 +11,12 @@ router_settings: redis_password: os.environ/REDIS_PASSWORD redis_port: os.environ/REDIS_PORT +router_settings: + routing_strategy: "latency-based-routing" + litellm_settings: - success_callback: ["openmeter"] \ No newline at end of file + success_callback: ["openmeter"] + +general_settings: + alerting: ["slack"] + alert_types: ["llm_exceptions"] \ No newline at end of file diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 9cc871966..26987f478 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3446,172 +3446,6 @@ def model_list( ) -@router.post( - "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] -) -@router.post( - "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] -) -@router.post( - "/engines/{model:path}/completions", - dependencies=[Depends(user_api_key_auth)], - tags=["completions"], -) -@router.post( - "/openai/deployments/{model:path}/completions", - dependencies=[Depends(user_api_key_auth)], - tags=["completions"], -) -async def completion( - request: Request, - fastapi_response: Response, - model: Optional[str] = None, - user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), -): - global user_temperature, user_request_timeout, user_max_tokens, user_api_base - try: - body = await request.body() - body_str = body.decode() - try: - data = ast.literal_eval(body_str) - except: - data = json.loads(body_str) - - data["user"] = data.get("user", user_api_key_dict.user_id) - data["model"] = ( - general_settings.get("completion_model", None) # server default - or user_model # model name passed via cli args - 
or model # for azure deployments - or data["model"] # default passed in http request - ) - if user_model: - data["model"] = user_model - if "metadata" not in data: - data["metadata"] = {} - data["metadata"]["user_api_key"] = user_api_key_dict.api_key - data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata - data["metadata"]["user_api_key_alias"] = getattr( - user_api_key_dict, "key_alias", None - ) - data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id - data["metadata"]["user_api_key_team_id"] = getattr( - user_api_key_dict, "team_id", None - ) - data["metadata"]["user_api_key_team_alias"] = getattr( - user_api_key_dict, "team_alias", None - ) - _headers = dict(request.headers) - _headers.pop( - "authorization", None - ) # do not store the original `sk-..` api key in the db - data["metadata"]["headers"] = _headers - data["metadata"]["endpoint"] = str(request.url) - - # override with user settings, these are params passed via cli - if user_temperature: - data["temperature"] = user_temperature - if user_request_timeout: - data["request_timeout"] = user_request_timeout - if user_max_tokens: - data["max_tokens"] = user_max_tokens - if user_api_base: - data["api_base"] = user_api_base - - ### MODEL ALIAS MAPPING ### - # check if model name in model alias map - # get the actual model name - if data["model"] in litellm.model_alias_map: - data["model"] = litellm.model_alias_map[data["model"]] - - ### CALL HOOKS ### - modify incoming data before calling the model - data = await proxy_logging_obj.pre_call_hook( - user_api_key_dict=user_api_key_dict, data=data, call_type="completion" - ) - - ### ROUTE THE REQUESTs ### - router_model_names = llm_router.model_names if llm_router is not None else [] - # skip router if user passed their key - if "api_key" in data: - response = await litellm.atext_completion(**data) - elif ( - llm_router is not None and data["model"] in router_model_names - ): # model in router model list - response = await llm_router.atext_completion(**data) - elif ( - llm_router is not None - and llm_router.model_group_alias is not None - and data["model"] in llm_router.model_group_alias - ): # model set in model_group_alias - response = await llm_router.atext_completion(**data) - elif ( - llm_router is not None and data["model"] in llm_router.deployment_names - ): # model in router deployments, calling a specific deployment on the router - response = await llm_router.atext_completion( - **data, specific_deployment=True - ) - elif ( - llm_router is not None - and data["model"] not in router_model_names - and llm_router.default_deployment is not None - ): # model in router deployments, calling a specific deployment on the router - response = await llm_router.atext_completion(**data) - elif user_model is not None: # `litellm --model ` - response = await litellm.atext_completion(**data) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail={ - "error": "Invalid model name passed in model=" - + data.get("model", "") - }, - ) - - if hasattr(response, "_hidden_params"): - model_id = response._hidden_params.get("model_id", None) or "" - original_response = ( - response._hidden_params.get("original_response", None) or "" - ) - else: - model_id = "" - original_response = "" - - verbose_proxy_logger.debug("final response: %s", response) - if ( - "stream" in data and data["stream"] == True - ): # use generate_responses to stream responses - custom_headers = { - "x-litellm-model-id": model_id, - } - selected_data_generator = 
select_data_generator( - response=response, user_api_key_dict=user_api_key_dict - ) - - return StreamingResponse( - selected_data_generator, - media_type="text/event-stream", - headers=custom_headers, - ) - - fastapi_response.headers["x-litellm-model-id"] = model_id - return response - except Exception as e: - data["litellm_status"] = "fail" # used for alerting - verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY") - verbose_proxy_logger.debug( - "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`", - e, - ) - traceback.print_exc() - error_traceback = traceback.format_exc() - error_msg = f"{str(e)}" - raise ProxyException( - message=getattr(e, "message", error_msg), - type=getattr(e, "type", "None"), - param=getattr(e, "param", "None"), - code=getattr(e, "status_code", 500), - ) - - @router.post( "/v1/chat/completions", dependencies=[Depends(user_api_key_auth)], @@ -3810,7 +3644,7 @@ async def chat_completion( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "chat_completion: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -3884,6 +3718,172 @@ async def chat_completion( ) +@router.post( + "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] +) +@router.post( + "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] +) +@router.post( + "/engines/{model:path}/completions", + dependencies=[Depends(user_api_key_auth)], + tags=["completions"], +) +@router.post( + "/openai/deployments/{model:path}/completions", + dependencies=[Depends(user_api_key_auth)], + tags=["completions"], +) +async def completion( + request: Request, + fastapi_response: Response, + model: Optional[str] = None, + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + global user_temperature, user_request_timeout, user_max_tokens, user_api_base + try: + body = await request.body() + body_str = body.decode() + try: + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + + data["user"] = data.get("user", user_api_key_dict.user_id) + data["model"] = ( + general_settings.get("completion_model", None) # server default + or user_model # model name passed via cli args + or model # for azure deployments + or data["model"] # default passed in http request + ) + if user_model: + data["model"] = user_model + if "metadata" not in data: + data["metadata"] = {} + data["metadata"]["user_api_key"] = user_api_key_dict.api_key + data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata + data["metadata"]["user_api_key_alias"] = getattr( + user_api_key_dict, "key_alias", None + ) + data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id + data["metadata"]["user_api_key_team_id"] = getattr( + user_api_key_dict, "team_id", None + ) + data["metadata"]["user_api_key_team_alias"] = getattr( + user_api_key_dict, "team_alias", None + ) + _headers = dict(request.headers) + _headers.pop( + "authorization", None + ) # do not store the original `sk-..` api key in the db + data["metadata"]["headers"] = _headers + data["metadata"]["endpoint"] = str(request.url) + + # override with user settings, these are params passed via cli + if user_temperature: + data["temperature"] = user_temperature + if user_request_timeout: + data["request_timeout"] = user_request_timeout + if user_max_tokens: + data["max_tokens"] = user_max_tokens + if user_api_base: + data["api_base"] = 
user_api_base + + ### MODEL ALIAS MAPPING ### + # check if model name in model alias map + # get the actual model name + if data["model"] in litellm.model_alias_map: + data["model"] = litellm.model_alias_map[data["model"]] + + ### CALL HOOKS ### - modify incoming data before calling the model + data = await proxy_logging_obj.pre_call_hook( + user_api_key_dict=user_api_key_dict, data=data, call_type="completion" + ) + + ### ROUTE THE REQUESTs ### + router_model_names = llm_router.model_names if llm_router is not None else [] + # skip router if user passed their key + if "api_key" in data: + response = await litellm.atext_completion(**data) + elif ( + llm_router is not None and data["model"] in router_model_names + ): # model in router model list + response = await llm_router.atext_completion(**data) + elif ( + llm_router is not None + and llm_router.model_group_alias is not None + and data["model"] in llm_router.model_group_alias + ): # model set in model_group_alias + response = await llm_router.atext_completion(**data) + elif ( + llm_router is not None and data["model"] in llm_router.deployment_names + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.atext_completion( + **data, specific_deployment=True + ) + elif ( + llm_router is not None + and data["model"] not in router_model_names + and llm_router.default_deployment is not None + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.atext_completion(**data) + elif user_model is not None: # `litellm --model ` + response = await litellm.atext_completion(**data) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "error": "completion: Invalid model name passed in model=" + + data.get("model", "") + }, + ) + + if hasattr(response, "_hidden_params"): + model_id = response._hidden_params.get("model_id", None) or "" + original_response = ( + response._hidden_params.get("original_response", None) or "" + ) + else: + model_id = "" + original_response = "" + + verbose_proxy_logger.debug("final response: %s", response) + if ( + "stream" in data and data["stream"] == True + ): # use generate_responses to stream responses + custom_headers = { + "x-litellm-model-id": model_id, + } + selected_data_generator = select_data_generator( + response=response, user_api_key_dict=user_api_key_dict + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + headers=custom_headers, + ) + + fastapi_response.headers["x-litellm-model-id"] = model_id + return response + except Exception as e: + data["litellm_status"] = "fail" # used for alerting + verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY") + verbose_proxy_logger.debug( + "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. 
`litellm --model gpt-3.5-turbo --debug`", + e, + ) + traceback.print_exc() + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}" + raise ProxyException( + message=getattr(e, "message", error_msg), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", 500), + ) + + @router.post( "/v1/embeddings", dependencies=[Depends(user_api_key_auth)], @@ -4041,7 +4041,7 @@ async def embeddings( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "embeddings: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4197,7 +4197,7 @@ async def image_generation( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "image_generation: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4372,7 +4372,7 @@ async def audio_transcriptions( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "audio_transcriptions: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4538,7 +4538,7 @@ async def moderations( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "moderations: Invalid model name passed in model=" + data.get("model", "") }, ) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index b5db81b31..1048c6727 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -387,15 +387,21 @@ class ProxyLogging: """ ### ALERTING ### - if "llm_exceptions" not in self.alert_types: - return - asyncio.create_task( - self.alerting_handler( - message=f"LLM API call failed: {str(original_exception)}", - level="High", - alert_type="llm_exceptions", + if "llm_exceptions" in self.alert_types and not isinstance( + original_exception, HTTPException + ): + """ + Just alert on LLM API exceptions. 
Do not alert on user errors + + Related issue - https://github.com/BerriAI/litellm/issues/3395 + """ + asyncio.create_task( + self.alerting_handler( + message=f"LLM API call failed: {str(original_exception)}", + level="High", + alert_type="llm_exceptions", + ) ) - ) for callback in litellm.callbacks: try: @@ -679,8 +685,8 @@ class PrismaClient: @backoff.on_exception( backoff.expo, Exception, # base exception to catch for the backoff - max_tries=3, # maximum number of retries - max_time=10, # maximum total time to retry for + max_tries=1, # maximum number of retries + max_time=2, # maximum total time to retry for on_backoff=on_backoff, # specifying the function to call on backoff ) async def get_generic_data( @@ -718,7 +724,8 @@ class PrismaClient: import traceback error_msg = f"LiteLLM Prisma Client Exception get_generic_data: {str(e)}" - print_verbose(error_msg) + verbose_proxy_logger.error(error_msg) + error_msg = error_msg + "\nException Type: {}".format(type(e)) error_traceback = error_msg + "\n" + traceback.format_exc() end_time = time.time() _duration = end_time - start_time diff --git a/litellm/router.py b/litellm/router.py index 15fdbd4b8..7acf75e8e 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -2590,6 +2590,16 @@ class Router: return model return None + def get_model_info(self, id: str) -> Optional[dict]: + """ + For a given model id, return the model info + """ + for model in self.model_list: + if "model_info" in model and "id" in model["model_info"]: + if id == model["model_info"]["id"]: + return model + return None + def get_model_ids(self): ids = [] for model in self.model_list: @@ -2904,15 +2914,10 @@ class Router: m for m in self.model_list if m["litellm_params"]["model"] == model ] - verbose_router_logger.debug( - f"initial list of deployments: {healthy_deployments}" - ) + litellm.print_verbose(f"initial list of deployments: {healthy_deployments}") - verbose_router_logger.debug( - f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}" - ) if len(healthy_deployments) == 0: - raise ValueError(f"No healthy deployment available, passed model={model}") + raise ValueError(f"No healthy deployment available, passed model={model}. ") if litellm.model_alias_map and model in litellm.model_alias_map: model = litellm.model_alias_map[ model diff --git a/litellm/router_strategy/lowest_tpm_rpm_v2.py b/litellm/router_strategy/lowest_tpm_rpm_v2.py index 4bcf1eec1..f7a55d970 100644 --- a/litellm/router_strategy/lowest_tpm_rpm_v2.py +++ b/litellm/router_strategy/lowest_tpm_rpm_v2.py @@ -79,10 +79,12 @@ class LowestTPMLoggingHandler_v2(CustomLogger): model=deployment.get("litellm_params", {}).get("model"), response=httpx.Response( status_code=429, - content="{} rpm limit={}. current usage={}".format( + content="{} rpm limit={}. current usage={}. id={}, model_group={}. 
Get the model info by calling 'router.get_model_info(id)".format( RouterErrors.user_defined_ratelimit_error.value, deployment_rpm, local_result, + model_id, + deployment.get("model_name", ""), ), request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore ), diff --git a/litellm/tests/langfuse.log b/litellm/tests/langfuse.log index e69de29bb..f47590a29 100644 --- a/litellm/tests/langfuse.log +++ b/litellm/tests/langfuse.log @@ -0,0 +1,88 @@ +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return 
int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +consumer is running... +Getting observations... None, None, None, None, litellm-test-98e1cc75-bef8-4280-a2b9-e08633b81acd, None, GENERATION +consumer is running... +Getting observations... None, None, None, None, litellm-test-532d2bc8-f8d6-42fd-8f78-416bae79925d, None, GENERATION +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined diff --git a/litellm/tests/test_alangfuse.py b/litellm/tests/test_alangfuse.py index 6c3830935..29718d474 100644 --- a/litellm/tests/test_alangfuse.py +++ b/litellm/tests/test_alangfuse.py @@ -205,8 +205,6 @@ async def test_langfuse_logging_without_request_response(stream): assert _trace_data[0].output == { "role": "assistant", "content": "redacted-by-litellm", - "function_call": None, - "tool_calls": None, } except Exception as e: diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py index ff3e8f8c7..a74e25910 100644 --- a/litellm/tests/test_alerting.py +++ b/litellm/tests/test_alerting.py @@ -3,7 +3,7 @@ import sys import os -import io, asyncio +import io, asyncio, httpx from datetime import datetime, timedelta # import logging @@ -17,6 +17,61 @@ import asyncio from unittest.mock import patch, MagicMock from litellm.caching import DualCache from litellm.integrations.slack_alerting import SlackAlerting +from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy.proxy_server import HTTPException + + +@pytest.mark.parametrize("exception_type", ["llm-exception", "non-llm-exception"]) +@pytest.mark.asyncio +async def test_slack_alerting_llm_exceptions(exception_type, monkeypatch): + """ + Test if non-llm exception -> No request + Test if llm exception -> Request triggered + """ + _pl = ProxyLogging(user_api_key_cache=DualCache()) + _pl.update_values( + alerting=["slack"], + alerting_threshold=100, + redis_cache=None, + alert_types=["llm_exceptions"], + ) + + async def mock_alerting_handler(message, level, alert_type): + global exception_type + + if exception_type == "llm-exception": + pass + elif exception_type == "non-llm-exception": + 
pytest.fail("Function should not have been called") + + monkeypatch.setattr(_pl, "alerting_handler", mock_alerting_handler) + + if exception_type == "llm-exception": + await _pl.post_call_failure_hook( + original_exception=litellm.APIError( + status_code=500, + message="This is a test exception", + llm_provider="openai", + model="gpt-3.5-turbo", + request=httpx.Request( + method="completion", url="https://github.com/BerriAI/litellm" + ), + ), + user_api_key_dict=UserAPIKeyAuth(), + ) + + await asyncio.sleep(2) + + elif exception_type == "non-llm-exception": + await _pl.post_call_failure_hook( + original_exception=HTTPException( + status_code=400, + detail={"error": "this is a test exception"}, + ), + user_api_key_dict=UserAPIKeyAuth(), + ) + + await asyncio.sleep(2) @pytest.mark.asyncio diff --git a/litellm/tests/test_proxy_exception_mapping.py b/litellm/tests/test_proxy_exception_mapping.py index 82957b658..0cc7b0d30 100644 --- a/litellm/tests/test_proxy_exception_mapping.py +++ b/litellm/tests/test_proxy_exception_mapping.py @@ -169,7 +169,7 @@ def test_chat_completion_exception_any_model(client): ) assert isinstance(openai_exception, openai.BadRequestError) _error_message = openai_exception.message - assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message) + assert "chat_completion: Invalid model name passed in model=Lite-GPT-12" in str(_error_message) except Exception as e: pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}") @@ -197,7 +197,7 @@ def test_embedding_exception_any_model(client): print("Exception raised=", openai_exception) assert isinstance(openai_exception, openai.BadRequestError) _error_message = openai_exception.message - assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message) + assert "embeddings: Invalid model name passed in model=Lite-GPT-12" in str(_error_message) except Exception as e: pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}") diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index 052646db8..43a070556 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -1,5 +1,6 @@ import sys, os import traceback +from unittest import mock from dotenv import load_dotenv load_dotenv() @@ -35,6 +36,77 @@ token = "sk-1234" headers = {"Authorization": f"Bearer {token}"} +example_completion_result = { + "choices": [ + { + "message": { + "content": "Whispers of the wind carry dreams to me.", + "role": "assistant" + } + } + ], +} +example_embedding_result = { + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [ + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + ], + } + ], + "model": "text-embedding-3-small", + "usage": { + "prompt_tokens": 5, + "total_tokens": 5 + } +} +example_image_generation_result = { + "created": 1589478378, + "data": [ + { + "url": "https://..." + }, + { + "url": "https://..." 
+ } + ] +} + + +def mock_patch_acompletion(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.acompletion", + return_value=example_completion_result, + ) + + +def mock_patch_aembedding(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.aembedding", + return_value=example_embedding_result, + ) + + +def mock_patch_aimage_generation(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.aimage_generation", + return_value=example_image_generation_result, + ) + @pytest.fixture(scope="function") def client_no_auth(): @@ -52,7 +124,8 @@ def client_no_auth(): return TestClient(app) -def test_chat_completion(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion(mock_acompletion, client_no_auth): global headers try: # Your test data @@ -66,6 +139,19 @@ def test_chat_completion(client_no_auth): print("testing proxy server with chat completions") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "hi"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) print(f"response - {response.text}") assert response.status_code == 200 result = response.json() @@ -77,7 +163,8 @@ def test_chat_completion(client_no_auth): # Run the test -def test_chat_completion_azure(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion_azure(mock_acompletion, client_no_auth): global headers try: # Your test data @@ -92,6 +179,19 @@ def test_chat_completion_azure(client_no_auth): print("testing proxy server with Azure Request /chat/completions") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="azure/chatgpt-v-2", + messages=[ + {"role": "user", "content": "write 1 sentence poem"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(f"Received response: {result}") @@ -104,8 +204,51 @@ def test_chat_completion_azure(client_no_auth): # test_chat_completion_azure() +@mock_patch_acompletion() +def test_openai_deployments_model_chat_completions_azure(mock_acompletion, client_no_auth): + global headers + try: + # Your test data + test_data = { + "model": "azure/chatgpt-v-2", + "messages": [ + {"role": "user", "content": "write 1 sentence poem"}, + ], + "max_tokens": 10, + } + + url = "/openai/deployments/azure/chatgpt-v-2/chat/completions" + print(f"testing proxy server with Azure Request {url}") + response = client_no_auth.post(url, json=test_data) + + mock_acompletion.assert_called_once_with( + model="azure/chatgpt-v-2", + messages=[ + {"role": "user", "content": "write 1 sentence poem"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) + assert response.status_code == 200 + result = response.json() + print(f"Received response: {result}") + assert len(result["choices"][0]["message"]["content"]) > 0 + except Exception as e: + pytest.fail(f"LiteLLM Proxy test failed. 
Exception - {str(e)}") + + +# Run the test +# test_openai_deployments_model_chat_completions_azure() + + ### EMBEDDING -def test_embedding(client_no_auth): +@mock_patch_aembedding() +def test_embedding(mock_aembedding, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -117,6 +260,13 @@ def test_embedding(client_no_auth): response = client_no_auth.post("/v1/embeddings", json=test_data) + mock_aembedding.assert_called_once_with( + model="azure/azure-embedding-model", + input=["good morning from litellm"], + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["embedding"])) @@ -125,7 +275,8 @@ def test_embedding(client_no_auth): pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}") -def test_bedrock_embedding(client_no_auth): +@mock_patch_aembedding() +def test_bedrock_embedding(mock_aembedding, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -137,6 +288,12 @@ def test_bedrock_embedding(client_no_auth): response = client_no_auth.post("/v1/embeddings", json=test_data) + mock_aembedding.assert_called_once_with( + model="amazon-embeddings", + input=["good morning from litellm"], + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["embedding"])) @@ -171,7 +328,8 @@ def test_sagemaker_embedding(client_no_auth): #### IMAGE GENERATION -def test_img_gen(client_no_auth): +@mock_patch_aimage_generation() +def test_img_gen(mock_aimage_generation, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -185,6 +343,14 @@ def test_img_gen(client_no_auth): response = client_no_auth.post("/v1/images/generations", json=test_data) + mock_aimage_generation.assert_called_once_with( + model='dall-e-3', + prompt='A cute baby sea otter', + n=1, + size='1024x1024', + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["url"])) @@ -249,7 +415,8 @@ class MyCustomHandler(CustomLogger): customHandler = MyCustomHandler() -def test_chat_completion_optional_params(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion_optional_params(mock_acompletion, client_no_auth): # [PROXY: PROD TEST] - DO NOT DELETE # This tests if all the /chat/completion params are passed to litellm try: @@ -267,6 +434,20 @@ def test_chat_completion_optional_params(client_no_auth): litellm.callbacks = [customHandler] print("testing proxy server: optional params") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "hi"}, + ], + max_tokens=10, + user="proxy-user", + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(f"Received response: {result}") diff --git a/litellm/tests/test_token_counter.py b/litellm/tests/test_token_counter.py index af0db487e..4d759d4cf 100644 --- a/litellm/tests/test_token_counter.py +++ b/litellm/tests/test_token_counter.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import time -from litellm import 
token_counter, encode, decode +from litellm import token_counter, create_pretrained_tokenizer, encode, decode def test_token_counter_normal_plus_function_calling(): @@ -69,15 +69,23 @@ def test_tokenizers(): model="meta-llama/Llama-2-7b-chat", text=sample_text ) + # llama3 tokenizer (also testing custom tokenizer) + llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text) + + llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer") + llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text) + print( - f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}" + f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}" ) # assert that all token values are different assert ( - openai_tokens != cohere_tokens != llama2_tokens + openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1 ), "Token values are not different." + assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same." + print("test tokenizer: It worked!") except Exception as e: pytest.fail(f"An exception occured: {e}") diff --git a/litellm/tests/test_utils.py b/litellm/tests/test_utils.py index 44fb1607c..57b93df9c 100644 --- a/litellm/tests/test_utils.py +++ b/litellm/tests/test_utils.py @@ -20,6 +20,8 @@ from litellm.utils import ( validate_environment, function_to_dict, token_counter, + create_pretrained_tokenizer, + create_tokenizer, ) # Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils' diff --git a/litellm/utils.py b/litellm/utils.py index c4117bdb3..ec296e9dc 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -378,16 +378,13 @@ class Message(OpenAIObject): super(Message, self).__init__(**params) self.content = content self.role = role - self.tool_calls = None - self.function_call = None - if function_call is not None: self.function_call = FunctionCall(**function_call) if tool_calls is not None: - self.tool_calls = [ - ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls - ] + self.tool_calls = [] + for tool_call in tool_calls: + self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call)) if logprobs is not None: self._logprobs = ChoiceLogprobs(**logprobs) @@ -413,8 +410,6 @@ class Message(OpenAIObject): class Delta(OpenAIObject): - tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None - def __init__( self, content=None, @@ -1700,10 +1695,17 @@ class Logging: print_verbose("reaches langfuse for streaming logging!") result = kwargs["complete_streaming_response"] if langFuseLogger is None or ( - self.langfuse_public_key != langFuseLogger.public_key - and self.langfuse_secret != langFuseLogger.secret_key + ( + self.langfuse_public_key is not None + and self.langfuse_public_key + != langFuseLogger.public_key + ) + and ( + self.langfuse_public_key is not None + and self.langfuse_public_key + != langFuseLogger.public_key + ) ): - print_verbose("Instantiates langfuse client") langFuseLogger = LangFuseLogger( langfuse_public_key=self.langfuse_public_key, langfuse_secret=self.langfuse_secret, @@ -3773,29 +3775,34 @@ def _select_tokenizer(model: str): elif "llama-2" in model.lower() or "replicate" in model.lower(): 
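        # llama-2 (and Replicate-hosted) models reuse the Hugging Face llama tokenizer below;
        # the new `elif "llama-3"` branch added in this hunk does the same with
        # "Xenova/llama-3-tokenizer", and any model that matches no branch falls
        # through to the tiktoken default at the end of _select_tokenizer.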
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + # llama3 + elif "llama-3" in model.lower(): + tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} # default - tiktoken else: return {"type": "openai_tokenizer", "tokenizer": encoding} -def encode(model: str, text: str): +def encode(model="", text="", custom_tokenizer: Optional[dict] = None): """ Encodes the given text using the specified model. Args: model (str): The name of the model to use for tokenization. + custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None. text (str): The text to be encoded. Returns: enc: The encoded text. """ - tokenizer_json = _select_tokenizer(model=model) + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) enc = tokenizer_json["tokenizer"].encode(text) return enc -def decode(model: str, tokens: List[int]): - tokenizer_json = _select_tokenizer(model=model) +def decode(model="", tokens: List[int] = [], custom_tokenizer: Optional[dict] = None): + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) dec = tokenizer_json["tokenizer"].decode(tokens) return dec @@ -3967,10 +3974,47 @@ def calculage_img_tokens( tile_tokens = (base_tokens * 2) * tiles_needed_high_res total_tokens = base_tokens + tile_tokens return total_tokens + + +def create_pretrained_tokenizer( + identifier: str, + revision="main", + auth_token: Optional[str] = None +): + """ + Creates a tokenizer from an existing file on a HuggingFace repository to be used with `token_counter`. + + Args: + identifier (str): The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file + revision (str, defaults to main): A branch or commit id + auth_token (str, optional, defaults to None): An optional auth token used to access private repositories on the Hugging Face Hub + + Returns: + dict: A dictionary with the tokenizer and its type. + """ + + tokenizer = Tokenizer.from_pretrained(identifier, revision=revision, auth_token=auth_token) + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + + +def create_tokenizer(json: str): + """ + Creates a tokenizer from a valid JSON string for use with `token_counter`. + + Args: + json (str): A valid JSON string representing a previously serialized tokenizer + + Returns: + dict: A dictionary with the tokenizer and its type. + """ + + tokenizer = Tokenizer.from_str(json) + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} def token_counter( model="", + custom_tokenizer: Optional[dict] = None, text: Optional[Union[str, List[str]]] = None, messages: Optional[List] = None, count_response_tokens: Optional[bool] = False, @@ -3980,13 +4024,14 @@ def token_counter( Args: model (str): The name of the model to use for tokenization. Default is an empty string. + custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None. text (str): The raw text string to be passed to the model. Default is None. messages (Optional[List[Dict[str, str]]]): Alternative to passing in text. 
A list of dictionaries representing messages with "role" and "content" keys. Default is None. Returns: int: The number of tokens in the text. """ - # use tiktoken, anthropic, cohere or llama2's tokenizer depending on the model + # use tiktoken, anthropic, cohere, llama2, or llama3's tokenizer depending on the model is_tool_call = False num_tokens = 0 if text == None: @@ -4028,8 +4073,8 @@ def token_counter( elif isinstance(text, str): count_response_tokens = True # user just trying to count tokens for a text. don't add the chat_ml +3 tokens to this - if model is not None: - tokenizer_json = _select_tokenizer(model=model) + if model is not None or custom_tokenizer is not None: + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) if tokenizer_json["type"] == "huggingface_tokenizer": print_verbose( f"Token Counter - using hugging face token counter, for model={model}" @@ -6768,7 +6813,7 @@ def validate_environment(model: Optional[str] = None) -> dict: keys_in_environment = True else: missing_keys.append("NLP_CLOUD_API_KEY") - elif custom_llm_provider == "bedrock": + elif custom_llm_provider == "bedrock" or custom_llm_provider == "sagemaker": if ( "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ @@ -6782,11 +6827,72 @@ def validate_environment(model: Optional[str] = None) -> dict: keys_in_environment = True else: missing_keys.append("OLLAMA_API_BASE") + elif custom_llm_provider == "anyscale": + if "ANYSCALE_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("ANYSCALE_API_KEY") + elif custom_llm_provider == "deepinfra": + if "DEEPINFRA_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("DEEPINFRA_API_KEY") + elif custom_llm_provider == "gemini": + if "GEMINI_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("GEMINI_API_KEY") + elif custom_llm_provider == "groq": + if "GROQ_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("GROQ_API_KEY") + elif custom_llm_provider == "mistral": + if "MISTRAL_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("MISTRAL_API_KEY") + elif custom_llm_provider == "palm": + if "PALM_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("PALM_API_KEY") + elif custom_llm_provider == "perplexity": + if "PERPLEXITYAI_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("PERPLEXITYAI_API_KEY") + elif custom_llm_provider == "voyage": + if "VOYAGE_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("VOYAGE_API_KEY") + elif custom_llm_provider == "fireworks_ai": + if ( + "FIREWORKS_AI_API_KEY" in os.environ + or "FIREWORKS_API_KEY" in os.environ + or "FIREWORKSAI_API_KEY" in os.environ + or "FIREWORKS_AI_TOKEN" in os.environ + ): + keys_in_environment = True + else: + missing_keys.append("FIREWORKS_AI_API_KEY") + elif custom_llm_provider == "cloudflare": + if "CLOUDFLARE_API_KEY" in os.environ and ( + "CLOUDFLARE_ACCOUNT_ID" in os.environ + or "CLOUDFLARE_API_BASE" in os.environ + ): + keys_in_environment = True + else: + missing_keys.append("CLOUDFLARE_API_KEY") + missing_keys.append("CLOUDFLARE_API_BASE") else: ## openai - chatcompletion + text completion if ( model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models + or model in litellm.open_ai_embedding_models + or model in litellm.openai_image_generation_models ): if "OPENAI_API_KEY" in 
os.environ: keys_in_environment = True @@ -6817,7 +6923,11 @@ def validate_environment(model: Optional[str] = None) -> dict: else: missing_keys.append("OPENROUTER_API_KEY") ## vertex - text + chat models - elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models: + elif ( + model in litellm.vertex_chat_models + or model in litellm.vertex_text_models + or model in litellm.models_by_provider["vertex_ai"] + ): if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ: keys_in_environment = True else: diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index ce6f9b800..7fcd425bb 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -338,6 +338,18 @@ "output_cost_per_second": 0.0001, "litellm_provider": "azure" }, + "azure/gpt-4-turbo-2024-04-09": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "azure/gpt-4-0125-preview": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -813,6 +825,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 264 }, "claude-3-opus-20240229": { @@ -824,6 +837,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "claude-3-sonnet-20240229": { @@ -835,6 +849,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 159 }, "text-bison": { @@ -1142,7 +1157,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -1152,7 +1168,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -1162,7 +1179,8 @@ "output_cost_per_token": 0.0000075, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "textembedding-gecko": { "max_tokens": 3072, @@ -1581,6 +1599,7 @@ "litellm_provider": "openrouter", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "openrouter/google/palm-2-chat-bison": { @@ -1929,7 +1948,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-haiku-20240307-v1:0": { "max_tokens": 4096, @@ -1939,7 +1959,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-opus-20240229-v1:0": { "max_tokens": 4096, @@ -1949,7 +1970,8 @@ "output_cost_per_token": 0.000075, "litellm_provider": "bedrock", "mode": "chat", - 
"supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-v1": { "max_tokens": 8191, diff --git a/poetry.lock b/poetry.lock index 817a7e968..d699425e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1153,13 +1153,13 @@ typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "t [[package]] name = "idna" -version = "3.6" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, - {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]]