diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a33473b72..d429bc6b8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
 -   repo: local
     hooks:
-    -   id: mypy
-        name: mypy
-        entry: python3 -m mypy --ignore-missing-imports
-        language: system
-        types: [python]
-        files: ^litellm/
+    # -   id: mypy
+    #     name: mypy
+    #     entry: python3 -m mypy --ignore-missing-imports
+    #     language: system
+    #     types: [python]
+    #     files: ^litellm/
     -   id: isort
         name: isort
         entry: isort
diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md
index 82a7c37db..9492920d0 100644
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@@ -1426,6 +1426,7 @@ litellm_settings:
 ```shell
 DD_API_KEY="5f2d0f310***********" # your datadog API Key
 DD_SITE="us5.datadoghq.com"       # your datadog base url
+DD_SOURCE="litellm_dev"       # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments
 ```
 
 **Step 3**: Start the proxy, make a test request
diff --git a/litellm/caching.py b/litellm/caching.py
index db2f93507..13da3cb1e 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -2039,10 +2039,7 @@ class DualCache(BaseCache):
 
             return result
         except Exception as e:
-            verbose_logger.exception(
-                f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
-            )
-            raise e
+            raise e  # don't log if exception is raised
 
     async def async_set_cache_sadd(
         self, key, value: List, local_only: bool = False, **kwargs
@@ -2069,10 +2066,7 @@ class DualCache(BaseCache):
 
             return None
         except Exception as e:
-            verbose_logger.exception(
-                "LiteLLM Cache: Excepton async set_cache_sadd: {}".format(str(e))
-            )
-            raise e
+            raise e  # don't log, if exception is raised
 
     def flush_cache(self):
         if self.in_memory_cache is not None:
@@ -2543,7 +2537,6 @@ class Cache:
             self.cache.set_cache(cache_key, cached_data, **kwargs)
         except Exception as e:
             verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
-            pass
 
     async def async_add_cache(self, result, *args, **kwargs):
         """
diff --git a/litellm/integrations/braintrust_logging.py b/litellm/integrations/braintrust_logging.py
index 3e1c429de..5128fcc49 100644
--- a/litellm/integrations/braintrust_logging.py
+++ b/litellm/integrations/braintrust_logging.py
@@ -235,10 +235,7 @@ class BraintrustLogger(CustomLogger):
             except httpx.HTTPStatusError as e:
                 raise Exception(e.response.text)
         except Exception as e:
-            verbose_logger.exception(
-                "Error logging to braintrust - Exception received - {}".format(str(e))
-            )
-            raise e
+            raise e  # don't use verbose_logger.exception, if exception is raised
 
     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
         verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
@@ -360,10 +357,7 @@ class BraintrustLogger(CustomLogger):
             except httpx.HTTPStatusError as e:
                 raise Exception(e.response.text)
         except Exception as e:
-            verbose_logger.exception(
-                "Error logging to braintrust - Exception received - {}".format(str(e))
-            )
-            raise e
+            raise e  # don't use verbose_logger.exception, if exception is raised
 
     def log_failure_event(self, kwargs, response_obj, start_time, end_time):
         return super().log_failure_event(kwargs, response_obj, start_time, end_time)
diff --git a/litellm/integrations/datadog.py b/litellm/integrations/datadog.py
index f3170e446..e6ff29e04 100644
--- a/litellm/integrations/datadog.py
+++ b/litellm/integrations/datadog.py
@@ -1,11 +1,17 @@
 #### What this does ####
 #    On success + failure, log events to Datadog
 
-import dotenv, os
-import requests  # type: ignore
+import datetime
+import os
+import subprocess
+import sys
 import traceback
-import datetime, subprocess, sys
-import litellm, uuid
+import uuid
+
+import dotenv
+import requests  # type: ignore
+
+import litellm
 from litellm._logging import print_verbose, verbose_logger
 
 
@@ -57,9 +63,9 @@ class DataDogLogger:
     ):
         try:
             # Define DataDog client
-            from datadog_api_client.v2.api.logs_api import LogsApi
             from datadog_api_client.v2 import ApiClient
-            from datadog_api_client.v2.models import HTTPLogItem, HTTPLog
+            from datadog_api_client.v2.api.logs_api import LogsApi
+            from datadog_api_client.v2.models import HTTPLog, HTTPLogItem
 
             verbose_logger.debug(
                 f"datadog Logging - Enters logging function for model {kwargs}"
@@ -131,7 +137,7 @@ class DataDogLogger:
                 body = HTTPLog(
                     [
                         HTTPLogItem(
-                            ddsource="litellm",
+                            ddsource=os.getenv("DD_SOURCE", "litellm"),
                             message=payload,
                             service="litellm-server",
                         ),
diff --git a/litellm/llms/anthropic/chat.py b/litellm/llms/anthropic/chat.py
index c3ad03859..dd7ab58c1 100644
--- a/litellm/llms/anthropic/chat.py
+++ b/litellm/llms/anthropic/chat.py
@@ -228,6 +228,54 @@ class AnthropicConfig:
 
         return False
 
+    def translate_system_message(
+        self, messages: List[AllMessageValues]
+    ) -> List[AnthropicSystemMessageContent]:
+        system_prompt_indices = []
+        anthropic_system_message_list: List[AnthropicSystemMessageContent] = []
+        for idx, message in enumerate(messages):
+            if message["role"] == "system":
+                valid_content: bool = False
+                system_message_block = ChatCompletionSystemMessage(**message)
+                if isinstance(system_message_block["content"], str):
+                    anthropic_system_message_content = AnthropicSystemMessageContent(
+                        type="text",
+                        text=system_message_block["content"],
+                    )
+                    if "cache_control" in system_message_block:
+                        anthropic_system_message_content["cache_control"] = (
+                            system_message_block["cache_control"]
+                        )
+                    anthropic_system_message_list.append(
+                        anthropic_system_message_content
+                    )
+                    valid_content = True
+                elif isinstance(message["content"], list):
+                    for _content in message["content"]:
+                        anthropic_system_message_content = (
+                            AnthropicSystemMessageContent(
+                                type=_content.get("type"),
+                                text=_content.get("text"),
+                            )
+                        )
+                        if "cache_control" in _content:
+                            anthropic_system_message_content["cache_control"] = (
+                                _content["cache_control"]
+                            )
+
+                        anthropic_system_message_list.append(
+                            anthropic_system_message_content
+                        )
+                    valid_content = True
+
+                if valid_content:
+                    system_prompt_indices.append(idx)
+        if len(system_prompt_indices) > 0:
+            for idx in reversed(system_prompt_indices):
+                messages.pop(idx)
+
+        return anthropic_system_message_list
+
     ### FOR [BETA] `/v1/messages` endpoint support
 
     def translatable_anthropic_params(self) -> List:
@@ -314,7 +362,7 @@ class AnthropicConfig:
                 new_messages.append(user_message)
 
             if len(new_user_content_list) > 0:
-                new_messages.append({"role": "user", "content": new_user_content_list})
+                new_messages.append({"role": "user", "content": new_user_content_list})  # type: ignore
 
             if len(tool_message_list) > 0:
                 new_messages.extend(tool_message_list)
@@ -940,45 +988,11 @@ class AnthropicChatCompletion(BaseLLM):
             )
         else:
             # Separate system prompt from rest of message
-            system_prompt_indices = []
-            system_prompt = ""
-            anthropic_system_message_list = None
-            for idx, message in enumerate(messages):
-                if message["role"] == "system":
-                    valid_content: bool = False
-                    if isinstance(message["content"], str):
-                        system_prompt += message["content"]
-                        valid_content = True
-                    elif isinstance(message["content"], list):
-                        for _content in message["content"]:
-                            anthropic_system_message_content = (
-                                AnthropicSystemMessageContent(
-                                    type=_content.get("type"),
-                                    text=_content.get("text"),
-                                )
-                            )
-                            if "cache_control" in _content:
-                                anthropic_system_message_content["cache_control"] = (
-                                    _content["cache_control"]
-                                )
-
-                            if anthropic_system_message_list is None:
-                                anthropic_system_message_list = []
-                            anthropic_system_message_list.append(
-                                anthropic_system_message_content
-                            )
-                        valid_content = True
-
-                    if valid_content:
-                        system_prompt_indices.append(idx)
-            if len(system_prompt_indices) > 0:
-                for idx in reversed(system_prompt_indices):
-                    messages.pop(idx)
-            if len(system_prompt) > 0:
-                optional_params["system"] = system_prompt
-
+            anthropic_system_message_list = AnthropicConfig().translate_system_message(
+                messages=messages
+            )
             # Handling anthropic API Prompt Caching
-            if anthropic_system_message_list is not None:
+            if len(anthropic_system_message_list) > 0:
                 optional_params["system"] = anthropic_system_message_list
             # Format rest of message according to anthropic guidelines
             try:
@@ -986,15 +1000,10 @@ class AnthropicChatCompletion(BaseLLM):
                     model=model, messages=messages, custom_llm_provider="anthropic"
                 )
             except Exception as e:
-                verbose_logger.exception(
-                    "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
-                        str(e), messages
-                    )
-                )
                 raise AnthropicError(
                     status_code=400,
                     message="{}\nReceived Messages={}".format(str(e), messages),
-                )
+                )  # don't use verbose_logger.exception, if exception is raised
 
         ## Load Config
         config = litellm.AnthropicConfig.get_config()
diff --git a/litellm/llms/base_aws_llm.py b/litellm/llms/base_aws_llm.py
index 7449dc2d7..70f333eb6 100644
--- a/litellm/llms/base_aws_llm.py
+++ b/litellm/llms/base_aws_llm.py
@@ -119,8 +119,6 @@ class BaseAWSLLM(BaseLLM):
                     "aws_web_identity_token": aws_web_identity_token,
                     "aws_role_name": aws_role_name,
                     "aws_session_name": aws_session_name,
-                    "aws_region_name": aws_region_name,
-                    "aws_sts_endpoint": sts_endpoint,
                 }
             )
 
@@ -147,6 +145,7 @@ class BaseAWSLLM(BaseLLM):
                     RoleSessionName=aws_session_name,
                     WebIdentityToken=oidc_token,
                     DurationSeconds=3600,
+                    Policy='{"Version":"2012-10-17","Statement":[{"Sid":"BedrockLiteLLM","Effect":"Allow","Action":["bedrock:InvokeModel","bedrock:InvokeModelWithResponseStream"],"Resource":"*","Condition":{"Bool":{"aws:SecureTransport":"true"},"StringLike":{"aws:UserAgent":"litellm/*"}}}]}',
                 )
 
                 iam_creds_dict = {
@@ -164,6 +163,11 @@ class BaseAWSLLM(BaseLLM):
                     ttl=3600 - 60,
                 )
 
+                if sts_response["PackedPolicySize"] > 75:
+                    verbose_logger.warning(
+                        f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
+                    )
+
             session = boto3.Session(**iam_creds_dict)
 
             iam_creds = session.get_credentials()
diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py
index 9f62bab20..70678af64 100644
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@@ -423,13 +423,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
                 async for transformed_chunk in streamwrapper:
                     yield transformed_chunk
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama.py::ollama_async_streaming(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised
 
 
 async def ollama_acompletion(
@@ -498,12 +492,7 @@ async def ollama_acompletion(
             )
             return model_response
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama.py::ollama_acompletion(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised
 
 
 async def ollama_aembeddings(
diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py
index 7c4cf7b37..2fc44f9cd 100644
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@@ -583,8 +583,4 @@ async def ollama_acompletion(
             )
             return model_response
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama_acompletion(): Exception occured - {}".format(str(e))
-        )
-
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised
diff --git a/litellm/llms/palm.py b/litellm/llms/palm.py
index a17fd02be..f6297e627 100644
--- a/litellm/llms/palm.py
+++ b/litellm/llms/palm.py
@@ -168,9 +168,6 @@ def completion(
             choices_list.append(choice_obj)
         model_response.choices = choices_list  # type: ignore
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
-        )
         raise PalmError(
             message=traceback.format_exc(), status_code=response.status_code
         )
diff --git a/litellm/llms/predibase.py b/litellm/llms/predibase.py
index 84e2810a5..81e28934d 100644
--- a/litellm/llms/predibase.py
+++ b/litellm/llms/predibase.py
@@ -564,12 +564,9 @@ class PredibaseChatCompletion(BaseLLM):
             for exception in litellm.LITELLM_EXCEPTION_TYPES:
                 if isinstance(e, exception):
                     raise e
-            verbose_logger.exception(
-                "litellm.llms.predibase.py::async_completion() - Exception occurred - {}".format(
-                    str(e)
-                )
-            )
-            raise PredibaseError(status_code=500, message="{}".format(str(e)))
+            raise PredibaseError(
+                status_code=500, message="{}".format(str(e))
+            )  # don't use verbose_logger.exception, if exception is raised
         return self.process_response(
             model=model,
             response=response,
diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index a1894d87f..d2b9db037 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -27,10 +27,13 @@ from litellm.types.completion import (
 from litellm.types.llms.anthropic import *
 from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
 from litellm.types.llms.openai import (
+    AllMessageValues,
     ChatCompletionAssistantMessage,
+    ChatCompletionAssistantToolCall,
     ChatCompletionFunctionMessage,
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionToolMessage,
+    ChatCompletionUserMessage,
 )
 from litellm.types.utils import GenericImageParsingChunk
 
@@ -493,10 +496,9 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] =
 
         return rendered_text
     except Exception as e:
-        verbose_logger.exception(
-            "Error rendering huggingface chat template - {}".format(str(e))
-        )
-        raise Exception(f"Error rendering template - {str(e)}")
+        raise Exception(
+            f"Error rendering template - {str(e)}"
+        )  # don't use verbose_logger.exception, if exception is raised
 
 
 # Anthropic template
@@ -1171,7 +1173,9 @@ def convert_to_gemini_tool_call_result(
     return _part
 
 
-def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResultParam:
+def convert_to_anthropic_tool_result(
+    message: Union[dict, ChatCompletionToolMessage, ChatCompletionFunctionMessage]
+) -> AnthropicMessagesToolResultParam:
     """
     OpenAI message with a tool result looks like:
     {
@@ -1215,7 +1219,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
         return anthropic_tool_result
     if message["role"] == "function":
         content = message.get("content")  # type: ignore
-        tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())
+        tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())  # type: ignore
         anthropic_tool_result = AnthropicMessagesToolResultParam(
             type="tool_result", tool_use_id=tool_call_id, content=content
         )
@@ -1230,7 +1234,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
 
 
 def convert_function_to_anthropic_tool_invoke(
-    function_call,
+    function_call: Union[dict, ChatCompletionToolCallFunctionChunk],
 ) -> List[AnthropicMessagesToolUseParam]:
     try:
         anthropic_tool_invoke = [
@@ -1247,7 +1251,7 @@ def convert_function_to_anthropic_tool_invoke(
 
 
 def convert_to_anthropic_tool_invoke(
-    tool_calls: list,
+    tool_calls: List[ChatCompletionAssistantToolCall],
 ) -> List[AnthropicMessagesToolUseParam]:
     """
     OpenAI tool invokes:
@@ -1307,17 +1311,19 @@ def add_cache_control_to_content(
     anthropic_content_element: Union[
         dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
     ],
-    orignal_content_element: dict,
+    orignal_content_element: Union[dict, AllMessageValues],
 ):
-    if "cache_control" in orignal_content_element:
-        anthropic_content_element["cache_control"] = orignal_content_element[
-            "cache_control"
-        ]
+    cache_control_param = orignal_content_element.get("cache_control")
+    if cache_control_param is not None and isinstance(cache_control_param, dict):
+        transformed_param = ChatCompletionCachedContent(**cache_control_param)  # type: ignore
+
+        anthropic_content_element["cache_control"] = transformed_param
+
     return anthropic_content_element
 
 
 def anthropic_messages_pt(
-    messages: list,
+    messages: List[AllMessageValues],
     model: str,
     llm_provider: str,
 ) -> List[
@@ -1348,10 +1354,21 @@ def anthropic_messages_pt(
     while msg_i < len(messages):
         user_content: List[AnthropicMessagesUserMessageValues] = []
         init_msg_i = msg_i
+        if isinstance(messages[msg_i], BaseModel):
+            messages[msg_i] = dict(messages[msg_i])  # type: ignore
         ## MERGE CONSECUTIVE USER CONTENT ##
         while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
-            if isinstance(messages[msg_i]["content"], list):
-                for m in messages[msg_i]["content"]:
+            user_message_types_block: Union[
+                ChatCompletionToolMessage,
+                ChatCompletionUserMessage,
+                ChatCompletionFunctionMessage,
+            ] = messages[
+                msg_i
+            ]  # type: ignore
+            if user_message_types_block["content"] and isinstance(
+                user_message_types_block["content"], list
+            ):
+                for m in user_message_types_block["content"]:
                     if m.get("type", "") == "image_url":
                         image_chunk = convert_to_anthropic_image_obj(
                             m["image_url"]["url"]
@@ -1382,15 +1399,24 @@ def anthropic_messages_pt(
                         )
                         user_content.append(anthropic_content_element)
             elif (
-                messages[msg_i]["role"] == "tool"
-                or messages[msg_i]["role"] == "function"
+                user_message_types_block["role"] == "tool"
+                or user_message_types_block["role"] == "function"
             ):
                 # OpenAI's tool message content will always be a string
-                user_content.append(convert_to_anthropic_tool_result(messages[msg_i]))
-            else:
                 user_content.append(
-                    {"type": "text", "text": messages[msg_i]["content"]}
+                    convert_to_anthropic_tool_result(user_message_types_block)
                 )
+            elif isinstance(user_message_types_block["content"], str):
+                _anthropic_content_text_element: AnthropicMessagesTextParam = {
+                    "type": "text",
+                    "text": user_message_types_block["content"],
+                }
+                anthropic_content_element = add_cache_control_to_content(
+                    anthropic_content_element=_anthropic_content_text_element,
+                    orignal_content_element=user_message_types_block,
+                )
+
+                user_content.append(anthropic_content_element)
 
             msg_i += 1
 
@@ -1400,10 +1426,11 @@ def anthropic_messages_pt(
         assistant_content: List[AnthropicMessagesAssistantMessageValues] = []
         ## MERGE CONSECUTIVE ASSISTANT CONTENT ##
         while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
-            if "content" in messages[msg_i] and isinstance(
-                messages[msg_i]["content"], list
+            assistant_content_block: ChatCompletionAssistantMessage = messages[msg_i]  # type: ignore
+            if "content" in assistant_content_block and isinstance(
+                assistant_content_block["content"], list
             ):
-                for m in messages[msg_i]["content"]:
+                for m in assistant_content_block["content"]:
                     # handle text
                     if (
                         m.get("type", "") == "text" and len(m.get("text", "")) > 0
@@ -1417,35 +1444,37 @@ def anthropic_messages_pt(
                         )
                         assistant_content.append(anthropic_message)
             elif (
-                "content" in messages[msg_i]
-                and isinstance(messages[msg_i]["content"], str)
-                and len(messages[msg_i]["content"])
-                > 0  # don't pass empty text blocks. anthropic api raises errors.
+                "content" in assistant_content_block
+                and isinstance(assistant_content_block["content"], str)
+                and assistant_content_block[
+                    "content"
+                ]  # don't pass empty text blocks. anthropic api raises errors.
             ):
 
                 _anthropic_text_content_element = {
                     "type": "text",
-                    "text": messages[msg_i]["content"],
+                    "text": assistant_content_block["content"],
                 }
 
                 anthropic_content_element = add_cache_control_to_content(
                     anthropic_content_element=_anthropic_text_content_element,
-                    orignal_content_element=messages[msg_i],
+                    orignal_content_element=assistant_content_block,
                 )
                 assistant_content.append(anthropic_content_element)
 
-            if messages[msg_i].get(
-                "tool_calls", []
+            assistant_tool_calls = assistant_content_block.get("tool_calls")
+            if (
+                assistant_tool_calls is not None
             ):  # support assistant tool invoke conversion
                 assistant_content.extend(
-                    convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"])
+                    convert_to_anthropic_tool_invoke(assistant_tool_calls)
                 )
 
-            if messages[msg_i].get("function_call"):
+            assistant_function_call = assistant_content_block.get("function_call")
+
+            if assistant_function_call is not None:
                 assistant_content.extend(
-                    convert_function_to_anthropic_tool_invoke(
-                        messages[msg_i]["function_call"]
-                    )
+                    convert_function_to_anthropic_tool_invoke(assistant_function_call)
                 )
 
             msg_i += 1
diff --git a/litellm/llms/text_completion_codestral.py b/litellm/llms/text_completion_codestral.py
index 9dbe3bb37..9bcd64631 100644
--- a/litellm/llms/text_completion_codestral.py
+++ b/litellm/llms/text_completion_codestral.py
@@ -491,14 +491,9 @@ class CodestralTextCompletion(BaseLLM):
                 message="HTTPStatusError - {}".format(e.response.text),
             )
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.llms.text_completion_codestral.py::async_completion() - Exception occurred - {}".format(
-                    str(e)
-                )
-            )
             raise TextCompletionCodestralError(
                 status_code=500, message="{}".format(str(e))
-            )
+            )  # don't use verbose_logger.exception, if exception is raised
         return self.process_text_completion_response(
             model=model,
             response=response,
diff --git a/litellm/main.py b/litellm/main.py
index e35014abb..9e7297e11 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -445,9 +445,6 @@ async def acompletion(
             )  # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
         return response
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.main.py::acompletion() - Exception occurred - {}".format(str(e))
-        )
         custom_llm_provider = custom_llm_provider or "openai"
         raise exception_type(
             model=model,
@@ -616,9 +613,6 @@ def mock_completion(
     except Exception as e:
         if isinstance(e, openai.APIError):
             raise e
-        verbose_logger.exception(
-            "litellm.mock_completion(): Exception occured - {}".format(str(e))
-        )
         raise Exception("Mock completion response failed")
 
 
@@ -5125,9 +5119,6 @@ async def ahealth_check(
                 response = {}  # args like remaining ratelimit etc.
         return response
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.ahealth_check(): Exception occured - {}".format(str(e))
-        )
         stack_trace = traceback.format_exc()
         if isinstance(stack_trace, str):
             stack_trace = stack_trace[:1000]
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 6c1078fa8..c015f2085 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,6 +1,16 @@
+
 model_list:
-  - model_name: "whisper"
-    litellm_params:
-      model: "azure/azure-whisper"
-      api_key: os.environ/AZURE_EUROPE_API_KEY
-      api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+- model_name: gpt-4o-mini-2024-07-18
+  litellm_params:
+    api_key: API_KEY
+    model: openai/gpt-4o-mini-2024-07-18
+    rpm: 0
+    tpm: 100
+
+router_settings:
+  num_retries: 0
+  routing_strategy: usage-based-routing-v2
+  timeout: 10
+
+litellm_settings:
+  callbacks: custom_callbacks.proxy_handler_instance
diff --git a/litellm/proxy/auth/user_api_key_auth.py b/litellm/proxy/auth/user_api_key_auth.py
index 4c16c0345..2480263df 100644
--- a/litellm/proxy/auth/user_api_key_auth.py
+++ b/litellm/proxy/auth/user_api_key_auth.py
@@ -386,7 +386,6 @@ async def user_api_key_auth(
                     parent_otel_span=parent_otel_span,
                 )
         #### ELSE ####
-
         ## CHECK PASS-THROUGH ENDPOINTS ##
         if pass_through_endpoints is not None:
             for endpoint in pass_through_endpoints:
diff --git a/litellm/proxy/custom_callbacks.py b/litellm/proxy/custom_callbacks.py
index 40fc0d369..1516bfd24 100644
--- a/litellm/proxy/custom_callbacks.py
+++ b/litellm/proxy/custom_callbacks.py
@@ -1,66 +1,10 @@
 from litellm.integrations.custom_logger import CustomLogger
-import litellm
 
 
-# This file includes the custom callbacks for LiteLLM Proxy
-# Once defined, these can be passed in proxy_config.yaml
 class MyCustomHandler(CustomLogger):
-    def log_pre_api_call(self, model, messages, kwargs):
-        print(f"Pre-API Call")  # noqa
-
-    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
-        print(f"Post-API Call")  # noqa
-
-    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"On Stream")  # noqa
-
-    def log_success_event(self, kwargs, response_obj, start_time, end_time):
-        print("On Success")  # noqa
-
-    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"On Failure")  # noqa
-
-    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"ishaan async_log_success_event")  # noqa
-        # log: key, user, model, prompt, response, tokens, cost
-        # Access kwargs passed to litellm.completion()
-        model = kwargs.get("model", None)
-        messages = kwargs.get("messages", None)
-        user = kwargs.get("user", None)
-
-        # Access litellm_params passed to litellm.completion(), example access `metadata`
-        litellm_params = kwargs.get("litellm_params", {})
-        metadata = litellm_params.get(
-            "metadata", {}
-        )  # headers passed to LiteLLM proxy, can be found here
-
-        return
-
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        try:
-            print(f"On Async Failure !")  # noqa
-            print("\nkwargs", kwargs)  # noqa
-            # Access kwargs passed to litellm.completion()
-            model = kwargs.get("model", None)
-            messages = kwargs.get("messages", None)
-            user = kwargs.get("user", None)
-
-            # Access litellm_params passed to litellm.completion(), example access `metadata`
-            litellm_params = kwargs.get("litellm_params", {})
-            metadata = litellm_params.get(
-                "metadata", {}
-            )  # headers passed to LiteLLM proxy, can be found here
-
-            # Acess Exceptions & Traceback
-            exception_event = kwargs.get("exception", None)
-            traceback_event = kwargs.get("traceback_exception", None)
-
-            # Calculate cost using  litellm.completion_cost()
-        except Exception as e:
-            print(f"Exception: {e}")  # noqa
+        # print("Call failed")
+        pass
 
 
 proxy_handler_instance = MyCustomHandler()
-
-# Set litellm.callbacks = [proxy_handler_instance] on the proxy
-# need to set litellm.callbacks = [proxy_handler_instance] # on the proxy
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index be8ccbaeb..b0eab1ba8 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -6183,6 +6183,64 @@ async def delete_end_user(
     pass
 
 
+@router.get(
+    "/customer/list",
+    tags=["Customer Management"],
+    dependencies=[Depends(user_api_key_auth)],
+    response_model=List[LiteLLM_EndUserTable],
+)
+@router.get(
+    "/end_user/list",
+    tags=["Customer Management"],
+    include_in_schema=False,
+    dependencies=[Depends(user_api_key_auth)],
+)
+async def list_team(
+    http_request: Request,
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+):
+    """
+    [Admin-only] List all available customers
+
+    ```
+    curl --location --request GET 'http://0.0.0.0:4000/customer/list' \
+        --header 'Authorization: Bearer sk-1234'
+    ```
+    """
+    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
+        create_audit_log_for_update,
+        litellm_proxy_admin_name,
+        prisma_client,
+    )
+
+    if (
+        user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
+        and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
+    ):
+        raise HTTPException(
+            status_code=401,
+            detail={
+                "error": "Admin-only endpoint. Your user role={}".format(
+                    user_api_key_dict.user_role
+                )
+            },
+        )
+
+    if prisma_client is None:
+        raise HTTPException(
+            status_code=400,
+            detail={"error": CommonProxyErrors.db_not_connected_error.value},
+        )
+
+    response = await prisma_client.db.litellm_endusertable.find_many()
+
+    returned_response: List[LiteLLM_EndUserTable] = []
+    for item in response:
+        returned_response.append(LiteLLM_EndUserTable(**item.model_dump()))
+    return returned_response
+
+
 async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
     if premium_user is not True:
         return
diff --git a/litellm/router.py b/litellm/router.py
index 15fb4cb27..2743a36b9 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -47,6 +47,7 @@ from litellm._logging import verbose_router_logger
 from litellm.assistants.main import AssistantDeleted
 from litellm.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.llms.azure import get_azure_ad_token_from_oidc
 from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
 from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
@@ -783,6 +784,10 @@ class Router:
                 }
             )
 
+            logging_obj: Optional[LiteLLMLogging] = kwargs.get(
+                "litellm_logging_obj", None
+            )
+
             rpm_semaphore = self._get_client(
                 deployment=deployment,
                 kwargs=kwargs,
@@ -797,11 +802,13 @@ class Router:
                     - If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
                     """
                     await self.async_routing_strategy_pre_call_checks(
-                        deployment=deployment
+                        deployment=deployment, logging_obj=logging_obj
                     )
                     response = await _response
             else:
-                await self.async_routing_strategy_pre_call_checks(deployment=deployment)
+                await self.async_routing_strategy_pre_call_checks(
+                    deployment=deployment, logging_obj=logging_obj
+                )
                 response = await _response
 
             ## CHECK CONTENT FILTER ERROR ##
@@ -3860,7 +3867,9 @@ class Router:
             if isinstance(_callback, CustomLogger):
                 response = _callback.pre_call_check(deployment)
 
-    async def async_routing_strategy_pre_call_checks(self, deployment: dict):
+    async def async_routing_strategy_pre_call_checks(
+        self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
+    ):
         """
         For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
 
@@ -3875,8 +3884,22 @@ class Router:
         for _callback in litellm.callbacks:
             if isinstance(_callback, CustomLogger):
                 try:
-                    response = await _callback.async_pre_call_check(deployment)
+                    _ = await _callback.async_pre_call_check(deployment)
                 except litellm.RateLimitError as e:
+                    ## LOG FAILURE EVENT
+                    if logging_obj is not None:
+                        asyncio.create_task(
+                            logging_obj.async_failure_handler(
+                                exception=e,
+                                traceback_exception=traceback.format_exc(),
+                                end_time=time.time(),
+                            )
+                        )
+                        ## LOGGING
+                        threading.Thread(
+                            target=logging_obj.failure_handler,
+                            args=(e, traceback.format_exc()),
+                        ).start()  # log response
                     self._set_cooldown_deployments(
                         exception_status=e.status_code,
                         original_exception=e,
@@ -3885,6 +3908,20 @@ class Router:
                     )
                     raise e
                 except Exception as e:
+                    ## LOG FAILURE EVENT
+                    if logging_obj is not None:
+                        asyncio.create_task(
+                            logging_obj.async_failure_handler(
+                                exception=e,
+                                traceback_exception=traceback.format_exc(),
+                                end_time=time.time(),
+                            )
+                        )
+                        ## LOGGING
+                        threading.Thread(
+                            target=logging_obj.failure_handler,
+                            args=(e, traceback.format_exc()),
+                        ).start()  # log response
                     raise e
 
     def _generate_model_id(self, model_group: str, litellm_params: dict):
diff --git a/litellm/tests/test_anthropic_prompt_caching.py b/litellm/tests/test_anthropic_prompt_caching.py
index b9c70f0c3..06f6916ed 100644
--- a/litellm/tests/test_anthropic_prompt_caching.py
+++ b/litellm/tests/test_anthropic_prompt_caching.py
@@ -222,6 +222,94 @@ async def test_anthropic_api_prompt_caching_basic():
     )
 
 
+@pytest.mark.asyncio()
+async def test_anthropic_api_prompt_caching_with_content_str():
+    from litellm.llms.prompt_templates.factory import anthropic_messages_pt
+
+    system_message = [
+        {
+            "role": "system",
+            "content": "Here is the full text of a complex legal agreement",
+            "cache_control": {"type": "ephemeral"},
+        },
+    ]
+    translated_system_message = litellm.AnthropicConfig().translate_system_message(
+        messages=system_message
+    )
+
+    assert translated_system_message == [
+        # System Message
+        {
+            "type": "text",
+            "text": "Here is the full text of a complex legal agreement",
+            "cache_control": {"type": "ephemeral"},
+        }
+    ]
+    user_messages = [
+        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
+        {
+            "role": "user",
+            "content": "What are the key terms and conditions in this agreement?",
+            "cache_control": {"type": "ephemeral"},
+        },
+        {
+            "role": "assistant",
+            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+        },
+        # The final turn is marked with cache-control, for continuing in followups.
+        {
+            "role": "user",
+            "content": "What are the key terms and conditions in this agreement?",
+            "cache_control": {"type": "ephemeral"},
+        },
+    ]
+
+    translated_messages = anthropic_messages_pt(
+        messages=user_messages,
+        model="claude-3-5-sonnet-20240620",
+        llm_provider="anthropic",
+    )
+
+    expected_messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What are the key terms and conditions in this agreement?",
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+                }
+            ],
+        },
+        # The final turn is marked with cache-control, for continuing in followups.
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What are the key terms and conditions in this agreement?",
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ],
+        },
+    ]
+
+    assert len(translated_messages) == len(expected_messages)
+    for idx, i in enumerate(translated_messages):
+        assert (
+            i == expected_messages[idx]
+        ), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
+
+
 @pytest.mark.asyncio()
 async def test_anthropic_api_prompt_caching_no_headers():
     litellm.set_verbose = True
diff --git a/litellm/tests/test_bedrock_completion.py b/litellm/tests/test_bedrock_completion.py
index e6c657f07..bc27c5118 100644
--- a/litellm/tests/test_bedrock_completion.py
+++ b/litellm/tests/test_bedrock_completion.py
@@ -616,8 +616,8 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth():
             aws_region_name=aws_region_name,
             aws_web_identity_token=aws_web_identity_token,
             aws_role_name=aws_role_name,
-            aws_session_name="my-test-session",
-            aws_sts_endpoint="https://sts-fips.us-west-2.amazonaws.com",
+            aws_session_name="cross-region-test",
+            aws_sts_endpoint="https://sts-fips.us-east-2.amazonaws.com",
             aws_bedrock_runtime_endpoint="https://bedrock-runtime-fips.us-west-2.amazonaws.com",
         )
         # Add any assertions here to check the response
diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py
index 7b856a284..720abf8dd 100644
--- a/litellm/types/llms/anthropic.py
+++ b/litellm/types/llms/anthropic.py
@@ -3,6 +3,8 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 from pydantic import BaseModel, validator
 from typing_extensions import Literal, Required, TypedDict
 
+from .openai import ChatCompletionCachedContent
+
 
 class AnthropicMessagesToolChoice(TypedDict, total=False):
     type: Required[Literal["auto", "any", "tool"]]
@@ -18,7 +20,7 @@ class AnthropicMessagesTool(TypedDict, total=False):
 class AnthropicMessagesTextParam(TypedDict, total=False):
     type: Literal["text"]
     text: str
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
 
 
 class AnthropicMessagesToolUseParam(TypedDict):
@@ -58,7 +60,7 @@ class AnthropicImageParamSource(TypedDict):
 class AnthropicMessagesImageParam(TypedDict, total=False):
     type: Literal["image"]
     source: AnthropicImageParamSource
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
 
 
 class AnthropicMessagesToolResultContent(TypedDict):
@@ -97,7 +99,7 @@ class AnthropicMetadata(TypedDict, total=False):
 class AnthropicSystemMessageContent(TypedDict, total=False):
     type: str
     text: str
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
 
 
 class AnthropicMessagesRequest(TypedDict, total=False):
diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py
index 0219145c6..788199c00 100644
--- a/litellm/types/llms/openai.py
+++ b/litellm/types/llms/openai.py
@@ -354,14 +354,18 @@ class ChatCompletionImageObject(TypedDict):
     image_url: ChatCompletionImageUrlObject
 
 
-class ChatCompletionUserMessage(TypedDict):
+class OpenAIChatCompletionUserMessage(TypedDict):
     role: Literal["user"]
     content: Union[
         str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
     ]
 
 
-class ChatCompletionAssistantMessage(TypedDict, total=False):
+class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
+class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
     role: Required[Literal["assistant"]]
     content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
     name: Optional[str]
@@ -369,6 +373,10 @@ class ChatCompletionAssistantMessage(TypedDict, total=False):
     function_call: Optional[ChatCompletionToolCallFunctionChunk]
 
 
+class ChatCompletionAssistantMessage(OpenAIChatCompletionAssistantMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
 class ChatCompletionToolMessage(TypedDict):
     role: Literal["tool"]
     content: str
@@ -381,12 +389,16 @@ class ChatCompletionFunctionMessage(TypedDict):
     name: str
 
 
-class ChatCompletionSystemMessage(TypedDict, total=False):
+class OpenAIChatCompletionSystemMessage(TypedDict, total=False):
     role: Required[Literal["system"]]
     content: Required[Union[str, List]]
     name: str
 
 
+class ChatCompletionSystemMessage(OpenAIChatCompletionSystemMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
 AllMessageValues = Union[
     ChatCompletionUserMessage,
     ChatCompletionAssistantMessage,
diff --git a/litellm/utils.py b/litellm/utils.py
index 06de72b9d..33d3a59a3 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8547,11 +8547,6 @@ class CustomStreamWrapper:
                 "finish_reason": finish_reason,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_predibase_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e
 
     def handle_huggingface_chunk(self, chunk):
@@ -8595,11 +8590,6 @@ class CustomStreamWrapper:
                 "finish_reason": finish_reason,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_huggingface_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e
 
     def handle_ai21_chunk(self, chunk):  # fake streaming
@@ -8826,11 +8816,6 @@ class CustomStreamWrapper:
                 "usage": usage,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_openai_chat_completion_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e
 
     def handle_azure_text_completion_chunk(self, chunk):