LiteLLM Minor fixes + improvements (08/04/2024) (#5505)

* Minor IAM AWS OIDC Improvements (#5246)

* AWS IAM: Temporary tokens are valid across all regions after being issued, so it is wasteful to request one for each region.

* AWS IAM: Include an inline session policy to help limit misuse of overly permissive IAM roles.

* (test_bedrock_completion.py): Ensure we are testing the cross-region AWS OIDC flow.
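
A minimal sketch of the pattern these changes move toward, using plain boto3 rather than LiteLLM internals: request the web-identity credentials once, scope them with an inline session policy, and reuse them for any region. The role ARN, token path, and session name below are illustrative; the session policy mirrors the one added in this commit.

```python
import json

import boto3

# One STS call issues temporary credentials that are valid in every region,
# so there is no need to repeat the call per target region.
sts = boto3.client("sts", region_name="us-east-1")

# An inline session policy further restricts what the temporary credentials
# may do, even if the underlying IAM role is broader than necessary.
session_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "BedrockLiteLLM",
            "Effect": "Allow",
            "Action": [
                "bedrock:InvokeModel",
                "bedrock:InvokeModelWithResponseStream",
            ],
            "Resource": "*",
            "Condition": {
                "Bool": {"aws:SecureTransport": "true"},
                "StringLike": {"aws:UserAgent": "litellm/*"},
            },
        }
    ],
}

with open("/var/run/secrets/oidc/token") as f:  # illustrative token location
    oidc_token = f.read()

sts_response = sts.assume_role_with_web_identity(
    RoleArn="arn:aws:iam::123456789012:role/litellm-bedrock",  # illustrative ARN
    RoleSessionName="litellm-session",
    WebIdentityToken=oidc_token,
    DurationSeconds=3600,
    Policy=json.dumps(session_policy),
)
# sts_response["PackedPolicySize"] reports how much of the session-policy
# quota the inline policy consumes (the commit warns above 75%).
creds = sts_response["Credentials"]

# The same credentials back sessions for multiple regions.
for region in ("us-east-1", "us-west-2"):
    session = boto3.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
        region_name=region,
    )
    # e.g. hand this session to a Bedrock client for that region
    print(region, session.client("bedrock-runtime").meta.endpoint_url)
```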

* fix(router.py): log rejected requests

Fixes https://github.com/BerriAI/litellm/issues/5498
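
With this change, requests rejected inside the router's pre-call checks (for example rpm/tpm limits under usage-based-routing-v2) are handed to the failure handlers, so they reach failure callbacks instead of disappearing silently. A minimal sketch of wiring that up, assuming a custom callback registered via `litellm.callbacks`; the model name and rpm limit are illustrative.

```python
import asyncio

import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger


class RejectionLogger(CustomLogger):
    # Requests rejected by the router's pre-call checks now arrive here as
    # failure events (kwargs carries "exception" / "traceback_exception").
    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("request rejected:", kwargs.get("exception"))


litellm.callbacks = [RejectionLogger()]

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini",
            "litellm_params": {"model": "openai/gpt-4o-mini", "rpm": 1},  # illustrative limit
        }
    ],
    routing_strategy="usage-based-routing-v2",
)


async def main():
    # Needs OPENAI_API_KEY for the calls that are not rejected.
    for _ in range(3):
        try:
            await router.acompletion(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": "hi"}],
            )
        except Exception as e:
            # The caller still sees the error; it is just no longer invisible
            # to the logging/alerting callbacks.
            print("caller sees:", type(e).__name__)


asyncio.run(main())
```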

* refactor: don't use verbose_logger.exception, if exception is raised

The user may already have handling for the exception, but production alerting systems will still flag the ERROR-level log entry as an unhandled error (see the sketch below).
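
The pattern, shown as a small before/after sketch; `do_work` is a hypothetical stand-in for any internal LiteLLM call.

```python
from litellm._logging import verbose_logger


def do_work() -> None:  # stand-in for any internal LiteLLM call
    raise ValueError("boom")


def before() -> None:
    try:
        do_work()
    except Exception as e:
        # Logged at exception level and then re-raised: callers that already
        # handle the error still see an "unhandled" entry in prod alerting.
        verbose_logger.exception(f"Exception occurred - {str(e)}")
        raise e


def after() -> None:
    try:
        do_work()
    except Exception as e:
        raise e  # don't use verbose_logger.exception, if exception is raised
```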

* fix(datadog.py): support setting datadog source as an env var

Fixes https://github.com/BerriAI/litellm/issues/5508

* docs(logging.md): add dd_source to datadog docs
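
A short sketch of the new behavior, assuming the `datadog-api-client` package is installed; only the `ddsource` lookup is the relevant part, the message and service values are placeholders.

```python
import os

from datadog_api_client.v2.models import HTTPLogItem

# DD_SOURCE now controls the `ddsource` tag on each log item, e.g. to
# separate "litellm_dev" from "litellm_prod" deployments; it falls back
# to "litellm" when unset.
item = HTTPLogItem(
    ddsource=os.getenv("DD_SOURCE", "litellm"),
    message='{"example": "payload"}',  # placeholder payload
    service="litellm-server",
)
print(item.ddsource)
```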

* fix(proxy_server.py): expose `/customer/list` endpoint for showing all customers
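
A hedged usage sketch for the new endpoint; the proxy URL and key are placeholders, and the endpoint requires a proxy-admin (or admin-viewer) key.

```python
import requests

# List all customers/end-users known to the proxy (admin-only endpoint).
resp = requests.get(
    "http://0.0.0.0:4000/customer/list",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder admin key
)
resp.raise_for_status()
for customer in resp.json():
    # Each item is a LiteLLM_EndUserTable row; user_id is assumed to be
    # the identifying field.
    print(customer.get("user_id"))
```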

* (bedrock): Fix usage with Cloudflare AI Gateway, and proxies in general. (#5509)

* feat(anthropic.py): support 'cache_control' param for content when it is a string
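
A hedged example of the new shape: `cache_control` can now sit directly on a message whose `content` is a plain string, instead of only on list-style content blocks. The beta header is an assumption carried over from the existing prompt-caching tests.

```python
import litellm

response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": "Here is the full text of a complex legal agreement",
            "cache_control": {"type": "ephemeral"},  # string content, cached
        },
        {
            "role": "user",
            "content": "What are the key terms and conditions in this agreement?",
            "cache_control": {"type": "ephemeral"},
        },
    ],
    extra_headers={
        "anthropic-beta": "prompt-caching-2024-07-31"  # assumed to still be required
    },
)
print(response.usage)
```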

* Revert "(bedrock): Fix usage with Cloudflare AI Gateway, and proxies in gener…" (#5519)

This reverts commit 3fac0349c2.

* refactor: ci/cd run again

---------

Co-authored-by: David Manouchehri <david.manouchehri@ai.moda>
Krish Dholakia 2024-09-04 22:16:55 -07:00 committed by GitHub
parent cdc312d51d
commit 1e7e538261
24 changed files with 383 additions and 247 deletions


@@ -1,12 +1,12 @@
repos:
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
# - id: mypy
# name: mypy
# entry: python3 -m mypy --ignore-missing-imports
# language: system
# types: [python]
# files: ^litellm/
- id: isort
name: isort
entry: isort


@@ -1426,6 +1426,7 @@ litellm_settings:
```shell
DD_API_KEY="5f2d0f310***********" # your datadog API Key
DD_SITE="us5.datadoghq.com" # your datadog base url
DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments
```
**Step 3**: Start the proxy, make a test request


@@ -2039,10 +2039,7 @@ class DualCache(BaseCache):
return result
except Exception as e:
verbose_logger.exception(
f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
)
raise e
raise e # don't log if exception is raised
async def async_set_cache_sadd(
self, key, value: List, local_only: bool = False, **kwargs
@@ -2069,10 +2066,7 @@ class DualCache(BaseCache):
return None
except Exception as e:
verbose_logger.exception(
"LiteLLM Cache: Excepton async set_cache_sadd: {}".format(str(e))
)
raise e
raise e # don't log, if exception is raised
def flush_cache(self):
if self.in_memory_cache is not None:
@@ -2543,7 +2537,6 @@ class Cache:
self.cache.set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
pass
async def async_add_cache(self, result, *args, **kwargs):
"""


@@ -235,10 +235,7 @@ class BraintrustLogger(CustomLogger):
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.exception(
"Error logging to braintrust - Exception received - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
@@ -360,10 +357,7 @@ class BraintrustLogger(CustomLogger):
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.exception(
"Error logging to braintrust - Exception received - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
return super().log_failure_event(kwargs, response_obj, start_time, end_time)


@@ -1,11 +1,17 @@
#### What this does ####
# On success + failure, log events to Datadog
import dotenv, os
import requests # type: ignore
import datetime
import os
import subprocess
import sys
import traceback
import datetime, subprocess, sys
import litellm, uuid
import uuid
import dotenv
import requests # type: ignore
import litellm
from litellm._logging import print_verbose, verbose_logger
@@ -57,9 +63,9 @@ class DataDogLogger:
):
try:
# Define DataDog client
from datadog_api_client.v2.api.logs_api import LogsApi
from datadog_api_client.v2 import ApiClient
from datadog_api_client.v2.models import HTTPLogItem, HTTPLog
from datadog_api_client.v2.api.logs_api import LogsApi
from datadog_api_client.v2.models import HTTPLog, HTTPLogItem
verbose_logger.debug(
f"datadog Logging - Enters logging function for model {kwargs}"
@@ -131,7 +137,7 @@ class DataDogLogger:
body = HTTPLog(
[
HTTPLogItem(
ddsource="litellm",
ddsource=os.getenv("DD_SOURCE", "litellm"),
message=payload,
service="litellm-server",
),


@@ -228,6 +228,54 @@ class AnthropicConfig:
return False
def translate_system_message(
self, messages: List[AllMessageValues]
) -> List[AnthropicSystemMessageContent]:
system_prompt_indices = []
anthropic_system_message_list: List[AnthropicSystemMessageContent] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
system_message_block = ChatCompletionSystemMessage(**message)
if isinstance(system_message_block["content"], str):
anthropic_system_message_content = AnthropicSystemMessageContent(
type="text",
text=system_message_block["content"],
)
if "cache_control" in system_message_block:
anthropic_system_message_content["cache_control"] = (
system_message_block["cache_control"]
)
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
elif isinstance(message["content"], list):
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
return anthropic_system_message_list
### FOR [BETA] `/v1/messages` endpoint support
def translatable_anthropic_params(self) -> List:
@@ -314,7 +362,7 @@ class AnthropicConfig:
new_messages.append(user_message)
if len(new_user_content_list) > 0:
new_messages.append({"role": "user", "content": new_user_content_list})
new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore
if len(tool_message_list) > 0:
new_messages.extend(tool_message_list)
@@ -940,45 +988,11 @@ class AnthropicChatCompletion(BaseLLM):
)
else:
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
anthropic_system_message_list = None
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
if isinstance(message["content"], str):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
if anthropic_system_message_list is None:
anthropic_system_message_list = []
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
anthropic_system_message_list = AnthropicConfig().translate_system_message(
messages=messages
)
# Handling anthropic API Prompt Caching
if anthropic_system_message_list is not None:
if len(anthropic_system_message_list) > 0:
optional_params["system"] = anthropic_system_message_list
# Format rest of message according to anthropic guidelines
try:
@@ -986,15 +1000,10 @@ class AnthropicChatCompletion(BaseLLM):
model=model, messages=messages, custom_llm_provider="anthropic"
)
except Exception as e:
verbose_logger.exception(
"litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
str(e), messages
)
)
raise AnthropicError(
status_code=400,
message="{}\nReceived Messages={}".format(str(e), messages),
)
) # don't use verbose_logger.exception, if exception is raised
## Load Config
config = litellm.AnthropicConfig.get_config()


@@ -119,8 +119,6 @@ class BaseAWSLLM(BaseLLM):
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@@ -147,6 +145,7 @@ class BaseAWSLLM(BaseLLM):
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
Policy='{"Version":"2012-10-17","Statement":[{"Sid":"BedrockLiteLLM","Effect":"Allow","Action":["bedrock:InvokeModel","bedrock:InvokeModelWithResponseStream"],"Resource":"*","Condition":{"Bool":{"aws:SecureTransport":"true"},"StringLike":{"aws:UserAgent":"litellm/*"}}}]}',
)
iam_creds_dict = {
@@ -164,6 +163,11 @@ class BaseAWSLLM(BaseLLM):
ttl=3600 - 60,
)
if sts_response["PackedPolicySize"] > 75:
verbose_logger.warning(
f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
)
session = boto3.Session(**iam_creds_dict)
iam_creds = session.get_credentials()


@@ -423,13 +423,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
async for transformed_chunk in streamwrapper:
yield transformed_chunk
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama.py::ollama_async_streaming(): Exception occured - {}".format(
str(e)
)
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def ollama_acompletion(
@@ -498,12 +492,7 @@ async def ollama_acompletion(
)
return model_response
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama.py::ollama_acompletion(): Exception occured - {}".format(
str(e)
)
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def ollama_aembeddings(


@@ -583,8 +583,4 @@ async def ollama_acompletion(
)
return model_response
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama_acompletion(): Exception occured - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised


@@ -168,9 +168,6 @@ def completion(
choices_list.append(choice_obj)
model_response.choices = choices_list # type: ignore
except Exception as e:
verbose_logger.exception(
"litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
)
raise PalmError(
message=traceback.format_exc(), status_code=response.status_code
)


@@ -564,12 +564,9 @@ class PredibaseChatCompletion(BaseLLM):
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
verbose_logger.exception(
"litellm.llms.predibase.py::async_completion() - Exception occurred - {}".format(
str(e)
)
)
raise PredibaseError(status_code=500, message="{}".format(str(e)))
raise PredibaseError(
status_code=500, message="{}".format(str(e))
) # don't use verbose_logger.exception, if exception is raised
return self.process_response(
model=model,
response=response,


@@ -27,10 +27,13 @@ from litellm.types.completion import (
from litellm.types.llms.anthropic import *
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionFunctionMessage,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolMessage,
ChatCompletionUserMessage,
)
from litellm.types.utils import GenericImageParsingChunk
@@ -493,10 +496,9 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] =
return rendered_text
except Exception as e:
verbose_logger.exception(
"Error rendering huggingface chat template - {}".format(str(e))
)
raise Exception(f"Error rendering template - {str(e)}")
raise Exception(
f"Error rendering template - {str(e)}"
) # don't use verbose_logger.exception, if exception is raised
# Anthropic template
@@ -1171,7 +1173,9 @@ def convert_to_gemini_tool_call_result(
return _part
def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResultParam:
def convert_to_anthropic_tool_result(
message: Union[dict, ChatCompletionToolMessage, ChatCompletionFunctionMessage]
) -> AnthropicMessagesToolResultParam:
"""
OpenAI message with a tool result looks like:
{
@@ -1215,7 +1219,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
return anthropic_tool_result
if message["role"] == "function":
content = message.get("content") # type: ignore
tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())
tool_call_id = message.get("tool_call_id") or str(uuid.uuid4()) # type: ignore
anthropic_tool_result = AnthropicMessagesToolResultParam(
type="tool_result", tool_use_id=tool_call_id, content=content
)
@@ -1230,7 +1234,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
def convert_function_to_anthropic_tool_invoke(
function_call,
function_call: Union[dict, ChatCompletionToolCallFunctionChunk],
) -> List[AnthropicMessagesToolUseParam]:
try:
anthropic_tool_invoke = [
@@ -1247,7 +1251,7 @@ def convert_function_to_anthropic_tool_invoke(
def convert_to_anthropic_tool_invoke(
tool_calls: list,
tool_calls: List[ChatCompletionAssistantToolCall],
) -> List[AnthropicMessagesToolUseParam]:
"""
OpenAI tool invokes:
@@ -1307,17 +1311,19 @@ def add_cache_control_to_content(
anthropic_content_element: Union[
dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
],
orignal_content_element: dict,
orignal_content_element: Union[dict, AllMessageValues],
):
if "cache_control" in orignal_content_element:
anthropic_content_element["cache_control"] = orignal_content_element[
"cache_control"
]
cache_control_param = orignal_content_element.get("cache_control")
if cache_control_param is not None and isinstance(cache_control_param, dict):
transformed_param = ChatCompletionCachedContent(**cache_control_param) # type: ignore
anthropic_content_element["cache_control"] = transformed_param
return anthropic_content_element
def anthropic_messages_pt(
messages: list,
messages: List[AllMessageValues],
model: str,
llm_provider: str,
) -> List[
@@ -1348,10 +1354,21 @@ def anthropic_messages_pt(
while msg_i < len(messages):
user_content: List[AnthropicMessagesUserMessageValues] = []
init_msg_i = msg_i
if isinstance(messages[msg_i], BaseModel):
messages[msg_i] = dict(messages[msg_i]) # type: ignore
## MERGE CONSECUTIVE USER CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
user_message_types_block: Union[
ChatCompletionToolMessage,
ChatCompletionUserMessage,
ChatCompletionFunctionMessage,
] = messages[
msg_i
] # type: ignore
if user_message_types_block["content"] and isinstance(
user_message_types_block["content"], list
):
for m in user_message_types_block["content"]:
if m.get("type", "") == "image_url":
image_chunk = convert_to_anthropic_image_obj(
m["image_url"]["url"]
@@ -1382,15 +1399,24 @@ def anthropic_messages_pt(
)
user_content.append(anthropic_content_element)
elif (
messages[msg_i]["role"] == "tool"
or messages[msg_i]["role"] == "function"
user_message_types_block["role"] == "tool"
or user_message_types_block["role"] == "function"
):
# OpenAI's tool message content will always be a string
user_content.append(convert_to_anthropic_tool_result(messages[msg_i]))
else:
user_content.append(
{"type": "text", "text": messages[msg_i]["content"]}
convert_to_anthropic_tool_result(user_message_types_block)
)
elif isinstance(user_message_types_block["content"], str):
_anthropic_content_text_element: AnthropicMessagesTextParam = {
"type": "text",
"text": user_message_types_block["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_content_text_element,
orignal_content_element=user_message_types_block,
)
user_content.append(anthropic_content_element)
msg_i += 1
@@ -1400,10 +1426,11 @@ def anthropic_messages_pt(
assistant_content: List[AnthropicMessagesAssistantMessageValues] = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
if "content" in messages[msg_i] and isinstance(
messages[msg_i]["content"], list
assistant_content_block: ChatCompletionAssistantMessage = messages[msg_i] # type: ignore
if "content" in assistant_content_block and isinstance(
assistant_content_block["content"], list
):
for m in messages[msg_i]["content"]:
for m in assistant_content_block["content"]:
# handle text
if (
m.get("type", "") == "text" and len(m.get("text", "")) > 0
@@ -1417,35 +1444,37 @@ def anthropic_messages_pt(
)
assistant_content.append(anthropic_message)
elif (
"content" in messages[msg_i]
and isinstance(messages[msg_i]["content"], str)
and len(messages[msg_i]["content"])
> 0 # don't pass empty text blocks. anthropic api raises errors.
"content" in assistant_content_block
and isinstance(assistant_content_block["content"], str)
and assistant_content_block[
"content"
] # don't pass empty text blocks. anthropic api raises errors.
):
_anthropic_text_content_element = {
"type": "text",
"text": messages[msg_i]["content"],
"text": assistant_content_block["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=messages[msg_i],
orignal_content_element=assistant_content_block,
)
assistant_content.append(anthropic_content_element)
if messages[msg_i].get(
"tool_calls", []
assistant_tool_calls = assistant_content_block.get("tool_calls")
if (
assistant_tool_calls is not None
): # support assistant tool invoke conversion
assistant_content.extend(
convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"])
convert_to_anthropic_tool_invoke(assistant_tool_calls)
)
if messages[msg_i].get("function_call"):
assistant_function_call = assistant_content_block.get("function_call")
if assistant_function_call is not None:
assistant_content.extend(
convert_function_to_anthropic_tool_invoke(
messages[msg_i]["function_call"]
)
convert_function_to_anthropic_tool_invoke(assistant_function_call)
)
msg_i += 1


@@ -491,14 +491,9 @@ class CodestralTextCompletion(BaseLLM):
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
verbose_logger.exception(
"litellm.llms.text_completion_codestral.py::async_completion() - Exception occurred - {}".format(
str(e)
)
)
raise TextCompletionCodestralError(
status_code=500, message="{}".format(str(e))
)
) # don't use verbose_logger.exception, if exception is raised
return self.process_text_completion_response(
model=model,
response=response,


@@ -445,9 +445,6 @@ async def acompletion(
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
return response
except Exception as e:
verbose_logger.exception(
"litellm.main.py::acompletion() - Exception occurred - {}".format(str(e))
)
custom_llm_provider = custom_llm_provider or "openai"
raise exception_type(
model=model,
@@ -616,9 +613,6 @@ def mock_completion(
except Exception as e:
if isinstance(e, openai.APIError):
raise e
verbose_logger.exception(
"litellm.mock_completion(): Exception occured - {}".format(str(e))
)
raise Exception("Mock completion response failed")
@@ -5125,9 +5119,6 @@ async def ahealth_check(
response = {} # args like remaining ratelimit etc.
return response
except Exception as e:
verbose_logger.exception(
"litellm.ahealth_check(): Exception occured - {}".format(str(e))
)
stack_trace = traceback.format_exc()
if isinstance(stack_trace, str):
stack_trace = stack_trace[:1000]


@@ -1,6 +1,16 @@
model_list:
- model_name: "whisper"
litellm_params:
model: "azure/azure-whisper"
api_key: os.environ/AZURE_EUROPE_API_KEY
api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
- model_name: gpt-4o-mini-2024-07-18
litellm_params:
api_key: API_KEY
model: openai/gpt-4o-mini-2024-07-18
rpm: 0
tpm: 100
router_settings:
num_retries: 0
routing_strategy: usage-based-routing-v2
timeout: 10
litellm_settings:
callbacks: custom_callbacks.proxy_handler_instance


@@ -386,7 +386,6 @@ async def user_api_key_auth(
parent_otel_span=parent_otel_span,
)
#### ELSE ####
## CHECK PASS-THROUGH ENDPOINTS ##
if pass_through_endpoints is not None:
for endpoint in pass_through_endpoints:


@@ -1,66 +1,10 @@
from litellm.integrations.custom_logger import CustomLogger
import litellm
# This file includes the custom callbacks for LiteLLM Proxy
# Once defined, these can be passed in proxy_config.yaml
class MyCustomHandler(CustomLogger):
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call") # noqa
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call") # noqa
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream") # noqa
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print("On Success") # noqa
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure") # noqa
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"ishaan async_log_success_event") # noqa
# log: key, user, model, prompt, response, tokens, cost
# Access kwargs passed to litellm.completion()
model = kwargs.get("model", None)
messages = kwargs.get("messages", None)
user = kwargs.get("user", None)
# Access litellm_params passed to litellm.completion(), example access `metadata`
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get(
"metadata", {}
) # headers passed to LiteLLM proxy, can be found here
return
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
print(f"On Async Failure !") # noqa
print("\nkwargs", kwargs) # noqa
# Access kwargs passed to litellm.completion()
model = kwargs.get("model", None)
messages = kwargs.get("messages", None)
user = kwargs.get("user", None)
# Access litellm_params passed to litellm.completion(), example access `metadata`
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get(
"metadata", {}
) # headers passed to LiteLLM proxy, can be found here
# Acess Exceptions & Traceback
exception_event = kwargs.get("exception", None)
traceback_event = kwargs.get("traceback_exception", None)
# Calculate cost using litellm.completion_cost()
except Exception as e:
print(f"Exception: {e}") # noqa
# print("Call failed")
pass
proxy_handler_instance = MyCustomHandler()
# Set litellm.callbacks = [proxy_handler_instance] on the proxy
# need to set litellm.callbacks = [proxy_handler_instance] # on the proxy


@@ -6183,6 +6183,64 @@ async def delete_end_user(
pass
@router.get(
"/customer/list",
tags=["Customer Management"],
dependencies=[Depends(user_api_key_auth)],
response_model=List[LiteLLM_EndUserTable],
)
@router.get(
"/end_user/list",
tags=["Customer Management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def list_team(
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
[Admin-only] List all available customers
```
curl --location --request GET 'http://0.0.0.0:4000/customer/list' \
--header 'Authorization: Bearer sk-1234'
```
"""
from litellm.proxy.proxy_server import (
_duration_in_seconds,
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
):
raise HTTPException(
status_code=401,
detail={
"error": "Admin-only endpoint. Your user role={}".format(
user_api_key_dict.user_role
)
},
)
if prisma_client is None:
raise HTTPException(
status_code=400,
detail={"error": CommonProxyErrors.db_not_connected_error.value},
)
response = await prisma_client.db.litellm_endusertable.find_many()
returned_response: List[LiteLLM_EndUserTable] = []
for item in response:
returned_response.append(LiteLLM_EndUserTable(**item.model_dump()))
return returned_response
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
if premium_user is not True:
return


@@ -47,6 +47,7 @@ from litellm._logging import verbose_router_logger
from litellm.assistants.main import AssistantDeleted
from litellm.caching import DualCache, InMemoryCache, RedisCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.llms.azure import get_azure_ad_token_from_oidc
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
@@ -783,6 +784,10 @@ class Router:
}
)
logging_obj: Optional[LiteLLMLogging] = kwargs.get(
"litellm_logging_obj", None
)
rpm_semaphore = self._get_client(
deployment=deployment,
kwargs=kwargs,
@@ -797,11 +802,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, logging_obj=logging_obj
)
response = await _response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
)
response = await _response
## CHECK CONTENT FILTER ERROR ##
@@ -3860,7 +3867,9 @@ class Router:
if isinstance(_callback, CustomLogger):
response = _callback.pre_call_check(deployment)
async def async_routing_strategy_pre_call_checks(self, deployment: dict):
async def async_routing_strategy_pre_call_checks(
self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
):
"""
For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
@@ -3875,8 +3884,22 @@ class Router:
for _callback in litellm.callbacks:
if isinstance(_callback, CustomLogger):
try:
response = await _callback.async_pre_call_check(deployment)
_ = await _callback.async_pre_call_check(deployment)
except litellm.RateLimitError as e:
## LOG FAILURE EVENT
if logging_obj is not None:
asyncio.create_task(
logging_obj.async_failure_handler(
exception=e,
traceback_exception=traceback.format_exc(),
end_time=time.time(),
)
)
## LOGGING
threading.Thread(
target=logging_obj.failure_handler,
args=(e, traceback.format_exc()),
).start() # log response
self._set_cooldown_deployments(
exception_status=e.status_code,
original_exception=e,
@@ -3885,6 +3908,20 @@ class Router:
)
raise e
except Exception as e:
## LOG FAILURE EVENT
if logging_obj is not None:
asyncio.create_task(
logging_obj.async_failure_handler(
exception=e,
traceback_exception=traceback.format_exc(),
end_time=time.time(),
)
)
## LOGGING
threading.Thread(
target=logging_obj.failure_handler,
args=(e, traceback.format_exc()),
).start() # log response
raise e
def _generate_model_id(self, model_group: str, litellm_params: dict):


@@ -222,6 +222,94 @@ async def test_anthropic_api_prompt_caching_basic():
)
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_with_content_str():
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
system_message = [
{
"role": "system",
"content": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
]
translated_system_message = litellm.AnthropicConfig().translate_system_message(
messages=system_message
)
assert translated_system_message == [
# System Message
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
}
]
user_messages = [
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
]
translated_messages = anthropic_messages_pt(
messages=user_messages,
model="claude-3-5-sonnet-20240620",
llm_provider="anthropic",
)
expected_messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
}
],
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
]
assert len(translated_messages) == len(expected_messages)
for idx, i in enumerate(translated_messages):
assert (
i == expected_messages[idx]
), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_no_headers():
litellm.set_verbose = True


@@ -616,8 +616,8 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth():
aws_region_name=aws_region_name,
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
aws_sts_endpoint="https://sts-fips.us-west-2.amazonaws.com",
aws_session_name="cross-region-test",
aws_sts_endpoint="https://sts-fips.us-east-2.amazonaws.com",
aws_bedrock_runtime_endpoint="https://bedrock-runtime-fips.us-west-2.amazonaws.com",
)
# Add any assertions here to check the response


@@ -3,6 +3,8 @@ from typing import Any, Dict, Iterable, List, Optional, Union
from pydantic import BaseModel, validator
from typing_extensions import Literal, Required, TypedDict
from .openai import ChatCompletionCachedContent
class AnthropicMessagesToolChoice(TypedDict, total=False):
type: Required[Literal["auto", "any", "tool"]]
@@ -18,7 +20,7 @@ class AnthropicMessagesTool(TypedDict, total=False):
class AnthropicMessagesTextParam(TypedDict, total=False):
type: Literal["text"]
text: str
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesToolUseParam(TypedDict):
@@ -58,7 +60,7 @@ class AnthropicImageParamSource(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
type: Literal["image"]
source: AnthropicImageParamSource
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesToolResultContent(TypedDict):
@@ -97,7 +99,7 @@ class AnthropicMetadata(TypedDict, total=False):
class AnthropicSystemMessageContent(TypedDict, total=False):
type: str
text: str
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesRequest(TypedDict, total=False):


@@ -354,14 +354,18 @@ class ChatCompletionImageObject(TypedDict):
image_url: ChatCompletionImageUrlObject
class ChatCompletionUserMessage(TypedDict):
class OpenAIChatCompletionUserMessage(TypedDict):
role: Literal["user"]
content: Union[
str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
]
class ChatCompletionAssistantMessage(TypedDict, total=False):
class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):
cache_control: ChatCompletionCachedContent
class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
name: Optional[str]
@@ -369,6 +373,10 @@ class ChatCompletionAssistantMessage(TypedDict, total=False):
function_call: Optional[ChatCompletionToolCallFunctionChunk]
class ChatCompletionAssistantMessage(OpenAIChatCompletionAssistantMessage, total=False):
cache_control: ChatCompletionCachedContent
class ChatCompletionToolMessage(TypedDict):
role: Literal["tool"]
content: str
@@ -381,12 +389,16 @@ class ChatCompletionFunctionMessage(TypedDict):
name: str
class ChatCompletionSystemMessage(TypedDict, total=False):
class OpenAIChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[Union[str, List]]
name: str
class ChatCompletionSystemMessage(OpenAIChatCompletionSystemMessage, total=False):
cache_control: ChatCompletionCachedContent
AllMessageValues = Union[
ChatCompletionUserMessage,
ChatCompletionAssistantMessage,


@@ -8547,11 +8547,6 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_predibase_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_huggingface_chunk(self, chunk):
@@ -8595,11 +8590,6 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_huggingface_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_ai21_chunk(self, chunk): # fake streaming
@@ -8826,11 +8816,6 @@ class CustomStreamWrapper:
"usage": usage,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_openai_chat_completion_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_azure_text_completion_chunk(self, chunk):