LiteLLM Minor fixes + improvements (08/04/2024) (#5505)

* Minor IAM AWS OIDC Improvements (#5246)

* AWS IAM: Temporary tokens are valid across all regions after being issued, so it is wasteful to request one for each region.

* AWS IAM: Include an inline policy, to help reduce misuse of overly permissive IAM roles.
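A minimal sketch of the idea, assuming boto3 and illustrative role/token values (the policy shown here is simplified, not the exact policy added in this commit): the web-identity call is made once, scoped by an inline session policy, and the resulting temporary credentials are reused for every region.

```python
# Sketch: scope an OIDC-assumed role with an inline session policy and reuse
# the temporary credentials across regions (illustrative values only).
import json

import boto3

INLINE_POLICY = json.dumps(
    {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "BedrockLiteLLM",
                "Effect": "Allow",
                "Action": [
                    "bedrock:InvokeModel",
                    "bedrock:InvokeModelWithResponseStream",
                ],
                "Resource": "*",
            }
        ],
    }
)

sts = boto3.client("sts")
creds = sts.assume_role_with_web_identity(
    RoleArn="arn:aws:iam::111122223333:role/litellm-bedrock",  # hypothetical role
    RoleSessionName="litellm-session",
    WebIdentityToken=open("/var/run/secrets/oidc-token").read(),  # hypothetical path
    DurationSeconds=3600,
    Policy=INLINE_POLICY,  # can only narrow what the role itself already allows
)["Credentials"]

# The same temporary credentials are valid in any region, so one STS call is enough.
for region in ("us-east-1", "us-west-2"):
    bedrock = boto3.client(
        "bedrock-runtime",
        region_name=region,
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )
```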

* (test_bedrock_completion.py): Ensure we are testing the cross-region AWS OIDC flow.

* fix(router.py): log rejected requests

Fixes https://github.com/BerriAI/litellm/issues/5498
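As a rough illustration of what this enables (the handler name below is an assumption, not part of the change): a failure callback registered via `litellm.callbacks` should now also fire for requests the router rejects in its pre-call checks, e.g. on `RateLimitError`.

```python
# Sketch: a custom failure callback that should now also see requests the
# router rejects during pre-call checks, per this fix.
import litellm
from litellm.integrations.custom_logger import CustomLogger


class RejectedRequestLogger(CustomLogger):  # hypothetical handler name
    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        # kwargs carries the exception and traceback for failed/rejected calls
        exception = kwargs.get("exception")
        print(f"router rejected/failed request: {exception}")


litellm.callbacks = [RejectedRequestLogger()]
```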

* refactor: don't use verbose_logger.exception, if exception is raised

The user might already have handling for this exception, but alerting systems in prod will still surface it as an unhandled error.

* fix(datadog.py): support setting datadog source as an env var

Fixes https://github.com/BerriAI/litellm/issues/5508

* docs(logging.md): add dd_source to datadog docs
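For reference, a small sketch of how the source can now be controlled per deployment (values are examples): set `DD_SOURCE` before starting the proxy, and the Datadog logger falls back to `litellm` when it is unset.

```python
# Sketch: resolve the Datadog source from the environment, defaulting to
# "litellm" when DD_SOURCE is not set (example values only).
import os

os.environ.setdefault("DD_SOURCE", "litellm_dev")  # e.g. differentiate dev vs. prod

ddsource = os.getenv("DD_SOURCE", "litellm")
print(ddsource)  # -> "litellm_dev"
```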

* fix(proxy_server.py): expose `/customer/list` endpoint for showing all customers
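A usage sketch against a locally running proxy (base URL and key are the placeholders from the endpoint docstring); the route is admin-only and returns the customer (end-user) table.

```python
# Sketch: query the new admin-only /customer/list endpoint on a local proxy
# (base URL and API key are placeholders).
import requests

resp = requests.get(
    "http://0.0.0.0:4000/customer/list",
    headers={"Authorization": "Bearer sk-1234"},
    timeout=10,
)
resp.raise_for_status()
for customer in resp.json():
    print(customer)
```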

* (bedrock): Fix usage with Cloudflare AI Gateway, and proxies in general. (#5509)

* feat(anthropic.py): support 'cache_control' param for content when it is a string
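A sketch of the message shape this enables, mirroring the test added below: a system message whose `content` is a plain string can now carry `cache_control` directly, and it is translated into an Anthropic text block with the same `cache_control`.

```python
# Sketch: cache_control on a message whose content is a plain string
# (mirrors the translate_system_message test added in this commit).
import litellm

system_message = [
    {
        "role": "system",
        "content": "Here is the full text of a complex legal agreement",
        "cache_control": {"type": "ephemeral"},
    },
]

translated = litellm.AnthropicConfig().translate_system_message(messages=system_message)
# -> [{"type": "text",
#      "text": "Here is the full text of a complex legal agreement",
#      "cache_control": {"type": "ephemeral"}}]
print(translated)
```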

* Revert "(bedrock): Fix usage with Cloudflare AI Gateway, and proxies in gener…" (#5519)

This reverts commit 3fac0349c2.

* refactor: ci/cd run again

---------

Co-authored-by: David Manouchehri <david.manouchehri@ai.moda>
Krish Dholakia 2024-09-04 22:16:55 -07:00 committed by GitHub
parent cdc312d51d
commit 1e7e538261
24 changed files with 383 additions and 247 deletions

View file

@@ -1,12 +1,12 @@
 repos:
   - repo: local
     hooks:
-      - id: mypy
-        name: mypy
-        entry: python3 -m mypy --ignore-missing-imports
-        language: system
-        types: [python]
-        files: ^litellm/
+      # - id: mypy
+      #   name: mypy
+      #   entry: python3 -m mypy --ignore-missing-imports
+      #   language: system
+      #   types: [python]
+      #   files: ^litellm/
       - id: isort
         name: isort
         entry: isort

View file

@@ -1426,6 +1426,7 @@ litellm_settings:
 ```shell
 DD_API_KEY="5f2d0f310***********" # your datadog API Key
 DD_SITE="us5.datadoghq.com" # your datadog base url
+DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments
 ```
 **Step 3**: Start the proxy, make a test request

View file

@@ -2039,10 +2039,7 @@ class DualCache(BaseCache):
             return result
         except Exception as e:
-            verbose_logger.exception(
-                f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
-            )
-            raise e
+            raise e  # don't log if exception is raised

     async def async_set_cache_sadd(
         self, key, value: List, local_only: bool = False, **kwargs
@@ -2069,10 +2066,7 @@ class DualCache(BaseCache):
             return None
         except Exception as e:
-            verbose_logger.exception(
-                "LiteLLM Cache: Excepton async set_cache_sadd: {}".format(str(e))
-            )
-            raise e
+            raise e  # don't log, if exception is raised

     def flush_cache(self):
         if self.in_memory_cache is not None:
@@ -2543,7 +2537,6 @@ class Cache:
             self.cache.set_cache(cache_key, cached_data, **kwargs)
         except Exception as e:
             verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
-            pass

     async def async_add_cache(self, result, *args, **kwargs):
         """

View file

@@ -235,10 +235,7 @@ class BraintrustLogger(CustomLogger):
         except httpx.HTTPStatusError as e:
             raise Exception(e.response.text)
         except Exception as e:
-            verbose_logger.exception(
-                "Error logging to braintrust - Exception received - {}".format(str(e))
-            )
-            raise e
+            raise e  # don't use verbose_logger.exception, if exception is raised

     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
         verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
@@ -360,10 +357,7 @@
         except httpx.HTTPStatusError as e:
             raise Exception(e.response.text)
         except Exception as e:
-            verbose_logger.exception(
-                "Error logging to braintrust - Exception received - {}".format(str(e))
-            )
-            raise e
+            raise e  # don't use verbose_logger.exception, if exception is raised

     def log_failure_event(self, kwargs, response_obj, start_time, end_time):
         return super().log_failure_event(kwargs, response_obj, start_time, end_time)

View file

@@ -1,11 +1,17 @@
 #### What this does ####
 # On success + failure, log events to Datadog
-import dotenv, os
-import requests  # type: ignore
+import datetime
+import os
+import subprocess
+import sys
 import traceback
-import datetime, subprocess, sys
-import litellm, uuid
+import uuid
+
+import dotenv
+import requests  # type: ignore
+
+import litellm
 from litellm._logging import print_verbose, verbose_logger
@@ -57,9 +63,9 @@ class DataDogLogger:
     ):
         try:
             # Define DataDog client
-            from datadog_api_client.v2.api.logs_api import LogsApi
             from datadog_api_client.v2 import ApiClient
-            from datadog_api_client.v2.models import HTTPLogItem, HTTPLog
+            from datadog_api_client.v2.api.logs_api import LogsApi
+            from datadog_api_client.v2.models import HTTPLog, HTTPLogItem

             verbose_logger.debug(
                 f"datadog Logging - Enters logging function for model {kwargs}"
@@ -131,7 +137,7 @@ class DataDogLogger:
             body = HTTPLog(
                 [
                     HTTPLogItem(
-                        ddsource="litellm",
+                        ddsource=os.getenv("DD_SOURCE", "litellm"),
                         message=payload,
                         service="litellm-server",
                     ),

View file

@@ -228,6 +228,54 @@ class AnthropicConfig:

         return False

+    def translate_system_message(
+        self, messages: List[AllMessageValues]
+    ) -> List[AnthropicSystemMessageContent]:
+        system_prompt_indices = []
+        anthropic_system_message_list: List[AnthropicSystemMessageContent] = []
+        for idx, message in enumerate(messages):
+            if message["role"] == "system":
+                valid_content: bool = False
+                system_message_block = ChatCompletionSystemMessage(**message)
+                if isinstance(system_message_block["content"], str):
+                    anthropic_system_message_content = AnthropicSystemMessageContent(
+                        type="text",
+                        text=system_message_block["content"],
+                    )
+                    if "cache_control" in system_message_block:
+                        anthropic_system_message_content["cache_control"] = (
+                            system_message_block["cache_control"]
+                        )
+                    anthropic_system_message_list.append(
+                        anthropic_system_message_content
+                    )
+                    valid_content = True
+                elif isinstance(message["content"], list):
+                    for _content in message["content"]:
+                        anthropic_system_message_content = (
+                            AnthropicSystemMessageContent(
+                                type=_content.get("type"),
+                                text=_content.get("text"),
+                            )
+                        )
+                        if "cache_control" in _content:
+                            anthropic_system_message_content["cache_control"] = (
+                                _content["cache_control"]
+                            )
+                        anthropic_system_message_list.append(
+                            anthropic_system_message_content
+                        )
+                    valid_content = True
+
+                if valid_content:
+                    system_prompt_indices.append(idx)
+
+        if len(system_prompt_indices) > 0:
+            for idx in reversed(system_prompt_indices):
+                messages.pop(idx)
+
+        return anthropic_system_message_list
+
     ### FOR [BETA] `/v1/messages` endpoint support
     def translatable_anthropic_params(self) -> List:
@@ -314,7 +362,7 @@ class AnthropicConfig:
             new_messages.append(user_message)

         if len(new_user_content_list) > 0:
-            new_messages.append({"role": "user", "content": new_user_content_list})
+            new_messages.append({"role": "user", "content": new_user_content_list})  # type: ignore

         if len(tool_message_list) > 0:
             new_messages.extend(tool_message_list)
@@ -940,45 +988,11 @@ class AnthropicChatCompletion(BaseLLM):
                 )
             else:
                 # Separate system prompt from rest of message
-                system_prompt_indices = []
-                system_prompt = ""
-                anthropic_system_message_list = None
-                for idx, message in enumerate(messages):
-                    if message["role"] == "system":
-                        valid_content: bool = False
-                        if isinstance(message["content"], str):
-                            system_prompt += message["content"]
-                            valid_content = True
-                        elif isinstance(message["content"], list):
-                            for _content in message["content"]:
-                                anthropic_system_message_content = (
-                                    AnthropicSystemMessageContent(
-                                        type=_content.get("type"),
-                                        text=_content.get("text"),
-                                    )
-                                )
-                                if "cache_control" in _content:
-                                    anthropic_system_message_content["cache_control"] = (
-                                        _content["cache_control"]
-                                    )
-                                if anthropic_system_message_list is None:
-                                    anthropic_system_message_list = []
-                                anthropic_system_message_list.append(
-                                    anthropic_system_message_content
-                                )
-                                valid_content = True
-
-                        if valid_content:
-                            system_prompt_indices.append(idx)
-
-                if len(system_prompt_indices) > 0:
-                    for idx in reversed(system_prompt_indices):
-                        messages.pop(idx)
-
-                if len(system_prompt) > 0:
-                    optional_params["system"] = system_prompt
-
+                anthropic_system_message_list = AnthropicConfig().translate_system_message(
+                    messages=messages
+                )
                 # Handling anthropic API Prompt Caching
-                if anthropic_system_message_list is not None:
+                if len(anthropic_system_message_list) > 0:
                     optional_params["system"] = anthropic_system_message_list

                 # Format rest of message according to anthropic guidelines
                 try:
@@ -986,15 +1000,10 @@
                         model=model, messages=messages, custom_llm_provider="anthropic"
                     )
                 except Exception as e:
-                    verbose_logger.exception(
-                        "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
-                            str(e), messages
-                        )
-                    )
                     raise AnthropicError(
                         status_code=400,
                         message="{}\nReceived Messages={}".format(str(e), messages),
-                    )
+                    )  # don't use verbose_logger.exception, if exception is raised

         ## Load Config
         config = litellm.AnthropicConfig.get_config()

View file

@@ -119,8 +119,6 @@ class BaseAWSLLM(BaseLLM):
                 "aws_web_identity_token": aws_web_identity_token,
                 "aws_role_name": aws_role_name,
                 "aws_session_name": aws_session_name,
-                "aws_region_name": aws_region_name,
-                "aws_sts_endpoint": sts_endpoint,
             }
         )

@@ -147,6 +145,7 @@ class BaseAWSLLM(BaseLLM):
                 RoleSessionName=aws_session_name,
                 WebIdentityToken=oidc_token,
                 DurationSeconds=3600,
+                Policy='{"Version":"2012-10-17","Statement":[{"Sid":"BedrockLiteLLM","Effect":"Allow","Action":["bedrock:InvokeModel","bedrock:InvokeModelWithResponseStream"],"Resource":"*","Condition":{"Bool":{"aws:SecureTransport":"true"},"StringLike":{"aws:UserAgent":"litellm/*"}}}]}',
             )

             iam_creds_dict = {
@@ -164,6 +163,11 @@
                 ttl=3600 - 60,
             )

+            if sts_response["PackedPolicySize"] > 75:
+                verbose_logger.warning(
+                    f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
+                )
+
             session = boto3.Session(**iam_creds_dict)

             iam_creds = session.get_credentials()

View file

@@ -423,13 +423,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_obj):
         async for transformed_chunk in streamwrapper:
             yield transformed_chunk
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama.py::ollama_async_streaming(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised


 async def ollama_acompletion(
@@ -498,12 +492,7 @@ async def ollama_acompletion(
         )
         return model_response
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama.py::ollama_acompletion(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised


 async def ollama_aembeddings(

View file

@@ -583,8 +583,4 @@ async def ollama_acompletion(
         )
         return model_response
     except Exception as e:
-        verbose_logger.exception(
-            "LiteLLM.ollama_acompletion(): Exception occured - {}".format(str(e))
-        )
-        raise e
+        raise e  # don't use verbose_logger.exception, if exception is raised

View file

@@ -168,9 +168,6 @@ def completion(
             choices_list.append(choice_obj)
         model_response.choices = choices_list  # type: ignore
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
-        )
         raise PalmError(
             message=traceback.format_exc(), status_code=response.status_code
         )

View file

@@ -564,12 +564,9 @@ class PredibaseChatCompletion(BaseLLM):
             for exception in litellm.LITELLM_EXCEPTION_TYPES:
                 if isinstance(e, exception):
                     raise e
-            verbose_logger.exception(
-                "litellm.llms.predibase.py::async_completion() - Exception occurred - {}".format(
-                    str(e)
-                )
-            )
-            raise PredibaseError(status_code=500, message="{}".format(str(e)))
+            raise PredibaseError(
+                status_code=500, message="{}".format(str(e))
+            )  # don't use verbose_logger.exception, if exception is raised
         return self.process_response(
             model=model,
             response=response,

View file

@@ -27,10 +27,13 @@ from litellm.types.completion import (
 from litellm.types.llms.anthropic import *
 from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
 from litellm.types.llms.openai import (
+    AllMessageValues,
     ChatCompletionAssistantMessage,
+    ChatCompletionAssistantToolCall,
     ChatCompletionFunctionMessage,
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionToolMessage,
+    ChatCompletionUserMessage,
 )
 from litellm.types.utils import GenericImageParsingChunk
@@ -493,10 +496,9 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] = None):
         return rendered_text
     except Exception as e:
-        verbose_logger.exception(
-            "Error rendering huggingface chat template - {}".format(str(e))
-        )
-        raise Exception(f"Error rendering template - {str(e)}")
+        raise Exception(
+            f"Error rendering template - {str(e)}"
+        )  # don't use verbose_logger.exception, if exception is raised


 # Anthropic template
@@ -1171,7 +1173,9 @@ def convert_to_gemini_tool_call_result(
     return _part


-def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResultParam:
+def convert_to_anthropic_tool_result(
+    message: Union[dict, ChatCompletionToolMessage, ChatCompletionFunctionMessage]
+) -> AnthropicMessagesToolResultParam:
     """
     OpenAI message with a tool result looks like:
     {
@@ -1215,7 +1219,7 @@ def convert_to_anthropic_tool_result(
         return anthropic_tool_result
     if message["role"] == "function":
         content = message.get("content")  # type: ignore
-        tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())
+        tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())  # type: ignore
         anthropic_tool_result = AnthropicMessagesToolResultParam(
             type="tool_result", tool_use_id=tool_call_id, content=content
         )
@@ -1230,7 +1234,7 @@ def convert_to_anthropic_tool_result(
 def convert_function_to_anthropic_tool_invoke(
-    function_call,
+    function_call: Union[dict, ChatCompletionToolCallFunctionChunk],
 ) -> List[AnthropicMessagesToolUseParam]:
     try:
         anthropic_tool_invoke = [
@@ -1247,7 +1251,7 @@ def convert_function_to_anthropic_tool_invoke(
 def convert_to_anthropic_tool_invoke(
-    tool_calls: list,
+    tool_calls: List[ChatCompletionAssistantToolCall],
 ) -> List[AnthropicMessagesToolUseParam]:
     """
     OpenAI tool invokes:
@@ -1307,17 +1311,19 @@ def add_cache_control_to_content(
     anthropic_content_element: Union[
         dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
     ],
-    orignal_content_element: dict,
+    orignal_content_element: Union[dict, AllMessageValues],
 ):
-    if "cache_control" in orignal_content_element:
-        anthropic_content_element["cache_control"] = orignal_content_element[
-            "cache_control"
-        ]
+    cache_control_param = orignal_content_element.get("cache_control")
+
+    if cache_control_param is not None and isinstance(cache_control_param, dict):
+        transformed_param = ChatCompletionCachedContent(**cache_control_param)  # type: ignore
+
+        anthropic_content_element["cache_control"] = transformed_param

     return anthropic_content_element


 def anthropic_messages_pt(
-    messages: list,
+    messages: List[AllMessageValues],
     model: str,
     llm_provider: str,
 ) -> List[
@@ -1348,10 +1354,21 @@ def anthropic_messages_pt(
     while msg_i < len(messages):
         user_content: List[AnthropicMessagesUserMessageValues] = []
         init_msg_i = msg_i
+        if isinstance(messages[msg_i], BaseModel):
+            messages[msg_i] = dict(messages[msg_i])  # type: ignore
         ## MERGE CONSECUTIVE USER CONTENT ##
         while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
-            if isinstance(messages[msg_i]["content"], list):
-                for m in messages[msg_i]["content"]:
+            user_message_types_block: Union[
+                ChatCompletionToolMessage,
+                ChatCompletionUserMessage,
+                ChatCompletionFunctionMessage,
+            ] = messages[
+                msg_i
+            ]  # type: ignore
+            if user_message_types_block["content"] and isinstance(
+                user_message_types_block["content"], list
+            ):
+                for m in user_message_types_block["content"]:
                     if m.get("type", "") == "image_url":
                         image_chunk = convert_to_anthropic_image_obj(
                             m["image_url"]["url"]
@@ -1382,15 +1399,24 @@
                         )
                         user_content.append(anthropic_content_element)
             elif (
-                messages[msg_i]["role"] == "tool"
-                or messages[msg_i]["role"] == "function"
+                user_message_types_block["role"] == "tool"
+                or user_message_types_block["role"] == "function"
             ):
                 # OpenAI's tool message content will always be a string
-                user_content.append(convert_to_anthropic_tool_result(messages[msg_i]))
-            else:
                 user_content.append(
-                    {"type": "text", "text": messages[msg_i]["content"]}
+                    convert_to_anthropic_tool_result(user_message_types_block)
                 )
+            elif isinstance(user_message_types_block["content"], str):
+                _anthropic_content_text_element: AnthropicMessagesTextParam = {
+                    "type": "text",
+                    "text": user_message_types_block["content"],
+                }
+                anthropic_content_element = add_cache_control_to_content(
+                    anthropic_content_element=_anthropic_content_text_element,
+                    orignal_content_element=user_message_types_block,
+                )
+                user_content.append(anthropic_content_element)

             msg_i += 1
@@ -1400,10 +1426,11 @@
         assistant_content: List[AnthropicMessagesAssistantMessageValues] = []
         ## MERGE CONSECUTIVE ASSISTANT CONTENT ##
         while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
-            if "content" in messages[msg_i] and isinstance(
-                messages[msg_i]["content"], list
+            assistant_content_block: ChatCompletionAssistantMessage = messages[msg_i]  # type: ignore
+            if "content" in assistant_content_block and isinstance(
+                assistant_content_block["content"], list
             ):
-                for m in messages[msg_i]["content"]:
+                for m in assistant_content_block["content"]:
                     # handle text
                     if (
                         m.get("type", "") == "text" and len(m.get("text", "")) > 0
@@ -1417,35 +1444,37 @@
                         )
                         assistant_content.append(anthropic_message)
             elif (
-                "content" in messages[msg_i]
-                and isinstance(messages[msg_i]["content"], str)
-                and len(messages[msg_i]["content"])
-                > 0  # don't pass empty text blocks. anthropic api raises errors.
+                "content" in assistant_content_block
+                and isinstance(assistant_content_block["content"], str)
+                and assistant_content_block[
+                    "content"
+                ]  # don't pass empty text blocks. anthropic api raises errors.
             ):
                 _anthropic_text_content_element = {
                     "type": "text",
-                    "text": messages[msg_i]["content"],
+                    "text": assistant_content_block["content"],
                 }

                 anthropic_content_element = add_cache_control_to_content(
                     anthropic_content_element=_anthropic_text_content_element,
-                    orignal_content_element=messages[msg_i],
+                    orignal_content_element=assistant_content_block,
                 )
                 assistant_content.append(anthropic_content_element)

-            if messages[msg_i].get(
-                "tool_calls", []
+            assistant_tool_calls = assistant_content_block.get("tool_calls")
+            if (
+                assistant_tool_calls is not None
             ):  # support assistant tool invoke conversion
                 assistant_content.extend(
-                    convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"])
+                    convert_to_anthropic_tool_invoke(assistant_tool_calls)
                 )

-            if messages[msg_i].get("function_call"):
+            assistant_function_call = assistant_content_block.get("function_call")
+            if assistant_function_call is not None:
                 assistant_content.extend(
-                    convert_function_to_anthropic_tool_invoke(
-                        messages[msg_i]["function_call"]
-                    )
+                    convert_function_to_anthropic_tool_invoke(assistant_function_call)
                 )

             msg_i += 1

View file

@@ -491,14 +491,9 @@ class CodestralTextCompletion(BaseLLM):
                 message="HTTPStatusError - {}".format(e.response.text),
             )
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.llms.text_completion_codestral.py::async_completion() - Exception occurred - {}".format(
-                    str(e)
-                )
-            )
             raise TextCompletionCodestralError(
                 status_code=500, message="{}".format(str(e))
-            )
+            )  # don't use verbose_logger.exception, if exception is raised
         return self.process_text_completion_response(
             model=model,
             response=response,

View file

@@ -445,9 +445,6 @@ async def acompletion(
         )  # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
         return response
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.main.py::acompletion() - Exception occurred - {}".format(str(e))
-        )
         custom_llm_provider = custom_llm_provider or "openai"
         raise exception_type(
             model=model,
@@ -616,9 +613,6 @@ def mock_completion(
     except Exception as e:
         if isinstance(e, openai.APIError):
             raise e
-        verbose_logger.exception(
-            "litellm.mock_completion(): Exception occured - {}".format(str(e))
-        )
         raise Exception("Mock completion response failed")
@@ -5125,9 +5119,6 @@ async def ahealth_check(
         response = {}  # args like remaining ratelimit etc.
         return response
     except Exception as e:
-        verbose_logger.exception(
-            "litellm.ahealth_check(): Exception occured - {}".format(str(e))
-        )
         stack_trace = traceback.format_exc()
         if isinstance(stack_trace, str):
             stack_trace = stack_trace[:1000]

View file

@@ -1,6 +1,16 @@
 model_list:
-  - model_name: "whisper"
+  - model_name: gpt-4o-mini-2024-07-18
     litellm_params:
-      model: "azure/azure-whisper"
-      api_key: os.environ/AZURE_EUROPE_API_KEY
-      api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+      api_key: API_KEY
+      model: openai/gpt-4o-mini-2024-07-18
+      rpm: 0
+      tpm: 100
+
+router_settings:
+  num_retries: 0
+  routing_strategy: usage-based-routing-v2
+  timeout: 10
+
+litellm_settings:
+  callbacks: custom_callbacks.proxy_handler_instance

View file

@@ -386,7 +386,6 @@ async def user_api_key_auth(
                 parent_otel_span=parent_otel_span,
             )
         #### ELSE ####
-        ## CHECK PASS-THROUGH ENDPOINTS ##
         if pass_through_endpoints is not None:
             for endpoint in pass_through_endpoints:

View file

@@ -1,66 +1,10 @@
 from litellm.integrations.custom_logger import CustomLogger
-import litellm

-# This file includes the custom callbacks for LiteLLM Proxy
-# Once defined, these can be passed in proxy_config.yaml

 class MyCustomHandler(CustomLogger):
-    def log_pre_api_call(self, model, messages, kwargs):
-        print(f"Pre-API Call")  # noqa
-
-    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
-        print(f"Post-API Call")  # noqa
-
-    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"On Stream")  # noqa
-
-    def log_success_event(self, kwargs, response_obj, start_time, end_time):
-        print("On Success")  # noqa
-
-    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"On Failure")  # noqa
-
-    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
-        print(f"ishaan async_log_success_event")  # noqa
-        # log: key, user, model, prompt, response, tokens, cost
-        # Access kwargs passed to litellm.completion()
-        model = kwargs.get("model", None)
-        messages = kwargs.get("messages", None)
-        user = kwargs.get("user", None)
-
-        # Access litellm_params passed to litellm.completion(), example access `metadata`
-        litellm_params = kwargs.get("litellm_params", {})
-        metadata = litellm_params.get(
-            "metadata", {}
-        )  # headers passed to LiteLLM proxy, can be found here
-        return
-
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        try:
-            print(f"On Async Failure !")  # noqa
-            print("\nkwargs", kwargs)  # noqa
-            # Access kwargs passed to litellm.completion()
-            model = kwargs.get("model", None)
-            messages = kwargs.get("messages", None)
-            user = kwargs.get("user", None)
-
-            # Access litellm_params passed to litellm.completion(), example access `metadata`
-            litellm_params = kwargs.get("litellm_params", {})
-            metadata = litellm_params.get(
-                "metadata", {}
-            )  # headers passed to LiteLLM proxy, can be found here
-
-            # Acess Exceptions & Traceback
-            exception_event = kwargs.get("exception", None)
-            traceback_event = kwargs.get("traceback_exception", None)
-
-            # Calculate cost using litellm.completion_cost()
-        except Exception as e:
-            print(f"Exception: {e}")  # noqa
+        # print("Call failed")
+        pass


 proxy_handler_instance = MyCustomHandler()
-
-# Set litellm.callbacks = [proxy_handler_instance] on the proxy
-# need to set litellm.callbacks = [proxy_handler_instance] # on the proxy

View file

@@ -6183,6 +6183,64 @@ async def delete_end_user(
     pass


+@router.get(
+    "/customer/list",
+    tags=["Customer Management"],
+    dependencies=[Depends(user_api_key_auth)],
+    response_model=List[LiteLLM_EndUserTable],
+)
+@router.get(
+    "/end_user/list",
+    tags=["Customer Management"],
+    include_in_schema=False,
+    dependencies=[Depends(user_api_key_auth)],
+)
+async def list_team(
+    http_request: Request,
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+):
+    """
+    [Admin-only] List all available customers
+
+    ```
+    curl --location --request GET 'http://0.0.0.0:4000/customer/list' \
+        --header 'Authorization: Bearer sk-1234'
+    ```
+    """
+    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
+        create_audit_log_for_update,
+        litellm_proxy_admin_name,
+        prisma_client,
+    )
+
+    if (
+        user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
+        and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
+    ):
+        raise HTTPException(
+            status_code=401,
+            detail={
+                "error": "Admin-only endpoint. Your user role={}".format(
+                    user_api_key_dict.user_role
+                )
+            },
+        )
+
+    if prisma_client is None:
+        raise HTTPException(
+            status_code=400,
+            detail={"error": CommonProxyErrors.db_not_connected_error.value},
+        )
+
+    response = await prisma_client.db.litellm_endusertable.find_many()
+
+    returned_response: List[LiteLLM_EndUserTable] = []
+    for item in response:
+        returned_response.append(LiteLLM_EndUserTable(**item.model_dump()))
+    return returned_response
+
+
 async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
     if premium_user is not True:
         return

View file

@@ -47,6 +47,7 @@ from litellm._logging import verbose_router_logger
 from litellm.assistants.main import AssistantDeleted
 from litellm.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.llms.azure import get_azure_ad_token_from_oidc
 from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
 from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
@@ -783,6 +784,10 @@ class Router:
                 }
             )

+            logging_obj: Optional[LiteLLMLogging] = kwargs.get(
+                "litellm_logging_obj", None
+            )
+
             rpm_semaphore = self._get_client(
                 deployment=deployment,
                 kwargs=kwargs,
@@ -797,11 +802,13 @@
                     - If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
                     """
                     await self.async_routing_strategy_pre_call_checks(
-                        deployment=deployment
+                        deployment=deployment, logging_obj=logging_obj
                     )
                     response = await _response
             else:
-                await self.async_routing_strategy_pre_call_checks(deployment=deployment)
+                await self.async_routing_strategy_pre_call_checks(
+                    deployment=deployment, logging_obj=logging_obj
+                )
                 response = await _response

             ## CHECK CONTENT FILTER ERROR ##
@@ -3860,7 +3867,9 @@
             if isinstance(_callback, CustomLogger):
                 response = _callback.pre_call_check(deployment)

-    async def async_routing_strategy_pre_call_checks(self, deployment: dict):
+    async def async_routing_strategy_pre_call_checks(
+        self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
+    ):
         """
         For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
@@ -3875,8 +3884,22 @@
         for _callback in litellm.callbacks:
             if isinstance(_callback, CustomLogger):
                 try:
-                    response = await _callback.async_pre_call_check(deployment)
+                    _ = await _callback.async_pre_call_check(deployment)
                 except litellm.RateLimitError as e:
+                    ## LOG FAILURE EVENT
+                    if logging_obj is not None:
+                        asyncio.create_task(
+                            logging_obj.async_failure_handler(
+                                exception=e,
+                                traceback_exception=traceback.format_exc(),
+                                end_time=time.time(),
+                            )
+                        )
+                        ## LOGGING
+                        threading.Thread(
+                            target=logging_obj.failure_handler,
+                            args=(e, traceback.format_exc()),
+                        ).start()  # log response
                     self._set_cooldown_deployments(
                         exception_status=e.status_code,
                         original_exception=e,
@@ -3885,6 +3908,20 @@
                     )
                     raise e
                 except Exception as e:
+                    ## LOG FAILURE EVENT
+                    if logging_obj is not None:
+                        asyncio.create_task(
+                            logging_obj.async_failure_handler(
+                                exception=e,
+                                traceback_exception=traceback.format_exc(),
+                                end_time=time.time(),
+                            )
+                        )
+                        ## LOGGING
+                        threading.Thread(
+                            target=logging_obj.failure_handler,
+                            args=(e, traceback.format_exc()),
+                        ).start()  # log response
                     raise e

     def _generate_model_id(self, model_group: str, litellm_params: dict):

View file

@@ -222,6 +222,94 @@ async def test_anthropic_api_prompt_caching_basic():
     )


+@pytest.mark.asyncio()
+async def test_anthropic_api_prompt_caching_with_content_str():
+    from litellm.llms.prompt_templates.factory import anthropic_messages_pt
+
+    system_message = [
+        {
+            "role": "system",
+            "content": "Here is the full text of a complex legal agreement",
+            "cache_control": {"type": "ephemeral"},
+        },
+    ]
+    translated_system_message = litellm.AnthropicConfig().translate_system_message(
+        messages=system_message
+    )
+
+    assert translated_system_message == [
+        # System Message
+        {
+            "type": "text",
+            "text": "Here is the full text of a complex legal agreement",
+            "cache_control": {"type": "ephemeral"},
+        }
+    ]
+
+    user_messages = [
+        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
+        {
+            "role": "user",
+            "content": "What are the key terms and conditions in this agreement?",
+            "cache_control": {"type": "ephemeral"},
+        },
+        {
+            "role": "assistant",
+            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+        },
+        # The final turn is marked with cache-control, for continuing in followups.
+        {
+            "role": "user",
+            "content": "What are the key terms and conditions in this agreement?",
+            "cache_control": {"type": "ephemeral"},
+        },
+    ]
+
+    translated_messages = anthropic_messages_pt(
+        messages=user_messages,
+        model="claude-3-5-sonnet-20240620",
+        llm_provider="anthropic",
+    )
+
+    expected_messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What are the key terms and conditions in this agreement?",
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+                }
+            ],
+        },
+        # The final turn is marked with cache-control, for continuing in followups.
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What are the key terms and conditions in this agreement?",
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ],
+        },
+    ]
+
+    assert len(translated_messages) == len(expected_messages)
+    for idx, i in enumerate(translated_messages):
+        assert (
+            i == expected_messages[idx]
+        ), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
+
+
 @pytest.mark.asyncio()
 async def test_anthropic_api_prompt_caching_no_headers():
     litellm.set_verbose = True

View file

@@ -616,8 +616,8 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth():
             aws_region_name=aws_region_name,
             aws_web_identity_token=aws_web_identity_token,
             aws_role_name=aws_role_name,
-            aws_session_name="my-test-session",
-            aws_sts_endpoint="https://sts-fips.us-west-2.amazonaws.com",
+            aws_session_name="cross-region-test",
+            aws_sts_endpoint="https://sts-fips.us-east-2.amazonaws.com",
             aws_bedrock_runtime_endpoint="https://bedrock-runtime-fips.us-west-2.amazonaws.com",
         )
         # Add any assertions here to check the response

View file

@@ -3,6 +3,8 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 from pydantic import BaseModel, validator
 from typing_extensions import Literal, Required, TypedDict

+from .openai import ChatCompletionCachedContent
+

 class AnthropicMessagesToolChoice(TypedDict, total=False):
     type: Required[Literal["auto", "any", "tool"]]
@@ -18,7 +20,7 @@ class AnthropicMessagesTool(TypedDict, total=False):
 class AnthropicMessagesTextParam(TypedDict, total=False):
     type: Literal["text"]
     text: str
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]


 class AnthropicMessagesToolUseParam(TypedDict):
@@ -58,7 +60,7 @@ class AnthropicImageParamSource(TypedDict):
 class AnthropicMessagesImageParam(TypedDict, total=False):
     type: Literal["image"]
     source: AnthropicImageParamSource
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]


 class AnthropicMessagesToolResultContent(TypedDict):
@@ -97,7 +99,7 @@ class AnthropicMetadata(TypedDict, total=False):
 class AnthropicSystemMessageContent(TypedDict, total=False):
     type: str
     text: str
-    cache_control: Optional[dict]
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]


 class AnthropicMessagesRequest(TypedDict, total=False):

View file

@@ -354,14 +354,18 @@ class ChatCompletionImageObject(TypedDict):
     image_url: ChatCompletionImageUrlObject


-class ChatCompletionUserMessage(TypedDict):
+class OpenAIChatCompletionUserMessage(TypedDict):
     role: Literal["user"]
     content: Union[
         str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
     ]


-class ChatCompletionAssistantMessage(TypedDict, total=False):
+class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
+class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
     role: Required[Literal["assistant"]]
     content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
     name: Optional[str]
@@ -369,6 +373,10 @@ class ChatCompletionAssistantMessage(TypedDict, total=False):
     function_call: Optional[ChatCompletionToolCallFunctionChunk]


+class ChatCompletionAssistantMessage(OpenAIChatCompletionAssistantMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
 class ChatCompletionToolMessage(TypedDict):
     role: Literal["tool"]
     content: str
@@ -381,12 +389,16 @@ class ChatCompletionFunctionMessage(TypedDict):
     name: str


-class ChatCompletionSystemMessage(TypedDict, total=False):
+class OpenAIChatCompletionSystemMessage(TypedDict, total=False):
     role: Required[Literal["system"]]
     content: Required[Union[str, List]]
     name: str


+class ChatCompletionSystemMessage(OpenAIChatCompletionSystemMessage, total=False):
+    cache_control: ChatCompletionCachedContent
+
+
 AllMessageValues = Union[
     ChatCompletionUserMessage,
     ChatCompletionAssistantMessage,

View file

@@ -8547,11 +8547,6 @@ class CustomStreamWrapper:
                 "finish_reason": finish_reason,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_predibase_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e

     def handle_huggingface_chunk(self, chunk):
@@ -8595,11 +8590,6 @@
                 "finish_reason": finish_reason,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_huggingface_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e

     def handle_ai21_chunk(self, chunk):  # fake streaming
@@ -8826,11 +8816,6 @@
                 "usage": usage,
             }
         except Exception as e:
-            verbose_logger.exception(
-                "litellm.CustomStreamWrapper.handle_openai_chat_completion_chunk(): Exception occured - {}".format(
-                    str(e)
-                )
-            )
             raise e

     def handle_azure_text_completion_chunk(self, chunk):