diff --git a/litellm/__init__.py b/litellm/__init__.py index f95640b58..d76dd37bc 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -89,6 +89,7 @@ retry = True ### AUTH ### api_key: Optional[str] = None openai_key: Optional[str] = None +groq_key: Optional[str] = None databricks_key: Optional[str] = None azure_key: Optional[str] = None anthropic_key: Optional[str] = None @@ -892,7 +893,11 @@ ALL_LITELLM_RESPONSE_TYPES = [ from .types.utils import ImageObject from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig -from .llms.anthropic.chat import AnthropicConfig +from .llms.anthropic.chat.handler import AnthropicConfig +from .llms.anthropic.experimental_pass_through.transformation import ( + AnthropicExperimentalPassThroughConfig, +) +from .llms.groq.stt.transformation import GroqSTTConfig from .llms.anthropic.completion import AnthropicTextConfig from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig @@ -962,8 +967,8 @@ from .llms.OpenAI.openai import ( OpenAITextCompletionConfig, MistralEmbeddingConfig, DeepInfraConfig, - GroqConfig, ) +from .llms.groq.chat.transformation import GroqChatConfig from .llms.azure_ai.chat.transformation import AzureAIStudioConfig from .llms.mistral.mistral_chat_transformation import MistralConfig from .llms.OpenAI.chat.o1_transformation import ( diff --git a/litellm/adapters/anthropic_adapter.py b/litellm/adapters/anthropic_adapter.py index 1bff003be..47fba3630 100644 --- a/litellm/adapters/anthropic_adapter.py +++ b/litellm/adapters/anthropic_adapter.py @@ -34,7 +34,7 @@ class AnthropicAdapter(CustomLogger): """ request_body = AnthropicMessagesRequest(**kwargs) # type: ignore - translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai( + translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai( anthropic_message_request=request_body ) @@ -44,7 +44,7 @@ class AnthropicAdapter(CustomLogger): self, response: litellm.ModelResponse ) -> Optional[AnthropicResponse]: - return litellm.AnthropicConfig().translate_openai_response_to_anthropic( + return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic( response=response ) @@ -99,7 +99,7 @@ class AnthropicStreamWrapper(AdapterCompletionStreamWrapper): if chunk == "None" or chunk is None: raise Exception - processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic( + processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic( response=chunk ) if ( @@ -163,7 +163,7 @@ class AnthropicStreamWrapper(AdapterCompletionStreamWrapper): async for chunk in self.completion_stream: if chunk == "None" or chunk is None: raise Exception - processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic( + processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic( response=chunk ) if ( diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index d2343d429..748c904a3 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -601,7 +601,7 @@ class LangFuseLogger: "input": input if not mask_input else "redacted-by-litellm", "output": output if not mask_output else "redacted-by-litellm", "usage": usage, - "metadata": clean_metadata, + "metadata": log_requester_metadata(clean_metadata), "level": level, 
"version": clean_metadata.pop("version", None), } @@ -768,3 +768,15 @@ def log_provider_specific_information_as_span( name="vertex_ai_grounding_metadata", input=vertex_ai_grounding_metadata, ) + + +def log_requester_metadata(clean_metadata: dict): + returned_metadata = {} + requester_metadata = clean_metadata.get("requester_metadata") or {} + for k, v in clean_metadata.items(): + if k not in requester_metadata: + returned_metadata[k] = v + + returned_metadata.update({"requester_metadata": requester_metadata}) + + return returned_metadata diff --git a/litellm/litellm_core_utils/exception_mapping_utils.py b/litellm/litellm_core_utils/exception_mapping_utils.py index 5ac26c7ae..b1b378f43 100644 --- a/litellm/litellm_core_utils/exception_mapping_utils.py +++ b/litellm/litellm_core_utils/exception_mapping_utils.py @@ -1,6 +1,32 @@ import json +import os +import threading +import traceback from typing import Optional +import httpx + +import litellm +from litellm import verbose_logger + +from ..exceptions import ( + APIConnectionError, + APIError, + AuthenticationError, + BadRequestError, + BudgetExceededError, + ContentPolicyViolationError, + ContextWindowExceededError, + NotFoundError, + OpenAIError, + PermissionDeniedError, + RateLimitError, + ServiceUnavailableError, + Timeout, + UnprocessableEntityError, + UnsupportedParamsError, +) + def get_error_message(error_obj) -> Optional[str]: """ @@ -38,3 +64,2015 @@ def get_error_message(error_obj) -> Optional[str]: return None except Exception as e: return None + + +####### EXCEPTION MAPPING ################ +def _get_litellm_response_headers( + original_exception: Exception, +) -> Optional[httpx.Headers]: + """ + Extract and return the response headers from a mapped exception, if present. + + Used for accurate retry logic. + """ + _response_headers: Optional[httpx.Headers] = None + try: + _response_headers = getattr( + original_exception, "litellm_response_headers", None + ) + except Exception: + return None + + return _response_headers + + +def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]: + """ + Extract and return the response headers from an exception, if present. + + Used for accurate retry logic. + """ + _response_headers: Optional[httpx.Headers] = None + try: + _response_headers = getattr(original_exception, "headers", None) + error_response = getattr(original_exception, "response", None) + if _response_headers is None and error_response: + _response_headers = getattr(error_response, "headers", None) + except Exception: + return None + + return _response_headers + + +def exception_type( # type: ignore + model, + original_exception, + custom_llm_provider, + completion_kwargs={}, + extra_kwargs={}, +): + + if any( + isinstance(original_exception, exc_type) + for exc_type in litellm.LITELLM_EXCEPTION_TYPES + ): + return original_exception + exception_mapping_worked = False + exception_provider = custom_llm_provider + if litellm.suppress_debug_info is False: + print() # noqa + print( # noqa + "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa + ) # noqa + print( # noqa + "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." 
# noqa + ) # noqa + print() # noqa + + litellm_response_headers = _get_response_headers( + original_exception=original_exception + ) + try: + if model: + if hasattr(original_exception, "message"): + error_str = str(original_exception.message) + else: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + + ################################################################################ + # Common Extra information needed for all providers + # We pass num retries, api_base, vertex_deployment etc to the exception here + ################################################################################ + extra_information = "" + try: + _api_base = litellm.get_api_base( + model=model, optional_params=extra_kwargs + ) + messages = litellm.get_first_chars_messages(kwargs=completion_kwargs) + _vertex_project = extra_kwargs.get("vertex_project") + _vertex_location = extra_kwargs.get("vertex_location") + _metadata = extra_kwargs.get("metadata", {}) or {} + _model_group = _metadata.get("model_group") + _deployment = _metadata.get("deployment") + extra_information = f"\nModel: {model}" + + if ( + isinstance(custom_llm_provider, str) + and len(custom_llm_provider) > 0 + ): + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + + if _api_base: + extra_information += f"\nAPI Base: `{_api_base}`" + if ( + messages + and len(messages) > 0 + and litellm.redact_messages_in_exceptions is False + ): + extra_information += f"\nMessages: `{messages}`" + + if _model_group is not None: + extra_information += f"\nmodel_group: `{_model_group}`\n" + if _deployment is not None: + extra_information += f"\ndeployment: `{_deployment}`\n" + if _vertex_project is not None: + extra_information += f"\nvertex_project: `{_vertex_project}`\n" + if _vertex_location is not None: + extra_information += f"\nvertex_location: `{_vertex_location}`\n" + + # on litellm proxy add key name + team to exceptions + extra_information = _add_key_name_and_team_to_alert( + request_info=extra_information, metadata=_metadata + ) + except Exception: + # DO NOT LET this Block raising the original exception + pass + + ################################################################################ + # End of Common Extra information Needed for all providers + ################################################################################ + + ################################################################################ + #################### Start of Provider Exception mapping #################### + ################################################################################ + + if "Request Timeout Error" in error_str or "Request timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"APITimeoutError - Request timed out. 
\nerror_str: {error_str}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + + if ( + custom_llm_provider == "openai" + or custom_llm_provider == "text-completion-openai" + or custom_llm_provider == "custom_openai" + or custom_llm_provider in litellm.openai_compatible_providers + ): + # custom_llm_provider is openai, make it OpenAI + message = get_error_message(error_obj=original_exception) + if message is None: + if hasattr(original_exception, "message"): + message = original_exception.message + else: + message = str(original_exception) + + if message is not None and isinstance( + message, str + ): # done to prevent user-confusion. Relevant issue - https://github.com/BerriAI/litellm/issues/1414 + message = message.replace("OPENAI", custom_llm_provider.upper()) + message = message.replace( + "openai.OpenAIError", + "{}.{}Error".format(custom_llm_provider, custom_llm_provider), + ) + if custom_llm_provider == "openai": + exception_provider = "OpenAI" + "Exception" + else: + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + + if ( + "This model's maximum context length is" in error_str + or "string too long. Expected a string with maximum length" + in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ContextWindowExceededError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif ( + "invalid_request_error" in error_str + and "model_not_found" in error_str + ): + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif "A timeout occurred" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"ContentPolicyViolationError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif ( + "invalid_request_error" in error_str + and "Incorrect API key provided" not in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif "Web server is returning an unknown error" in error_str: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + elif "Request too large" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"RateLimitError: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" + in error_str + 
): + exception_mapping_worked = True + raise AuthenticationError( + message=f"AuthenticationError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif "Mistral API raised a streaming error" in error_str: + exception_mapping_worked = True + _request = httpx.Request( + method="POST", url="https://api.openai.com/v1" + ) + raise APIError( + status_code=500, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=_request, + litellm_debug_info=extra_information, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AuthenticationError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"NotFoundError: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"Timeout Error: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=getattr(original_exception, "response", None), + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"RateLimitError: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=getattr(original_exception, "response", None), + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"ServiceUnavailableError: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=getattr(original_exception, "response", None), + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"Timeout Error: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"APIError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=getattr(original_exception, "request", None), + litellm_debug_info=extra_information, + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors 
+ # exception_mapping_worked = True + raise APIConnectionError( + message=f"APIConnectionError: {exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + litellm_debug_info=extra_information, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ) + elif custom_llm_provider == "anthropic": # one of the anthropics + if "prompt is too long" in error_str or "prompt: length" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message="AnthropicError - {}".format(error_str), + model=model, + llm_provider="anthropic", + ) + if "Invalid API Key" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message="AnthropicError - {}".format(error_str), + model=model, + llm_provider="anthropic", + ) + if "content filtering policy" in error_str: + exception_mapping_worked = True + raise ContentPolicyViolationError( + message="AnthropicError - {}".format(error_str), + model=model, + llm_provider="anthropic", + ) + if "Client error '400 Bad Request'" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message="AnthropicError - {}".format(error_str), + model=model, + llm_provider="anthropic", + ) + if hasattr(original_exception, "status_code"): + verbose_logger.debug( + f"status_code: {original_exception.status_code}" + ) + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {error_str}", + llm_provider="anthropic", + model=model, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"AnthropicException - {error_str}", + model=model, + llm_provider="anthropic", + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"AnthropicException - {error_str}", + model=model, + llm_provider="anthropic", + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AnthropicException - {error_str}", + model=model, + llm_provider="anthropic", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {error_str}", + llm_provider="anthropic", + model=model, + ) + elif ( + original_exception.status_code == 500 + or original_exception.status_code == 529 + ): + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"AnthropicException - {error_str}. Handle with `litellm.InternalServerError`.", + llm_provider="anthropic", + model=model, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise litellm.ServiceUnavailableError( + message=f"AnthropicException - {error_str}. 
Handle with `litellm.ServiceUnavailableError`.", + llm_provider="anthropic", + model=model, + ) + elif custom_llm_provider == "replicate": + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif "input is too long" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise UnprocessableEntityError( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise UnprocessableEntityError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + exception_mapping_worked = True + raise APIError( + status_code=500, + message=f"ReplicateException - {str(original_exception)}", + llm_provider="replicate", + model=model, + request=httpx.Request( + method="POST", + url="https://api.replicate.com/v1/deployments", + ), + ) + elif custom_llm_provider == "watsonx": + if "token_quota_reached" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"WatsonxException: Rate Limit Errror - {error_str}", + llm_provider="watsonx", + model=model, + response=original_exception.response, + ) + elif ( + 
custom_llm_provider == "predibase" + or custom_llm_provider == "databricks" + ): + if "authorization denied for" in error_str: + exception_mapping_worked = True + + # Predibase returns the raw API Key in the response - this block ensures it's not returned in the exception + if ( + error_str is not None + and isinstance(error_str, str) + and "bearer" in error_str.lower() + ): + # only keep the first 10 chars after the occurnence of "bearer" + _bearer_token_start_index = error_str.lower().find("bearer") + error_str = error_str[: _bearer_token_start_index + 14] + error_str += "XXXXXXX" + '"' + + raise AuthenticationError( + message=f"{custom_llm_provider}Exception: Authentication Error - {error_str}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + elif ( + original_exception.status_code == 401 + or original_exception.status_code == 403 + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif ( + original_exception.status_code == 422 + or original_exception.status_code == 424 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"{custom_llm_provider}Exception - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif custom_llm_provider == "bedrock": + if ( + "too many tokens" in error_str + or "expected maxLength:" in error_str + or "Input is too long" in error_str + or "prompt: length: 1.." 
in error_str + or "Too many input tokens" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"BedrockException: Context Window Error - {error_str}", + model=model, + llm_provider="bedrock", + ) + elif "Malformed input request" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + elif "A conversation must start with a user message." in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {error_str}\n. Pass in default user message via `completion(..,user_continue_message=)` or enable `litellm.modify_params=True`.\nFor Proxy: do via `litellm_settings::modify_params: True` or user_continue_message under `litellm_params`", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + elif ( + "Unable to locate credentials" in error_str + or "The security token included in the request is invalid" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"BedrockException Invalid Authentication - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + elif "AccessDeniedException" in error_str: + exception_mapping_worked = True + raise PermissionDeniedError( + message=f"BedrockException PermissionDeniedError - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + elif ( + "throttlingException" in error_str + or "ThrottlingException" in error_str + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"BedrockException: Rate Limit Error - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + elif ( + "Connect timeout on endpoint URL" in error_str + or "timed out" in error_str + ): + exception_mapping_worked = True + raise Timeout( + message=f"BedrockException: Timeout Error - {error_str}", + model=model, + llm_provider="bedrock", + ) + elif "Could not process image" in error_str: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"BedrockException - {error_str}", + model=model, + llm_provider="bedrock", + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ), + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + 
exception_mapping_worked = True + raise Timeout( + message=f"BedrockException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"BedrockException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"BedrockException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"BedrockException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif custom_llm_provider == "sagemaker": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"litellm.BadRequestError: SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif ( + "Input validation error: `best_of` must be > 0 and <= 2" + in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif ( + "`inputs` tokens + `max_new_tokens` must be <=" in error_str + or "instance type with more CPU capacity or memory" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif ( + custom_llm_provider == "vertex_ai" + or custom_llm_provider == "vertex_ai_beta" + or custom_llm_provider == "gemini" + ): + if ( + "Vertex AI API has not been used in project" in error_str + or "Unable to find your project" in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"litellm.BadRequestError: VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=httpx.Response( + status_code=400, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + litellm_debug_info=extra_information, + ) + if "400 Request payload size exceeds" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"VertexException - {error_str}", + model=model, + llm_provider=custom_llm_provider, + ) + elif ( + "None Unknown Error." in error_str + or "Content has no parts." 
in error_str + ): + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"litellm.InternalServerError: VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=httpx.Response( + status_code=500, + content=str(original_exception), + request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore + ), + litellm_debug_info=extra_information, + ) + elif "API key not valid." in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"{custom_llm_provider}Exception - {error_str}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif "403" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"VertexAIException BadRequestError - {error_str}", + model=model, + llm_provider="vertex_ai", + response=httpx.Response( + status_code=403, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + litellm_debug_info=extra_information, + ) + elif ( + "The response was blocked." in error_str + or "Output blocked by content filtering policy" + in error_str # anthropic on vertex ai + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"VertexAIException ContentPolicyViolationError - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + response=httpx.Response( + status_code=400, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + elif ( + "429 Quota exceeded" in error_str + or "Quota exceeded for" in error_str + or "IndexError: list index out of range" in error_str + or "429 Unable to submit request because the service is temporarily out of capacity." 
+ in error_str + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"litellm.RateLimitError: VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + elif "500 Internal Server Error" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"litellm.ServiceUnavailableError: VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"VertexAIException BadRequestError - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + response=httpx.Response( + status_code=400, + request=httpx.Request( + method="POST", + url="https://cloud.google.com/vertex-ai/", + ), + ), + ) + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"VertexAIException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + if original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"VertexAIException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + if original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"VertexAIException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + + if original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"litellm.RateLimitError: VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + if original_exception.status_code == 500: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"VertexAIException InternalServerError - {error_str}", + model=model, + llm_provider="vertex_ai", + litellm_debug_info=extra_information, + response=httpx.Response( + status_code=500, + content=str(original_exception), + request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore + ), + ) + if original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"VertexAIException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + ) + elif custom_llm_provider == "palm" or custom_llm_provider == "gemini": + if "503 Getting metadata" in error_str: + # auth errors look like this + # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate. + exception_mapping_worked = True + raise BadRequestError( + message=f"GeminiException - Invalid api key", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + if ( + "504 Deadline expired before operation could complete." 
in error_str + or "504 Deadline Exceeded" in error_str + ): + exception_mapping_worked = True + raise Timeout( + message=f"GeminiException - {original_exception.message}", + model=model, + llm_provider="palm", + ) + if "400 Request payload size exceeds" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"GeminiException - {error_str}", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + if ( + "500 An internal error has occurred." in error_str + or "list index out of range" in error_str + ): + exception_mapping_worked = True + raise APIError( + status_code=getattr(original_exception, "status_code", 500), + message=f"GeminiException - {original_exception.message}", + llm_provider="palm", + model=model, + request=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"GeminiException - {error_str}", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes + elif custom_llm_provider == "cloudflare": + if "Authentication error" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"Cloudflare Exception - {original_exception.message}", + llm_provider="cloudflare", + model=model, + response=original_exception.response, + ) + if "must have required property" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"Cloudflare Exception - {original_exception.message}", + llm_provider="cloudflare", + model=model, + response=original_exception.response, + ) + elif ( + custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat" + ): # Cohere + if ( + "invalid api token" in error_str + or "No API key provided." 
in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "too many tokens" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"CohereException - {original_exception.message}", + model=model, + llm_provider="cohere", + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if ( + original_exception.status_code == 400 + or original_exception.status_code == 498 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif ( + "CohereConnectionError" in exception_type + ): # cohere seems to fire these errors when we load test it (1k+ messages / min) + exception_mapping_worked = True + raise RateLimitError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "invalid type:" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "Unexpected server error" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + else: + if hasattr(original_exception, "status_code"): + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + request=original_exception.request, + ) + raise original_exception + elif custom_llm_provider == "huggingface": + if "length limit exceeded" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=error_str, + model=model, + llm_provider="huggingface", + response=original_exception.response, + ) + elif "A valid user token is required" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=error_str, + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + elif "Rate limit reached" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=error_str, + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise 
BadRequestError( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "ai21": + if hasattr(original_exception, "message"): + if "Prompt has too many tokens" in original_exception.message: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + if "Bad or missing API token." in original_exception.message: + exception_mapping_worked = True + raise BadRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + ) + if original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "nlp_cloud": + if "detail" in error_str: + if "Input text length should not exceed" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + response=original_exception.response, + ) + elif "value is not a valid" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + 
response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=500, + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + request=original_exception.request, + ) + if hasattr( + original_exception, "status_code" + ): # https://docs.nlpcloud.com/?shell#errors + if ( + original_exception.status_code == 400 + or original_exception.status_code == 406 + or original_exception.status_code == 413 + or original_exception.status_code == 422 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 401 + or original_exception.status_code == 403 + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 522 + or original_exception.status_code == 524 + ): + exception_mapping_worked = True + raise Timeout( + message=f"NLPCloudException - {original_exception.message}", + model=model, + llm_provider="nlp_cloud", + ) + elif ( + original_exception.status_code == 429 + or original_exception.status_code == 402 + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 500 + or original_exception.status_code == 503 + ): + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + request=original_exception.request, + ) + elif ( + original_exception.status_code == 504 + or original_exception.status_code == 520 + ): + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"NLPCloudException - {original_exception.message}", + model=model, + llm_provider="nlp_cloud", + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "together_ai": + try: + error_response = json.loads(error_str) + except Exception: + error_response = {"error": error_str} + if ( + "error" in error_response + and "`inputs` tokens + `max_new_tokens` must be <=" + in error_response["error"] + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif ( + "error" in error_response + and "invalid private key" in error_response["error"] + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"TogetherAIException - {error_response['error']}", + llm_provider="together_ai", + model=model, + response=original_exception.response, + ) + elif ( + "error" in error_response + and "INVALID_ARGUMENT" in error_response["error"] + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + 
model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif "A timeout occurred" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"TogetherAIException - {error_str}", + model=model, + llm_provider="together_ai", + ) + elif ( + "error" in error_response + and "API key doesn't match expected format." + in error_response["error"] + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif ( + "error_type" in error_response + and error_response["error_type"] == "validation" + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"TogetherAIException - {original_exception.message}", + model=model, + llm_provider="together_ai", + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 524: + exception_mapping_worked = True + raise Timeout( + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "aleph_alpha": + if ( + "This is longer than the model's maximum context length" + in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif "InvalidToken" in error_str or "No token provided" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + verbose_logger.debug( + f"status code: {original_exception.status_code}" + ) + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AlephAlphaException - 
{original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + raise original_exception + raise original_exception + elif ( + custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat" + ): + if isinstance(original_exception, dict): + error_str = original_exception.get("error", "") + else: + error_str = str(original_exception) + if "no such file or directory" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", + model=model, + llm_provider="ollama", + response=original_exception.response, + ) + elif "Failed to establish a new connection" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + response=original_exception.response, + ) + elif "Invalid response object from API" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + response=original_exception.response, + ) + elif "Read timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + ) + elif custom_llm_provider == "vllm": + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 0: + exception_mapping_worked = True + raise APIConnectionError( + message=f"VLLMException - {original_exception.message}", + llm_provider="vllm", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "azure" or custom_llm_provider == "azure_text": + message = get_error_message(error_obj=original_exception) + if message is None: + if hasattr(original_exception, "message"): + message = original_exception.message + else: + message = str(original_exception) + + if "Internal server error" in error_str: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"AzureException Internal server error - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif "This model's maximum context length is" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AzureException ContextWindowExceededError - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif "DeploymentNotFound" in error_str: + exception_mapping_worked = True + raise NotFoundError( + message=f"AzureException NotFoundError - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif ( + ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ) + or ( + "The response was filtered due to the prompt triggering Azure OpenAI's content management" + in error_str + ) + or "Your task failed as a result of our safety system" in error_str + or "The model produced invalid content" in error_str + or "content_filter_policy" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"litellm.ContentPolicyViolationError: 
AzureException - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif "invalid_request_error" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"AzureException BadRequestError - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} AuthenticationError - {message}", + llm_provider=custom_llm_provider, + model=model, + litellm_debug_info=extra_information, + ) + elif "Connection error" in error_str: + exception_mapping_worked = True + raise APIConnectionError( + message=f"{exception_provider} APIConnectionError - {message}", + llm_provider=custom_llm_provider, + model=model, + litellm_debug_info=extra_information, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"AzureException - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AzureException AuthenticationError - {message}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AzureException Timeout - {message}", + model=model, + litellm_debug_info=extra_information, + llm_provider="azure", + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"AzureException BadRequestError - {message}", + model=model, + llm_provider="azure", + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AzureException RateLimitError - {message}", + model=model, + llm_provider="azure", + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AzureException ServiceUnavailableError - {message}", + model=model, + llm_provider="azure", + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"AzureException Timeout - {message}", + model=model, + litellm_debug_info=extra_information, + llm_provider="azure", + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AzureException APIError - {message}", + llm_provider="azure", + litellm_debug_info=extra_information, + model=model, + request=httpx.Request( + method="POST", url="https://openai.com/" + ), + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"{exception_provider} APIConnectionError - {message}\n{traceback.format_exc()}", + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + request=httpx.Request(method="POST", url="https://openai.com/"), + ) + if custom_llm_provider == "openrouter": + if hasattr(original_exception, 
"status_code"): + exception_mapping_worked = True + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {error_str}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AuthenticationError: {exception_provider} - {error_str}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"NotFoundError: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"Timeout Error: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"BadRequestError: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"RateLimitError: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"ServiceUnavailableError: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"Timeout Error: {exception_provider} - {error_str}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"APIError: {exception_provider} - {error_str}", + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + litellm_debug_info=extra_information, + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"APIConnectionError: {exception_provider} - {error_str}", + llm_provider=custom_llm_provider, + model=model, + litellm_debug_info=extra_information, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ) + if ( + "BadRequestError.__init__() missing 1 required positional argument: 'param'" + in str(original_exception) + ): # deal with edge-case invalid request error bug in openai-python sdk + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} BadRequestError : This can happen due to missing AZURE_API_VERSION: {str(original_exception)}", + model=model, + llm_provider=custom_llm_provider, + 
response=getattr(original_exception, "response", None), + ) + else: # ensure generic errors always return APIConnectionError= + """ + For unmapped exceptions - raise the exception with traceback - https://github.com/BerriAI/litellm/issues/4201 + """ + exception_mapping_worked = True + if hasattr(original_exception, "request"): + raise APIConnectionError( + message="{} - {}".format(exception_provider, error_str), + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + ) + else: + raise APIConnectionError( + message="{}\n{}".format( + str(original_exception), traceback.format_exc() + ), + llm_provider=custom_llm_provider, + model=model, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), # stub the request + ) + except Exception as e: + # LOGGING + exception_logging( + logger_fn=None, + additional_args={ + "exception_mapping_worked": exception_mapping_worked, + "original_exception": original_exception, + }, + exception=e, + ) + + # don't let an error with mapping interrupt the user from receiving an error from the llm api calls + if exception_mapping_worked: + setattr(e, "litellm_response_headers", litellm_response_headers) + raise e + else: + for error_type in litellm.LITELLM_EXCEPTION_TYPES: + if isinstance(e, error_type): + setattr(e, "litellm_response_headers", litellm_response_headers) + raise e # it's already mapped + raised_exc = APIConnectionError( + message="{}\n{}".format(original_exception, traceback.format_exc()), + llm_provider="", + model="", + ) + setattr(raised_exc, "litellm_response_headers", litellm_response_headers) + raise raised_exc + + +####### LOGGING ################### + + +def exception_logging( + additional_args={}, + logger_fn=None, + exception=None, +): + try: + model_call_details = {} + if exception: + model_call_details["exception"] = exception + model_call_details["additional_args"] = additional_args + # User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs + verbose_logger.debug( + f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}" + ) + if logger_fn and callable(logger_fn): + try: + logger_fn( + model_call_details + ) # Expectation: any logger function passed in by the user should accept a dict object + except Exception as e: + verbose_logger.debug( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + except Exception as e: + verbose_logger.debug( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + pass diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 85a2b3cd2..8b5c15ca3 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -1015,9 +1015,8 @@ class Logging: != langFuseLogger.public_key ) or ( - self.langfuse_public_key is not None - and self.langfuse_public_key - != langFuseLogger.public_key + self.langfuse_secret is not None + and self.langfuse_secret != langFuseLogger.secret_key ) or ( self.langfuse_host is not None @@ -1045,7 +1044,6 @@ class Logging: service_name="langfuse", logging_obj=temp_langfuse_logger, ) - if temp_langfuse_logger is not None: _response = temp_langfuse_logger.log_event( kwargs=kwargs, diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index e0ab26b98..d586496fc 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ 
-220,104 +220,6 @@ class DeepInfraConfig: return optional_params -class GroqConfig: - """ - Reference: https://deepinfra.com/docs/advanced/openai_api - - The class `DeepInfra` provides configuration for the DeepInfra's Chat Completions API interface. Below are the parameters: - """ - - frequency_penalty: Optional[int] = None - function_call: Optional[Union[str, dict]] = None - functions: Optional[list] = None - logit_bias: Optional[dict] = None - max_tokens: Optional[int] = None - n: Optional[int] = None - presence_penalty: Optional[int] = None - stop: Optional[Union[str, list]] = None - temperature: Optional[int] = None - top_p: Optional[int] = None - response_format: Optional[dict] = None - tools: Optional[list] = None - tool_choice: Optional[Union[str, dict]] = None - - def __init__( - self, - frequency_penalty: Optional[int] = None, - function_call: Optional[Union[str, dict]] = None, - functions: Optional[list] = None, - logit_bias: Optional[dict] = None, - max_tokens: Optional[int] = None, - n: Optional[int] = None, - presence_penalty: Optional[int] = None, - stop: Optional[Union[str, list]] = None, - temperature: Optional[int] = None, - top_p: Optional[int] = None, - response_format: Optional[dict] = None, - tools: Optional[list] = None, - tool_choice: Optional[Union[str, dict]] = None, - ) -> None: - locals_ = locals().copy() - for key, value in locals_.items(): - if key != "self" and value is not None: - setattr(self.__class__, key, value) - - @classmethod - def get_config(cls): - return { - k: v - for k, v in cls.__dict__.items() - if not k.startswith("__") - and not isinstance( - v, - ( - types.FunctionType, - types.BuiltinFunctionType, - classmethod, - staticmethod, - ), - ) - and v is not None - } - - def get_supported_openai_params_stt(self): - return [ - "prompt", - "response_format", - "temperature", - "language", - ] - - def get_supported_openai_response_formats_stt(self) -> List[str]: - return ["json", "verbose_json", "text"] - - def map_openai_params_stt( - self, - non_default_params: dict, - optional_params: dict, - model: str, - drop_params: bool, - ) -> dict: - response_formats = self.get_supported_openai_response_formats_stt() - for param, value in non_default_params.items(): - if param == "response_format": - if value in response_formats: - optional_params[param] = value - else: - if litellm.drop_params is True or drop_params is True: - pass - else: - raise litellm.utils.UnsupportedParamsError( - message="Groq doesn't support response_format={}. 
To drop unsupported openai params from the call, set `litellm.drop_params = True`".format( - value - ), - status_code=400, - ) - else: - optional_params[param] = value - return optional_params - - class OpenAIConfig: """ Reference: https://platform.openai.com/docs/api-reference/chat/create diff --git a/litellm/llms/anthropic/chat/__init__.py b/litellm/llms/anthropic/chat/__init__.py new file mode 100644 index 000000000..ae84c3b1e --- /dev/null +++ b/litellm/llms/anthropic/chat/__init__.py @@ -0,0 +1 @@ +from .handler import AnthropicChatCompletion, ModelResponseIterator diff --git a/litellm/llms/anthropic/chat.py b/litellm/llms/anthropic/chat/handler.py similarity index 50% rename from litellm/llms/anthropic/chat.py rename to litellm/llms/anthropic/chat/handler.py index cf4f23905..3603183c4 100644 --- a/litellm/llms/anthropic/chat.py +++ b/litellm/llms/anthropic/chat/handler.py @@ -71,12 +71,19 @@ from litellm.types.llms.openai import ( ChatCompletionToolParamFunctionChunk, ChatCompletionUsageBlock, ChatCompletionUserMessage, + OpenAIMessageContent, ) from litellm.types.utils import Choices, GenericStreamingChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from ..base import BaseLLM -from ..prompt_templates.factory import custom_prompt, prompt_factory +from ...base import BaseLLM +from ...prompt_templates.factory import ( + anthropic_messages_pt, + custom_prompt, + prompt_factory, +) +from ..common_utils import AnthropicError +from .transformation import AnthropicConfig class AnthropicConstants(Enum): @@ -86,558 +93,6 @@ class AnthropicConstants(Enum): # constants from https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/_constants.py -class AnthropicError(Exception): - def __init__(self, status_code: int, message): - self.status_code = status_code - self.message: str = message - self.request = httpx.Request( - method="POST", url="https://api.anthropic.com/v1/messages" - ) - self.response = httpx.Response(status_code=status_code, request=self.request) - super().__init__( - self.message - ) # Call the base class constructor with the parameters it needs - - -class AnthropicConfig: - """ - Reference: https://docs.anthropic.com/claude/reference/messages_post - - to pass metadata to anthropic, it's {"user_id": "any-relevant-information"} - """ - - max_tokens: Optional[int] = ( - 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) - ) - stop_sequences: Optional[list] = None - temperature: Optional[int] = None - top_p: Optional[int] = None - top_k: Optional[int] = None - metadata: Optional[dict] = None - system: Optional[str] = None - - def __init__( - self, - max_tokens: Optional[ - int - ] = 4096, # You can pass in a value yourself or use the default value 4096 - stop_sequences: Optional[list] = None, - temperature: Optional[int] = None, - top_p: Optional[int] = None, - top_k: Optional[int] = None, - metadata: Optional[dict] = None, - system: Optional[str] = None, - ) -> None: - locals_ = locals() - for key, value in locals_.items(): - if key != "self" and value is not None: - setattr(self.__class__, key, value) - - @classmethod - def get_config(cls): - return { - k: v - for k, v in cls.__dict__.items() - if not k.startswith("__") - and not isinstance( - v, - ( - types.FunctionType, - types.BuiltinFunctionType, - classmethod, - staticmethod, - ), - ) - and v is not None - } - - def get_supported_openai_params(self): - return [ - "stream", - "stop", - "temperature", - "top_p", - "max_tokens", - 
"max_completion_tokens", - "tools", - "tool_choice", - "extra_headers", - ] - - def get_cache_control_headers(self) -> dict: - return { - "anthropic-version": "2023-06-01", - "anthropic-beta": "prompt-caching-2024-07-31", - } - - def map_openai_params(self, non_default_params: dict, optional_params: dict): - for param, value in non_default_params.items(): - if param == "max_tokens": - optional_params["max_tokens"] = value - if param == "max_completion_tokens": - optional_params["max_tokens"] = value - if param == "tools": - optional_params["tools"] = value - if param == "tool_choice": - _tool_choice: Optional[AnthropicMessagesToolChoice] = None - if value == "auto": - _tool_choice = {"type": "auto"} - elif value == "required": - _tool_choice = {"type": "any"} - elif isinstance(value, dict): - _tool_choice = {"type": "tool", "name": value["function"]["name"]} - - if _tool_choice is not None: - optional_params["tool_choice"] = _tool_choice - if param == "stream" and value == True: - optional_params["stream"] = value - if param == "stop": - if isinstance(value, str): - if ( - value == "\n" - ) and litellm.drop_params == True: # anthropic doesn't allow whitespace characters as stop-sequences - continue - value = [value] - elif isinstance(value, list): - new_v = [] - for v in value: - if ( - v == "\n" - ) and litellm.drop_params == True: # anthropic doesn't allow whitespace characters as stop-sequences - continue - new_v.append(v) - if len(new_v) > 0: - value = new_v - else: - continue - optional_params["stop_sequences"] = value - if param == "temperature": - optional_params["temperature"] = value - if param == "top_p": - optional_params["top_p"] = value - return optional_params - - def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool: - """ - Return if {"cache_control": ..} in message content block - - Used to check if anthropic prompt caching headers need to be set. 
- """ - for message in messages: - if message["content"] is not None and isinstance(message["content"], list): - for content in message["content"]: - if "cache_control" in content: - return True - - return False - - def translate_system_message( - self, messages: List[AllMessageValues] - ) -> List[AnthropicSystemMessageContent]: - system_prompt_indices = [] - anthropic_system_message_list: List[AnthropicSystemMessageContent] = [] - for idx, message in enumerate(messages): - if message["role"] == "system": - valid_content: bool = False - system_message_block = ChatCompletionSystemMessage(**message) - if isinstance(system_message_block["content"], str): - anthropic_system_message_content = AnthropicSystemMessageContent( - type="text", - text=system_message_block["content"], - ) - if "cache_control" in system_message_block: - anthropic_system_message_content["cache_control"] = ( - system_message_block["cache_control"] - ) - anthropic_system_message_list.append( - anthropic_system_message_content - ) - valid_content = True - elif isinstance(message["content"], list): - for _content in message["content"]: - anthropic_system_message_content = ( - AnthropicSystemMessageContent( - type=_content.get("type"), - text=_content.get("text"), - ) - ) - if "cache_control" in _content: - anthropic_system_message_content["cache_control"] = ( - _content["cache_control"] - ) - - anthropic_system_message_list.append( - anthropic_system_message_content - ) - valid_content = True - - if valid_content: - system_prompt_indices.append(idx) - if len(system_prompt_indices) > 0: - for idx in reversed(system_prompt_indices): - messages.pop(idx) - - return anthropic_system_message_list - - ### FOR [BETA] `/v1/messages` endpoint support - - def translatable_anthropic_params(self) -> List: - """ - Which anthropic params, we need to translate to the openai format. 
- """ - return ["messages", "metadata", "system", "tool_choice", "tools"] - - def translate_anthropic_messages_to_openai( - self, - messages: List[ - Union[ - AnthropicMessagesUserMessageParam, - AnthopicMessagesAssistantMessageParam, - ] - ], - ) -> List: - new_messages: List[AllMessageValues] = [] - for m in messages: - user_message: Optional[ChatCompletionUserMessage] = None - tool_message_list: List[ChatCompletionToolMessage] = [] - new_user_content_list: List[ - Union[ChatCompletionTextObject, ChatCompletionImageObject] - ] = [] - ## USER MESSAGE ## - if m["role"] == "user": - ## translate user message - if isinstance(m["content"], str): - user_message = ChatCompletionUserMessage( - role="user", content=m["content"] - ) - elif isinstance(m["content"], list): - for content in m["content"]: - if content["type"] == "text": - text_obj = ChatCompletionTextObject( - type="text", text=content["text"] - ) - new_user_content_list.append(text_obj) - elif content["type"] == "image": - image_url = ChatCompletionImageUrlObject( - url=f"data:{content['type']};base64,{content['source']}" - ) - image_obj = ChatCompletionImageObject( - type="image_url", image_url=image_url - ) - - new_user_content_list.append(image_obj) - elif content["type"] == "tool_result": - if "content" not in content: - tool_result = ChatCompletionToolMessage( - role="tool", - tool_call_id=content["tool_use_id"], - content="", - ) - tool_message_list.append(tool_result) - elif isinstance(content["content"], str): - tool_result = ChatCompletionToolMessage( - role="tool", - tool_call_id=content["tool_use_id"], - content=content["content"], - ) - tool_message_list.append(tool_result) - elif isinstance(content["content"], list): - for c in content["content"]: - if c["type"] == "text": - tool_result = ChatCompletionToolMessage( - role="tool", - tool_call_id=content["tool_use_id"], - content=c["text"], - ) - tool_message_list.append(tool_result) - elif c["type"] == "image": - image_str = ( - f"data:{c['type']};base64,{c['source']}" - ) - tool_result = ChatCompletionToolMessage( - role="tool", - tool_call_id=content["tool_use_id"], - content=image_str, - ) - tool_message_list.append(tool_result) - - if user_message is not None: - new_messages.append(user_message) - - if len(new_user_content_list) > 0: - new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore - - if len(tool_message_list) > 0: - new_messages.extend(tool_message_list) - - ## ASSISTANT MESSAGE ## - assistant_message_str: Optional[str] = None - tool_calls: List[ChatCompletionAssistantToolCall] = [] - if m["role"] == "assistant": - if isinstance(m["content"], str): - assistant_message_str = m["content"] - elif isinstance(m["content"], list): - for content in m["content"]: - if content["type"] == "text": - if assistant_message_str is None: - assistant_message_str = content["text"] - else: - assistant_message_str += content["text"] - elif content["type"] == "tool_use": - function_chunk = ChatCompletionToolCallFunctionChunk( - name=content["name"], - arguments=json.dumps(content["input"]), - ) - - tool_calls.append( - ChatCompletionAssistantToolCall( - id=content["id"], - type="function", - function=function_chunk, - ) - ) - - if assistant_message_str is not None or len(tool_calls) > 0: - assistant_message = ChatCompletionAssistantMessage( - role="assistant", - content=assistant_message_str, - ) - if len(tool_calls) > 0: - assistant_message["tool_calls"] = tool_calls - new_messages.append(assistant_message) - - return new_messages - - def 
translate_anthropic_tool_choice_to_openai( - self, tool_choice: AnthropicMessagesToolChoice - ) -> ChatCompletionToolChoiceValues: - if tool_choice["type"] == "any": - return "required" - elif tool_choice["type"] == "auto": - return "auto" - elif tool_choice["type"] == "tool": - tc_function_param = ChatCompletionToolChoiceFunctionParam( - name=tool_choice.get("name", "") - ) - return ChatCompletionToolChoiceObjectParam( - type="function", function=tc_function_param - ) - else: - raise ValueError( - "Incompatible tool choice param submitted - {}".format(tool_choice) - ) - - def translate_anthropic_tools_to_openai( - self, tools: List[AnthropicMessagesTool] - ) -> List[ChatCompletionToolParam]: - new_tools: List[ChatCompletionToolParam] = [] - for tool in tools: - function_chunk = ChatCompletionToolParamFunctionChunk( - name=tool["name"], - parameters=tool["input_schema"], - ) - if "description" in tool: - function_chunk["description"] = tool["description"] - new_tools.append( - ChatCompletionToolParam(type="function", function=function_chunk) - ) - - return new_tools - - def translate_anthropic_to_openai( - self, anthropic_message_request: AnthropicMessagesRequest - ) -> ChatCompletionRequest: - """ - This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format. - """ - new_messages: List[AllMessageValues] = [] - - ## CONVERT ANTHROPIC MESSAGES TO OPENAI - new_messages = self.translate_anthropic_messages_to_openai( - messages=anthropic_message_request["messages"] - ) - ## ADD SYSTEM MESSAGE TO MESSAGES - if "system" in anthropic_message_request: - new_messages.insert( - 0, - ChatCompletionSystemMessage( - role="system", content=anthropic_message_request["system"] - ), - ) - - new_kwargs: ChatCompletionRequest = { - "model": anthropic_message_request["model"], - "messages": new_messages, - } - ## CONVERT METADATA (user_id) - if "metadata" in anthropic_message_request: - if "user_id" in anthropic_message_request["metadata"]: - new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"] - - # Pass litellm proxy specific metadata - if "litellm_metadata" in anthropic_message_request: - # metadata will be passed to litellm.acompletion(), it's a litellm_param - new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata") - - ## CONVERT TOOL CHOICE - if "tool_choice" in anthropic_message_request: - new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai( - tool_choice=anthropic_message_request["tool_choice"] - ) - ## CONVERT TOOLS - if "tools" in anthropic_message_request: - new_kwargs["tools"] = self.translate_anthropic_tools_to_openai( - tools=anthropic_message_request["tools"] - ) - - translatable_params = self.translatable_anthropic_params() - for k, v in anthropic_message_request.items(): - if k not in translatable_params: # pass remaining params as is - new_kwargs[k] = v # type: ignore - - return new_kwargs - - def _translate_openai_content_to_anthropic( - self, choices: List[Choices] - ) -> List[ - Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse] - ]: - new_content: List[ - Union[ - AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse - ] - ] = [] - for choice in choices: - if ( - choice.message.tool_calls is not None - and len(choice.message.tool_calls) > 0 - ): - for tool_call in choice.message.tool_calls: - new_content.append( - AnthropicResponseContentBlockToolUse( - type="tool_use", - id=tool_call.id, - name=tool_call.function.name or "", - 
input=json.loads(tool_call.function.arguments), - ) - ) - elif choice.message.content is not None: - new_content.append( - AnthropicResponseContentBlockText( - type="text", text=choice.message.content - ) - ) - - return new_content - - def _translate_openai_finish_reason_to_anthropic( - self, openai_finish_reason: str - ) -> AnthropicFinishReason: - if openai_finish_reason == "stop": - return "end_turn" - elif openai_finish_reason == "length": - return "max_tokens" - elif openai_finish_reason == "tool_calls": - return "tool_use" - return "end_turn" - - def translate_openai_response_to_anthropic( - self, response: litellm.ModelResponse - ) -> AnthropicResponse: - ## translate content block - anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore - ## extract finish reason - anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic( - openai_finish_reason=response.choices[0].finish_reason # type: ignore - ) - # extract usage - usage: litellm.Usage = getattr(response, "usage") - anthropic_usage = AnthropicResponseUsageBlock( - input_tokens=usage.prompt_tokens or 0, - output_tokens=usage.completion_tokens or 0, - ) - translated_obj = AnthropicResponse( - id=response.id, - type="message", - role="assistant", - model=response.model or "unknown-model", - stop_sequence=None, - usage=anthropic_usage, - content=anthropic_content, - stop_reason=anthropic_finish_reason, - ) - - return translated_obj - - def _translate_streaming_openai_chunk_to_anthropic( - self, choices: List[OpenAIStreamingChoice] - ) -> Tuple[ - Literal["text_delta", "input_json_delta"], - Union[ContentTextBlockDelta, ContentJsonBlockDelta], - ]: - text: str = "" - partial_json: Optional[str] = None - for choice in choices: - if choice.delta.content is not None: - text += choice.delta.content - elif choice.delta.tool_calls is not None: - partial_json = "" - for tool in choice.delta.tool_calls: - if ( - tool.function is not None - and tool.function.arguments is not None - ): - partial_json += tool.function.arguments - - if partial_json is not None: - return "input_json_delta", ContentJsonBlockDelta( - type="input_json_delta", partial_json=partial_json - ) - else: - return "text_delta", ContentTextBlockDelta(type="text_delta", text=text) - - def translate_streaming_openai_response_to_anthropic( - self, response: litellm.ModelResponse - ) -> Union[ContentBlockDelta, MessageBlockDelta]: - ## base case - final chunk w/ finish reason - if response.choices[0].finish_reason is not None: - delta = MessageDelta( - stop_reason=self._translate_openai_finish_reason_to_anthropic( - response.choices[0].finish_reason - ), - ) - if getattr(response, "usage", None) is not None: - litellm_usage_chunk: Optional[litellm.Usage] = response.usage # type: ignore - elif ( - hasattr(response, "_hidden_params") - and "usage" in response._hidden_params - ): - litellm_usage_chunk = response._hidden_params["usage"] - else: - litellm_usage_chunk = None - if litellm_usage_chunk is not None: - usage_delta = UsageDelta( - input_tokens=litellm_usage_chunk.prompt_tokens or 0, - output_tokens=litellm_usage_chunk.completion_tokens or 0, - ) - else: - usage_delta = UsageDelta(input_tokens=0, output_tokens=0) - return MessageBlockDelta( - type="message_delta", delta=delta, usage=usage_delta - ) - ( - type_of_content, - content_block_delta, - ) = self._translate_streaming_openai_chunk_to_anthropic( - choices=response.choices # type: ignore - ) - return ContentBlockDelta( - type="content_block_delta", - 
index=response.choices[0].index, - delta=content_block_delta, - ) - - # makes headers for API call def validate_environment( api_key, user_headers, model, messages: List[AllMessageValues] @@ -684,8 +139,14 @@ async def make_call( api_base, headers=headers, data=data, stream=True, timeout=timeout ) except httpx.HTTPStatusError as e: + error_headers = getattr(e, "headers", None) + error_response = getattr(e, "response", None) + if error_headers is None and error_response: + error_headers = getattr(error_response, "headers", None) raise AnthropicError( - status_code=e.response.status_code, message=await e.response.aread() + status_code=e.response.status_code, + message=await e.response.aread(), + headers=error_headers, ) except Exception as e: for exception in litellm.LITELLM_EXCEPTION_TYPES: @@ -726,8 +187,14 @@ def make_sync_call( api_base, headers=headers, data=data, stream=True, timeout=timeout ) except httpx.HTTPStatusError as e: + error_headers = getattr(e, "headers", None) + error_response = getattr(e, "response", None) + if error_headers is None and error_response: + error_headers = getattr(error_response, "headers", None) raise AnthropicError( - status_code=e.response.status_code, message=e.response.read() + status_code=e.response.status_code, + message=e.response.read(), + headers=error_headers, ) except Exception as e: for exception in litellm.LITELLM_EXCEPTION_TYPES: @@ -736,7 +203,12 @@ def make_sync_call( raise AnthropicError(status_code=500, message=str(e)) if response.status_code != 200: - raise AnthropicError(status_code=response.status_code, message=response.read()) + response_headers = getattr(response, "headers", None) + raise AnthropicError( + status_code=response.status_code, + message=response.read(), + headers=response_headers, + ) completion_stream = ModelResponseIterator( streaming_response=response.iter_lines(), sync_stream=True @@ -763,7 +235,7 @@ class AnthropicChatCompletion(BaseLLM): response: Union[requests.Response, httpx.Response], model_response: ModelResponse, stream: bool, - logging_obj: litellm.litellm_core_utils.litellm_logging.Logging, + logging_obj: litellm.litellm_core_utils.litellm_logging.Logging, # type: ignore optional_params: dict, api_key: str, data: Union[dict, str], @@ -772,6 +244,14 @@ class AnthropicChatCompletion(BaseLLM): encoding, json_mode: bool, ) -> ModelResponse: + _hidden_params = {} + _response_headers = dict(response.headers) + if _response_headers is not None: + llm_response_headers = { + "{}-{}".format("llm_provider", k): v + for k, v in _response_headers.items() + } + _hidden_params["additional_headers"] = llm_response_headers ## LOGGING logging_obj.post_call( input=messages, @@ -783,14 +263,21 @@ class AnthropicChatCompletion(BaseLLM): ## RESPONSE OBJECT try: completion_response = response.json() - except: + except Exception as e: + response_headers = getattr(response, "headers", None) raise AnthropicError( - message=response.text, status_code=response.status_code + message="Unable to get json response - {}, Original Response: {}".format( + str(e), response.text + ), + status_code=response.status_code, + headers=response_headers, ) if "error" in completion_response: + response_headers = getattr(response, "headers", None) raise AnthropicError( message=str(completion_response["error"]), status_code=response.status_code, + headers=response_headers, ) else: text_content = "" @@ -856,6 +343,8 @@ class AnthropicChatCompletion(BaseLLM): if "cache_read_input_tokens" in _usage: usage["cache_read_input_tokens"] = 
_usage["cache_read_input_tokens"] setattr(model_response, "usage", usage) # type: ignore + + model_response._hidden_params = _hidden_params return model_response async def acompletion_stream_function( @@ -919,9 +408,9 @@ class AnthropicChatCompletion(BaseLLM): litellm_params=None, logger_fn=None, headers={}, - client=None, + client: Optional[AsyncHTTPHandler] = None, ) -> Union[ModelResponse, CustomStreamWrapper]: - async_handler = get_async_httpx_client( + async_handler = client or get_async_httpx_client( llm_provider=litellm.LlmProviders.ANTHROPIC ) @@ -937,7 +426,17 @@ class AnthropicChatCompletion(BaseLLM): original_response=str(e), additional_args={"complete_input_dict": data}, ) - raise e + status_code = getattr(e, "status_code", 500) + error_headers = getattr(e, "headers", None) + error_text = getattr(e, "text", str(e)) + error_response = getattr(e, "response", None) + if error_headers is None and error_response: + error_headers = getattr(error_response, "headers", None) + raise AnthropicError( + message=error_text, + status_code=status_code, + headers=error_headers, + ) return self._process_response( model=model, @@ -977,73 +476,18 @@ class AnthropicChatCompletion(BaseLLM): _is_function_call = False messages = copy.deepcopy(messages) optional_params = copy.deepcopy(optional_params) - if model in custom_prompt_dict: - # check if the model has a registered custom prompt - model_prompt_details = custom_prompt_dict[model] - prompt = custom_prompt( - role_dict=model_prompt_details["roles"], - initial_prompt_value=model_prompt_details["initial_prompt_value"], - final_prompt_value=model_prompt_details["final_prompt_value"], - messages=messages, - ) - else: - # Separate system prompt from rest of message - anthropic_system_message_list = AnthropicConfig().translate_system_message( - messages=messages - ) - # Handling anthropic API Prompt Caching - if len(anthropic_system_message_list) > 0: - optional_params["system"] = anthropic_system_message_list - # Format rest of message according to anthropic guidelines - try: - messages = prompt_factory( - model=model, messages=messages, custom_llm_provider="anthropic" - ) - except Exception as e: - raise AnthropicError( - status_code=400, - message="{}\nReceived Messages={}".format(str(e), messages), - ) # don't use verbose_logger.exception, if exception is raised - - ## Load Config - config = litellm.AnthropicConfig.get_config() - for k, v in config.items(): - if ( - k not in optional_params - ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in - optional_params[k] = v - - ## Handle Tool Calling - if "tools" in optional_params: - _is_function_call = True - if "anthropic-beta" not in headers: - # default to v1 of "anthropic-beta" - headers["anthropic-beta"] = "tools-2024-05-16" - - anthropic_tools = [] - for tool in optional_params["tools"]: - if "input_schema" in tool: # assume in anthropic format - anthropic_tools.append(tool) - else: # assume openai tool call - new_tool = tool["function"] - new_tool["input_schema"] = new_tool.pop("parameters") # rename key - if "cache_control" in tool: - new_tool["cache_control"] = tool["cache_control"] - anthropic_tools.append(new_tool) - - optional_params["tools"] = anthropic_tools - stream = optional_params.pop("stream", None) - is_vertex_request: bool = optional_params.pop("is_vertex_request", False) json_mode: bool = optional_params.pop("json_mode", False) + is_vertex_request: bool = optional_params.pop("is_vertex_request", False) - data = { - "messages": 
messages, - **optional_params, - } - - if is_vertex_request is False: - data["model"] = model + data = AnthropicConfig()._transform_request( + model=model, + messages=messages, + optional_params=optional_params, + headers=headers, + _is_function_call=_is_function_call, + is_vertex_request=is_vertex_request, + ) ## LOGGING logging_obj.pre_call( @@ -1136,12 +580,25 @@ class AnthropicChatCompletion(BaseLLM): client = HTTPHandler(timeout=timeout) # type: ignore else: client = client - response = client.post( - api_base, headers=headers, data=json.dumps(data), timeout=timeout - ) - if response.status_code != 200: + + try: + response = client.post( + api_base, + headers=headers, + data=json.dumps(data), + timeout=timeout, + ) + except Exception as e: + status_code = getattr(e, "status_code", 500) + error_headers = getattr(e, "headers", None) + error_text = getattr(e, "text", str(e)) + error_response = getattr(e, "response", None) + if error_headers is None and error_response: + error_headers = getattr(error_response, "headers", None) raise AnthropicError( - status_code=response.status_code, message=response.text + message=error_text, + status_code=status_code, + headers=error_headers, ) return self._process_response( @@ -1151,7 +608,7 @@ class AnthropicChatCompletion(BaseLLM): stream=stream, logging_obj=logging_obj, api_key=api_key, - data=data, + data=data, # type: ignore messages=messages, print_verbose=print_verbose, optional_params=optional_params, @@ -1192,7 +649,7 @@ class ModelResponseIterator: return False def _handle_usage( - self, anthropic_usage_chunk: dict + self, anthropic_usage_chunk: Union[dict, UsageDelta] ) -> AnthropicChatCompletionUsageBlock: special_fields = ["input_tokens", "output_tokens"] @@ -1203,15 +660,19 @@ class ModelResponseIterator: + anthropic_usage_chunk.get("output_tokens", 0), ) - if "cache_creation_input_tokens" in anthropic_usage_chunk: - usage_block["cache_creation_input_tokens"] = anthropic_usage_chunk[ - "cache_creation_input_tokens" - ] + cache_creation_input_tokens = anthropic_usage_chunk.get( + "cache_creation_input_tokens" + ) + if cache_creation_input_tokens is not None and isinstance( + cache_creation_input_tokens, int + ): + usage_block["cache_creation_input_tokens"] = cache_creation_input_tokens - if "cache_read_input_tokens" in anthropic_usage_chunk: - usage_block["cache_read_input_tokens"] = anthropic_usage_chunk[ - "cache_read_input_tokens" - ] + cache_read_input_tokens = anthropic_usage_chunk.get("cache_read_input_tokens") + if cache_read_input_tokens is not None and isinstance( + cache_read_input_tokens, int + ): + usage_block["cache_read_input_tokens"] = cache_read_input_tokens return usage_block @@ -1313,9 +774,10 @@ class ModelResponseIterator: } """ message_start_block = MessageStartBlock(**chunk) # type: ignore - usage = self._handle_usage( - anthropic_usage_chunk=message_start_block["message"]["usage"] - ) + if "usage" in message_start_block["message"]: + usage = self._handle_usage( + anthropic_usage_chunk=message_start_block["message"]["usage"] + ) elif type_chunk == "error": """ {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"} } diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py new file mode 100644 index 000000000..2ca22db3b --- /dev/null +++ b/litellm/llms/anthropic/chat/transformation.py @@ -0,0 +1,289 @@ +import types +from typing import List, Literal, Optional, Tuple, Union + +import litellm +from 
litellm.llms.prompt_templates.factory import anthropic_messages_pt +from litellm.types.llms.anthropic import ( + AnthropicMessageRequestBase, + AnthropicMessagesRequest, + AnthropicMessagesToolChoice, + AnthropicSystemMessageContent, +) +from litellm.types.llms.openai import AllMessageValues, ChatCompletionSystemMessage +from litellm.utils import has_tool_call_blocks + +from ..common_utils import AnthropicError + + +class AnthropicConfig: + """ + Reference: https://docs.anthropic.com/claude/reference/messages_post + + to pass metadata to anthropic, it's {"user_id": "any-relevant-information"} + """ + + max_tokens: Optional[int] = ( + 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) + ) + stop_sequences: Optional[list] = None + temperature: Optional[int] = None + top_p: Optional[int] = None + top_k: Optional[int] = None + metadata: Optional[dict] = None + system: Optional[str] = None + + def __init__( + self, + max_tokens: Optional[ + int + ] = 4096, # You can pass in a value yourself or use the default value 4096 + stop_sequences: Optional[list] = None, + temperature: Optional[int] = None, + top_p: Optional[int] = None, + top_k: Optional[int] = None, + metadata: Optional[dict] = None, + system: Optional[str] = None, + ) -> None: + locals_ = locals() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params(self): + return [ + "stream", + "stop", + "temperature", + "top_p", + "max_tokens", + "max_completion_tokens", + "tools", + "tool_choice", + "extra_headers", + ] + + def get_cache_control_headers(self) -> dict: + return { + "anthropic-version": "2023-06-01", + "anthropic-beta": "prompt-caching-2024-07-31", + } + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + messages: Optional[List[AllMessageValues]] = None, + ): + for param, value in non_default_params.items(): + if param == "max_tokens": + optional_params["max_tokens"] = value + if param == "max_completion_tokens": + optional_params["max_tokens"] = value + if param == "tools": + optional_params["tools"] = value + if param == "tool_choice": + _tool_choice: Optional[AnthropicMessagesToolChoice] = None + if value == "auto": + _tool_choice = {"type": "auto"} + elif value == "required": + _tool_choice = {"type": "any"} + elif isinstance(value, dict): + _tool_choice = {"type": "tool", "name": value["function"]["name"]} + + if _tool_choice is not None: + optional_params["tool_choice"] = _tool_choice + if param == "stream" and value is True: + optional_params["stream"] = value + if param == "stop": + if isinstance(value, str): + if ( + value == "\n" + ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences + continue + value = [value] + elif isinstance(value, list): + new_v = [] + for v in value: + if ( + v == "\n" + ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences + continue + new_v.append(v) + if len(new_v) > 0: + value = new_v + else: + continue + optional_params["stop_sequences"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "top_p": + optional_params["top_p"] = 
value + + ## VALIDATE REQUEST + """ + Anthropic doesn't support tool calling without `tools=` param specified. + """ + if ( + "tools" not in non_default_params + and messages is not None + and has_tool_call_blocks(messages) + ): + raise litellm.UnsupportedParamsError( + message="Anthropic doesn't support tool calling without `tools=` param specified. Pass `tools=` param to enable tool calling.", + model="", + llm_provider="anthropic", + ) + + return optional_params + + def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool: + """ + Return if {"cache_control": ..} in message content block + + Used to check if anthropic prompt caching headers need to be set. + """ + for message in messages: + _message_content = message.get("content") + if _message_content is not None and isinstance(_message_content, list): + for content in _message_content: + if "cache_control" in content: + return True + + return False + + def translate_system_message( + self, messages: List[AllMessageValues] + ) -> List[AnthropicSystemMessageContent]: + """ + Translate system message to anthropic format. + + Removes system message from the original list and returns a new list of anthropic system message content. + """ + system_prompt_indices = [] + anthropic_system_message_list: List[AnthropicSystemMessageContent] = [] + for idx, message in enumerate(messages): + if message["role"] == "system": + valid_content: bool = False + system_message_block = ChatCompletionSystemMessage(**message) + if isinstance(system_message_block["content"], str): + anthropic_system_message_content = AnthropicSystemMessageContent( + type="text", + text=system_message_block["content"], + ) + if "cache_control" in system_message_block: + anthropic_system_message_content["cache_control"] = ( + system_message_block["cache_control"] + ) + anthropic_system_message_list.append( + anthropic_system_message_content + ) + valid_content = True + elif isinstance(message["content"], list): + for _content in message["content"]: + anthropic_system_message_content = ( + AnthropicSystemMessageContent( + type=_content.get("type"), + text=_content.get("text"), + ) + ) + if "cache_control" in _content: + anthropic_system_message_content["cache_control"] = ( + _content["cache_control"] + ) + + anthropic_system_message_list.append( + anthropic_system_message_content + ) + valid_content = True + + if valid_content: + system_prompt_indices.append(idx) + if len(system_prompt_indices) > 0: + for idx in reversed(system_prompt_indices): + messages.pop(idx) + + return anthropic_system_message_list + + def _transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + headers: dict, + _is_function_call: bool, + is_vertex_request: bool, + ) -> dict: + """ + Translate messages to anthropic format. 
+ """ + # Separate system prompt from rest of message + anthropic_system_message_list = self.translate_system_message(messages=messages) + # Handling anthropic API Prompt Caching + if len(anthropic_system_message_list) > 0: + optional_params["system"] = anthropic_system_message_list + # Format rest of message according to anthropic guidelines + try: + anthropic_messages = anthropic_messages_pt( + model=model, + messages=messages, + llm_provider="anthropic", + ) + except Exception as e: + raise AnthropicError( + status_code=400, + message="{}\nReceived Messages={}".format(str(e), messages), + ) # don't use verbose_logger.exception, if exception is raised + + ## Load Config + config = litellm.AnthropicConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## Handle Tool Calling + if "tools" in optional_params: + _is_function_call = True + if "anthropic-beta" not in headers: + # default to v1 of "anthropic-beta" + headers["anthropic-beta"] = "tools-2024-05-16" + + anthropic_tools = [] + for tool in optional_params["tools"]: + if "input_schema" in tool: # assume in anthropic format + anthropic_tools.append(tool) + else: # assume openai tool call + new_tool = tool["function"] + new_tool["input_schema"] = new_tool.pop("parameters") # rename key + if "cache_control" in tool: + new_tool["cache_control"] = tool["cache_control"] + anthropic_tools.append(new_tool) + + optional_params["tools"] = anthropic_tools + + data = { + "messages": anthropic_messages, + **optional_params, + } + if not is_vertex_request: + data["model"] = model + return data diff --git a/litellm/llms/anthropic/common_utils.py b/litellm/llms/anthropic/common_utils.py new file mode 100644 index 000000000..f7cba3e4a --- /dev/null +++ b/litellm/llms/anthropic/common_utils.py @@ -0,0 +1,26 @@ +""" +This file contains common utils for anthropic calls. 
+""" + +from typing import Optional + +import httpx + + +class AnthropicError(Exception): + def __init__( + self, + status_code: int, + message, + headers: Optional[httpx.Headers] = None, + ): + self.status_code = status_code + self.message: str = message + self.headers = headers + self.request = httpx.Request( + method="POST", url="https://api.anthropic.com/v1/messages" + ) + self.response = httpx.Response(status_code=status_code, request=self.request) + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs diff --git a/litellm/llms/anthropic/experimental_pass_through/transformation.py b/litellm/llms/anthropic/experimental_pass_through/transformation.py new file mode 100644 index 000000000..2a82594ba --- /dev/null +++ b/litellm/llms/anthropic/experimental_pass_through/transformation.py @@ -0,0 +1,425 @@ +import json +import types +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice + +import litellm +from litellm.types.llms.anthropic import ( + AnthopicMessagesAssistantMessageParam, + AnthropicChatCompletionUsageBlock, + AnthropicFinishReason, + AnthropicMessagesRequest, + AnthropicMessagesTool, + AnthropicMessagesToolChoice, + AnthropicMessagesUserMessageParam, + AnthropicResponse, + AnthropicResponseContentBlockText, + AnthropicResponseContentBlockToolUse, + AnthropicResponseUsageBlock, + AnthropicSystemMessageContent, + ContentBlockDelta, + ContentBlockStart, + ContentBlockStop, + ContentJsonBlockDelta, + ContentTextBlockDelta, + MessageBlockDelta, + MessageDelta, + MessageStartBlock, + UsageDelta, +) +from litellm.types.llms.openai import ( + AllMessageValues, + ChatCompletionAssistantMessage, + ChatCompletionAssistantToolCall, + ChatCompletionImageObject, + ChatCompletionImageUrlObject, + ChatCompletionRequest, + ChatCompletionResponseMessage, + ChatCompletionSystemMessage, + ChatCompletionTextObject, + ChatCompletionToolCallChunk, + ChatCompletionToolCallFunctionChunk, + ChatCompletionToolChoiceFunctionParam, + ChatCompletionToolChoiceObjectParam, + ChatCompletionToolChoiceValues, + ChatCompletionToolMessage, + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, + ChatCompletionUsageBlock, + ChatCompletionUserMessage, + OpenAIMessageContent, +) +from litellm.types.utils import Choices, GenericStreamingChunk +from litellm.utils import CustomStreamWrapper, ModelResponse, Usage + +from ...base import BaseLLM +from ...prompt_templates.factory import ( + anthropic_messages_pt, + custom_prompt, + prompt_factory, +) + + +class AnthropicExperimentalPassThroughConfig: + def __init__(self): + pass + + ### FOR [BETA] `/v1/messages` endpoint support + + def translatable_anthropic_params(self) -> List: + """ + Which anthropic params, we need to translate to the openai format. 
+ """ + return ["messages", "metadata", "system", "tool_choice", "tools"] + + def translate_anthropic_messages_to_openai( + self, + messages: List[ + Union[ + AnthropicMessagesUserMessageParam, + AnthopicMessagesAssistantMessageParam, + ] + ], + ) -> List: + new_messages: List[AllMessageValues] = [] + for m in messages: + user_message: Optional[ChatCompletionUserMessage] = None + tool_message_list: List[ChatCompletionToolMessage] = [] + new_user_content_list: List[ + Union[ChatCompletionTextObject, ChatCompletionImageObject] + ] = [] + ## USER MESSAGE ## + if m["role"] == "user": + ## translate user message + message_content = m.get("content") + if message_content and isinstance(message_content, str): + user_message = ChatCompletionUserMessage( + role="user", content=message_content + ) + elif message_content and isinstance(message_content, list): + for content in message_content: + if content["type"] == "text": + text_obj = ChatCompletionTextObject( + type="text", text=content["text"] + ) + new_user_content_list.append(text_obj) + elif content["type"] == "image": + image_url = ChatCompletionImageUrlObject( + url=f"data:{content['type']};base64,{content['source']}" + ) + image_obj = ChatCompletionImageObject( + type="image_url", image_url=image_url + ) + + new_user_content_list.append(image_obj) + elif content["type"] == "tool_result": + if "content" not in content: + tool_result = ChatCompletionToolMessage( + role="tool", + tool_call_id=content["tool_use_id"], + content="", + ) + tool_message_list.append(tool_result) + elif isinstance(content["content"], str): + tool_result = ChatCompletionToolMessage( + role="tool", + tool_call_id=content["tool_use_id"], + content=content["content"], + ) + tool_message_list.append(tool_result) + elif isinstance(content["content"], list): + for c in content["content"]: + if c["type"] == "text": + tool_result = ChatCompletionToolMessage( + role="tool", + tool_call_id=content["tool_use_id"], + content=c["text"], + ) + tool_message_list.append(tool_result) + elif c["type"] == "image": + image_str = ( + f"data:{c['type']};base64,{c['source']}" + ) + tool_result = ChatCompletionToolMessage( + role="tool", + tool_call_id=content["tool_use_id"], + content=image_str, + ) + tool_message_list.append(tool_result) + + if user_message is not None: + new_messages.append(user_message) + + if len(new_user_content_list) > 0: + new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore + + if len(tool_message_list) > 0: + new_messages.extend(tool_message_list) + + ## ASSISTANT MESSAGE ## + assistant_message_str: Optional[str] = None + tool_calls: List[ChatCompletionAssistantToolCall] = [] + if m["role"] == "assistant": + if isinstance(m["content"], str): + assistant_message_str = m["content"] + elif isinstance(m["content"], list): + for content in m["content"]: + if content["type"] == "text": + if assistant_message_str is None: + assistant_message_str = content["text"] + else: + assistant_message_str += content["text"] + elif content["type"] == "tool_use": + function_chunk = ChatCompletionToolCallFunctionChunk( + name=content["name"], + arguments=json.dumps(content["input"]), + ) + + tool_calls.append( + ChatCompletionAssistantToolCall( + id=content["id"], + type="function", + function=function_chunk, + ) + ) + + if assistant_message_str is not None or len(tool_calls) > 0: + assistant_message = ChatCompletionAssistantMessage( + role="assistant", + content=assistant_message_str, + ) + if len(tool_calls) > 0: + assistant_message["tool_calls"] = 
tool_calls + new_messages.append(assistant_message) + + return new_messages + + def translate_anthropic_tool_choice_to_openai( + self, tool_choice: AnthropicMessagesToolChoice + ) -> ChatCompletionToolChoiceValues: + if tool_choice["type"] == "any": + return "required" + elif tool_choice["type"] == "auto": + return "auto" + elif tool_choice["type"] == "tool": + tc_function_param = ChatCompletionToolChoiceFunctionParam( + name=tool_choice.get("name", "") + ) + return ChatCompletionToolChoiceObjectParam( + type="function", function=tc_function_param + ) + else: + raise ValueError( + "Incompatible tool choice param submitted - {}".format(tool_choice) + ) + + def translate_anthropic_tools_to_openai( + self, tools: List[AnthropicMessagesTool] + ) -> List[ChatCompletionToolParam]: + new_tools: List[ChatCompletionToolParam] = [] + for tool in tools: + function_chunk = ChatCompletionToolParamFunctionChunk( + name=tool["name"], + parameters=tool["input_schema"], + ) + if "description" in tool: + function_chunk["description"] = tool["description"] + new_tools.append( + ChatCompletionToolParam(type="function", function=function_chunk) + ) + + return new_tools + + def translate_anthropic_to_openai( + self, anthropic_message_request: AnthropicMessagesRequest + ) -> ChatCompletionRequest: + """ + This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format. + """ + new_messages: List[AllMessageValues] = [] + + ## CONVERT ANTHROPIC MESSAGES TO OPENAI + new_messages = self.translate_anthropic_messages_to_openai( + messages=anthropic_message_request["messages"] + ) + ## ADD SYSTEM MESSAGE TO MESSAGES + if "system" in anthropic_message_request: + new_messages.insert( + 0, + ChatCompletionSystemMessage( + role="system", content=anthropic_message_request["system"] + ), + ) + + new_kwargs: ChatCompletionRequest = { + "model": anthropic_message_request["model"], + "messages": new_messages, + } + ## CONVERT METADATA (user_id) + if "metadata" in anthropic_message_request: + if "user_id" in anthropic_message_request["metadata"]: + new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"] + + # Pass litellm proxy specific metadata + if "litellm_metadata" in anthropic_message_request: + # metadata will be passed to litellm.acompletion(), it's a litellm_param + new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata") + + ## CONVERT TOOL CHOICE + if "tool_choice" in anthropic_message_request: + new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai( + tool_choice=anthropic_message_request["tool_choice"] + ) + ## CONVERT TOOLS + if "tools" in anthropic_message_request: + new_kwargs["tools"] = self.translate_anthropic_tools_to_openai( + tools=anthropic_message_request["tools"] + ) + + translatable_params = self.translatable_anthropic_params() + for k, v in anthropic_message_request.items(): + if k not in translatable_params: # pass remaining params as is + new_kwargs[k] = v # type: ignore + + return new_kwargs + + def _translate_openai_content_to_anthropic( + self, choices: List[Choices] + ) -> List[ + Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse] + ]: + new_content: List[ + Union[ + AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse + ] + ] = [] + for choice in choices: + if ( + choice.message.tool_calls is not None + and len(choice.message.tool_calls) > 0 + ): + for tool_call in choice.message.tool_calls: + new_content.append( + 
AnthropicResponseContentBlockToolUse( + type="tool_use", + id=tool_call.id, + name=tool_call.function.name or "", + input=json.loads(tool_call.function.arguments), + ) + ) + elif choice.message.content is not None: + new_content.append( + AnthropicResponseContentBlockText( + type="text", text=choice.message.content + ) + ) + + return new_content + + def _translate_openai_finish_reason_to_anthropic( + self, openai_finish_reason: str + ) -> AnthropicFinishReason: + if openai_finish_reason == "stop": + return "end_turn" + elif openai_finish_reason == "length": + return "max_tokens" + elif openai_finish_reason == "tool_calls": + return "tool_use" + return "end_turn" + + def translate_openai_response_to_anthropic( + self, response: litellm.ModelResponse + ) -> AnthropicResponse: + ## translate content block + anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore + ## extract finish reason + anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic( + openai_finish_reason=response.choices[0].finish_reason # type: ignore + ) + # extract usage + usage: litellm.Usage = getattr(response, "usage") + anthropic_usage = AnthropicResponseUsageBlock( + input_tokens=usage.prompt_tokens or 0, + output_tokens=usage.completion_tokens or 0, + ) + translated_obj = AnthropicResponse( + id=response.id, + type="message", + role="assistant", + model=response.model or "unknown-model", + stop_sequence=None, + usage=anthropic_usage, + content=anthropic_content, + stop_reason=anthropic_finish_reason, + ) + + return translated_obj + + def _translate_streaming_openai_chunk_to_anthropic( + self, choices: List[OpenAIStreamingChoice] + ) -> Tuple[ + Literal["text_delta", "input_json_delta"], + Union[ContentTextBlockDelta, ContentJsonBlockDelta], + ]: + text: str = "" + partial_json: Optional[str] = None + for choice in choices: + if choice.delta.content is not None: + text += choice.delta.content + elif choice.delta.tool_calls is not None: + partial_json = "" + for tool in choice.delta.tool_calls: + if ( + tool.function is not None + and tool.function.arguments is not None + ): + partial_json += tool.function.arguments + + if partial_json is not None: + return "input_json_delta", ContentJsonBlockDelta( + type="input_json_delta", partial_json=partial_json + ) + else: + return "text_delta", ContentTextBlockDelta(type="text_delta", text=text) + + def translate_streaming_openai_response_to_anthropic( + self, response: litellm.ModelResponse + ) -> Union[ContentBlockDelta, MessageBlockDelta]: + ## base case - final chunk w/ finish reason + if response.choices[0].finish_reason is not None: + delta = MessageDelta( + stop_reason=self._translate_openai_finish_reason_to_anthropic( + response.choices[0].finish_reason + ), + ) + if getattr(response, "usage", None) is not None: + litellm_usage_chunk: Optional[litellm.Usage] = response.usage # type: ignore + elif ( + hasattr(response, "_hidden_params") + and "usage" in response._hidden_params + ): + litellm_usage_chunk = response._hidden_params["usage"] + else: + litellm_usage_chunk = None + if litellm_usage_chunk is not None: + usage_delta = UsageDelta( + input_tokens=litellm_usage_chunk.prompt_tokens or 0, + output_tokens=litellm_usage_chunk.completion_tokens or 0, + ) + else: + usage_delta = UsageDelta(input_tokens=0, output_tokens=0) + return MessageBlockDelta( + type="message_delta", delta=delta, usage=usage_delta + ) + ( + type_of_content, + content_block_delta, + ) = 
self._translate_streaming_openai_chunk_to_anthropic( + choices=response.choices # type: ignore + ) + return ContentBlockDelta( + type="content_block_delta", + index=response.choices[0].index, + delta=content_block_delta, + ) diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index 8229f6a58..77946bfb6 100644 --- a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -22,7 +22,7 @@ from litellm.types.llms.openai import ( ChatCompletionToolParamFunctionChunk, ) from litellm.types.utils import ModelResponse, Usage -from litellm.utils import CustomStreamWrapper +from litellm.utils import CustomStreamWrapper, has_tool_call_blocks from ...prompt_templates.factory import _bedrock_converse_messages_pt, _bedrock_tools_pt from ..common_utils import BedrockError, get_bedrock_tool_name @@ -136,6 +136,7 @@ class AmazonConverseConfig: non_default_params: dict, optional_params: dict, drop_params: bool, + messages: Optional[List[AllMessageValues]] = None, ) -> dict: for param, value in non_default_params.items(): if param == "response_format": @@ -202,6 +203,21 @@ class AmazonConverseConfig: ) if _tool_choice_value is not None: optional_params["tool_choice"] = _tool_choice_value + + ## VALIDATE REQUEST + """ + Bedrock doesn't support tool calling without `tools=` param specified. + """ + if ( + "tools" not in non_default_params + and messages is not None + and has_tool_call_blocks(messages) + ): + raise litellm.UnsupportedParamsError( + message="Anthropic doesn't support tool calling without `tools=` param specified. Pass `tools=` param to enable tool calling.", + model="", + llm_provider="anthropic", + ) return optional_params def _transform_request( diff --git a/litellm/llms/groq/chat/handler.py b/litellm/llms/groq/chat/handler.py new file mode 100644 index 000000000..f4a16abc8 --- /dev/null +++ b/litellm/llms/groq/chat/handler.py @@ -0,0 +1,60 @@ +""" +Handles the chat completion request for groq +""" + +from typing import Any, Callable, Optional, Union + +from httpx._config import Timeout + +from litellm.utils import ModelResponse + +from ...groq.chat.transformation import GroqChatConfig +from ...OpenAI.openai import OpenAIChatCompletion + + +class GroqChatCompletion(OpenAIChatCompletion): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def completion( + self, + model_response: ModelResponse, + timeout: Union[float, Timeout], + optional_params: dict, + logging_obj: Any, + model: Optional[str] = None, + messages: Optional[list] = None, + print_verbose: Optional[Callable[..., Any]] = None, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + acompletion: bool = False, + litellm_params=None, + logger_fn=None, + headers: Optional[dict] = None, + custom_prompt_dict: dict = {}, + client=None, + organization: Optional[str] = None, + custom_llm_provider: Optional[str] = None, + drop_params: Optional[bool] = None, + ): + messages = GroqChatConfig()._transform_messages(messages) # type: ignore + return super().completion( + model_response, + timeout, + optional_params, + logging_obj, + model, + messages, + print_verbose, + api_key, + api_base, + acompletion, + litellm_params, + logger_fn, + headers, + custom_prompt_dict, + client, + organization, + custom_llm_provider, + drop_params, + ) diff --git a/litellm/llms/groq/chat/transformation.py b/litellm/llms/groq/chat/transformation.py new file mode 100644 index 000000000..c683130ef --- /dev/null +++ 
b/litellm/llms/groq/chat/transformation.py @@ -0,0 +1,88 @@ +""" +Translate from OpenAI's `/v1/chat/completions` to Groq's `/v1/chat/completions` +""" + +import types +from typing import List, Optional, Union + +from pydantic import BaseModel + +import litellm +from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage + +from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig + + +class GroqChatConfig(OpenAIGPTConfig): + + frequency_penalty: Optional[int] = None + function_call: Optional[Union[str, dict]] = None + functions: Optional[list] = None + logit_bias: Optional[dict] = None + max_tokens: Optional[int] = None + n: Optional[int] = None + presence_penalty: Optional[int] = None + stop: Optional[Union[str, list]] = None + temperature: Optional[int] = None + top_p: Optional[int] = None + response_format: Optional[dict] = None + tools: Optional[list] = None + tool_choice: Optional[Union[str, dict]] = None + + def __init__( + self, + frequency_penalty: Optional[int] = None, + function_call: Optional[Union[str, dict]] = None, + functions: Optional[list] = None, + logit_bias: Optional[dict] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + presence_penalty: Optional[int] = None, + stop: Optional[Union[str, list]] = None, + temperature: Optional[int] = None, + top_p: Optional[int] = None, + response_format: Optional[dict] = None, + tools: Optional[list] = None, + tool_choice: Optional[Union[str, dict]] = None, + ) -> None: + locals_ = locals().copy() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def _transform_messages(self, messages: List[AllMessageValues]) -> List: + for idx, message in enumerate(messages): + """ + 1. 
Don't pass 'null' function_call assistant message to groq - https://github.com/BerriAI/litellm/issues/5839 + """ + if isinstance(message, BaseModel): + _message = message.model_dump() + else: + _message = message + assistant_message = _message.get("role") == "assistant" + if assistant_message: + new_message = ChatCompletionAssistantMessage(role="assistant") + for k, v in _message.items(): + if v is not None: + new_message[k] = v # type: ignore + messages[idx] = new_message + + return messages diff --git a/litellm/llms/groq/stt/transformation.py b/litellm/llms/groq/stt/transformation.py new file mode 100644 index 000000000..c4dbd8d0c --- /dev/null +++ b/litellm/llms/groq/stt/transformation.py @@ -0,0 +1,101 @@ +""" +Translate from OpenAI's `/v1/audio/transcriptions` to Groq's `/v1/audio/transcriptions` +""" + +import types +from typing import List, Optional, Union + +import litellm + + +class GroqSTTConfig: + + frequency_penalty: Optional[int] = None + function_call: Optional[Union[str, dict]] = None + functions: Optional[list] = None + logit_bias: Optional[dict] = None + max_tokens: Optional[int] = None + n: Optional[int] = None + presence_penalty: Optional[int] = None + stop: Optional[Union[str, list]] = None + temperature: Optional[int] = None + top_p: Optional[int] = None + response_format: Optional[dict] = None + tools: Optional[list] = None + tool_choice: Optional[Union[str, dict]] = None + + def __init__( + self, + frequency_penalty: Optional[int] = None, + function_call: Optional[Union[str, dict]] = None, + functions: Optional[list] = None, + logit_bias: Optional[dict] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + presence_penalty: Optional[int] = None, + stop: Optional[Union[str, list]] = None, + temperature: Optional[int] = None, + top_p: Optional[int] = None, + response_format: Optional[dict] = None, + tools: Optional[list] = None, + tool_choice: Optional[Union[str, dict]] = None, + ) -> None: + locals_ = locals().copy() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params_stt(self): + return [ + "prompt", + "response_format", + "temperature", + "language", + ] + + def get_supported_openai_response_formats_stt(self) -> List[str]: + return ["json", "verbose_json", "text"] + + def map_openai_params_stt( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + response_formats = self.get_supported_openai_response_formats_stt() + for param, value in non_default_params.items(): + if param == "response_format": + if value in response_formats: + optional_params[param] = value + else: + if litellm.drop_params is True or drop_params is True: + pass + else: + raise litellm.utils.UnsupportedParamsError( + message="Groq doesn't support response_format={}. 
To drop unsupported openai params from the call, set `litellm.drop_params = True`".format( + value + ), + status_code=400, + ) + else: + optional_params[param] = value + return optional_params diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index ecb11e1c9..b67a3c433 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -276,7 +276,7 @@ def completion( from anthropic import AnthropicVertex - from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.anthropic.chat.handler import AnthropicChatCompletion from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( VertexLLM, ) @@ -367,7 +367,7 @@ async def async_completion( if client is None: vertex_ai_client = AsyncAnthropicVertex( - project_id=vertex_project, region=vertex_location, access_token=access_token + project_id=vertex_project, region=vertex_location, access_token=access_token # type: ignore ) else: vertex_ai_client = client @@ -438,7 +438,7 @@ async def async_streaming( if client is None: vertex_ai_client = AsyncAnthropicVertex( - project_id=vertex_project, region=vertex_location, access_token=access_token + project_id=vertex_project, region=vertex_location, access_token=access_token # type: ignore ) else: vertex_ai_client = client diff --git a/litellm/main.py b/litellm/main.py index ff9ca81c1..c681c3b6e 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -96,6 +96,7 @@ from .llms.cohere import completion as cohere_completion # type: ignore from .llms.cohere import embed as cohere_embed from .llms.custom_llm import CustomLLM, custom_chat_llm_router from .llms.databricks.chat import DatabricksChatCompletion +from .llms.groq.chat.handler import GroqChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription from .llms.OpenAI.chat.o1_handler import OpenAIO1ChatCompletion @@ -168,6 +169,7 @@ openai_text_completions = OpenAITextCompletion() openai_o1_chat_completions = OpenAIO1ChatCompletion() openai_audio_transcriptions = OpenAIAudioTranscription() databricks_chat_completions = DatabricksChatCompletion() +groq_chat_completions = GroqChatCompletion() azure_ai_chat_completions = AzureAIChatCompletion() azure_ai_embedding = AzureAIEmbedding() anthropic_chat_completions = AnthropicChatCompletion() @@ -958,6 +960,7 @@ def completion( extra_headers=extra_headers, api_version=api_version, parallel_tool_calls=parallel_tool_calls, + messages=messages, **non_default_params, ) @@ -1318,13 +1321,56 @@ def completion( additional_args={"headers": headers}, ) response = _response + elif custom_llm_provider == "groq": + api_base = ( + api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there + or litellm.api_base + or get_secret("GROQ_API_BASE") + or "https://api.groq.com/openai/v1" + ) + # set API KEY + api_key = ( + api_key + or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there + or litellm.groq_key + or get_secret("GROQ_API_KEY") + ) + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.GroqChatConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > openai_config(top_k=3) <- 
allows for dynamic variables to be passed in + optional_params[k] = v + + response = groq_chat_completions.completion( + model=model, + messages=messages, + headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, # type: ignore + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + organization=organization, + custom_llm_provider=custom_llm_provider, + ) elif ( model in litellm.open_ai_chat_completion_models or custom_llm_provider == "custom_openai" or custom_llm_provider == "deepinfra" or custom_llm_provider == "perplexity" - or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" or custom_llm_provider == "cerebras" or custom_llm_provider == "sambanova" @@ -1431,6 +1477,7 @@ def completion( original_response=response, additional_args={"headers": headers}, ) + elif ( "replicate" in model or custom_llm_provider == "replicate" @@ -2933,6 +2980,7 @@ def batch_completion( deployment_id=None, request_timeout: Optional[int] = None, timeout: Optional[int] = 600, + max_workers:Optional[int]= 100, # Optional liteLLM function params **kwargs, ): @@ -2956,6 +3004,7 @@ def batch_completion( user (str, optional): The user string for generating completions. Defaults to "". deployment_id (optional): The deployment ID for generating completions. Defaults to None. request_timeout (int, optional): The request timeout for generating completions. Defaults to None. + max_workers (int,optional): The maximum number of threads to use for parallel processing. Returns: list: A list of completion results. 
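
A minimal usage sketch (not part of the patch) for the two changes above: the dedicated Groq routing branch, which resolves credentials from `litellm.groq_key` / `GROQ_API_KEY`, and the new `max_workers` parameter on `batch_completion`. The API key value below is a placeholder, and the `groq/llama3-8b-8192` model name is the one used in this patch's tests.

```
# Sketch, assuming a valid Groq key is available; the key string here is a placeholder.
import litellm

litellm.groq_key = "gsk-placeholder"  # or export GROQ_API_KEY instead

responses = litellm.batch_completion(
    model="groq/llama3-8b-8192",  # routed via the new groq branch in completion()
    messages=[[{"role": "user", "content": f"Say hello #{i}"}] for i in range(5)],
    max_workers=10,  # new param; defaults to 100, which was previously hard-coded
)
for r in responses:
    print(r.choices[0].message.content)
```
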
@@ -3001,7 +3050,7 @@ def batch_completion( for i in range(0, len(lst), n): yield lst[i : i + n] - with ThreadPoolExecutor(max_workers=100) as executor: + with ThreadPoolExecutor(max_workers=max_workers) as executor: for sub_batch in chunks(batch_messages, 100): for message_list in sub_batch: kwargs_modified = args.copy() diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 48b2a9322..e698fc5ba 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1173,6 +1173,18 @@ "supports_function_calling": true, "supports_assistant_prefill": true }, + "mistral/pixtral-12b-2409": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.00000015, + "output_cost_per_token": 0.00000015, + "litellm_provider": "mistral", + "mode": "chat", + "supports_function_calling": true, + "supports_assistant_prefill": true, + "supports_vision": true + }, "mistral/open-mistral-7b": { "max_tokens": 8191, "max_input_tokens": 32000, diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index d75440337..7764cf4e6 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -760,7 +760,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): return _user_id_rate_limits.model_dump() except Exception as e: - verbose_proxy_logger.exception( + verbose_proxy_logger.debug( "Parallel Request Limiter: Error getting user object", str(e) ) return None diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index 24481e9c6..a36f42187 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -389,6 +389,9 @@ async def add_litellm_data_to_request( user_api_key_dict=user_api_key_dict, ) + verbose_proxy_logger.debug( + f"[PROXY]returned data from litellm_pre_call_utils: {data}" + ) return data diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 69dc730a7..8c5f91c15 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -1466,9 +1466,6 @@ class PrismaClient: ): args_passed_in = locals() start_time = time.time() - verbose_proxy_logger.debug( - f"PrismaClient: get_data - args_passed_in: {args_passed_in}" - ) hashed_token: Optional[str] = None try: response: Any = None diff --git a/litellm/tests/test_alangfuse.py b/litellm/tests/test_alangfuse.py index fa1260637..e9da35b77 100644 --- a/litellm/tests/test_alangfuse.py +++ b/litellm/tests/test_alangfuse.py @@ -1224,3 +1224,14 @@ def test_langfuse_prompt_type(prompt): _add_prompt_to_generation_params( generation_params=generation_params, clean_metadata=clean_metadata ) + + +def test_langfuse_logging_metadata(): + from litellm.integrations.langfuse import log_requester_metadata + + metadata = {"key": "value", "requester_metadata": {"key": "value"}} + + got_metadata = log_requester_metadata(clean_metadata=metadata) + expected_metadata = {"requester_metadata": {"key": "value"}} + + assert expected_metadata == got_metadata diff --git a/litellm/tests/test_anthropic_prompt_caching.py b/litellm/tests/test_anthropic_prompt_caching.py index 06f6916ed..2224da561 100644 --- a/litellm/tests/test_anthropic_prompt_caching.py +++ b/litellm/tests/test_anthropic_prompt_caching.py @@ -61,6 +61,7 @@ async def test_litellm_anthropic_prompt_caching_tools(): } mock_response.json = return_val + 
mock_response.headers = {"key": "value"} litellm.set_verbose = True with patch( @@ -466,6 +467,7 @@ async def test_litellm_anthropic_prompt_caching_system(): } mock_response.json = return_val + mock_response.headers = {"key": "value"} litellm.set_verbose = True with patch( diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index f49fb6254..a51dcc693 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -24,7 +24,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries = 3 +# litellm.num_retries=3 litellm.cache = None litellm.success_callback = [] diff --git a/litellm/tests/test_custom_callback_input.py b/litellm/tests/test_custom_callback_input.py index 504c881fa..632112f5b 100644 --- a/litellm/tests/test_custom_callback_input.py +++ b/litellm/tests/test_custom_callback_input.py @@ -1173,7 +1173,12 @@ def test_turn_off_message_logging(): ##### VALID JSON ###### -@pytest.mark.parametrize("model", ["gpt-3.5-turbo", "azure/chatgpt-v-2"]) +@pytest.mark.parametrize( + "model", + [ + "ft:gpt-3.5-turbo:my-org:custom_suffix:id" + ], # "gpt-3.5-turbo", "azure/chatgpt-v-2", +) @pytest.mark.parametrize( "turn_off_message_logging", [ @@ -1200,7 +1205,7 @@ def test_standard_logging_payload(model, turn_off_message_logging): _ = litellm.completion( model=model, messages=[{"role": "user", "content": "Hey, how's it going?"}], - # mock_response="Going well!", + mock_response="Going well!", ) time.sleep(2) diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index e23285422..2794fe68b 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -7,6 +7,8 @@ from typing import Any from openai import AuthenticationError, BadRequestError, OpenAIError, RateLimitError +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler + sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path @@ -884,6 +886,42 @@ def _pre_call_utils( return data, original_function, mapped_target +def _pre_call_utils_httpx( + call_type: str, + data: dict, + client: Union[HTTPHandler, AsyncHTTPHandler], + sync_mode: bool, + streaming: Optional[bool], +): + mapped_target: Any = client.client + if call_type == "embedding": + data["input"] = "Hello world!" 
+ + if sync_mode: + original_function = litellm.embedding + else: + original_function = litellm.aembedding + elif call_type == "chat_completion": + data["messages"] = [{"role": "user", "content": "Hello world"}] + if streaming is True: + data["stream"] = True + + if sync_mode: + original_function = litellm.completion + else: + original_function = litellm.acompletion + elif call_type == "completion": + data["prompt"] = "Hello world" + if streaming is True: + data["stream"] = True + if sync_mode: + original_function = litellm.text_completion + else: + original_function = litellm.atext_completion + + return data, original_function, mapped_target + + @pytest.mark.parametrize( "sync_mode", [True, False], @@ -1006,3 +1044,111 @@ async def test_exception_with_headers(sync_mode, provider, model, call_type, str if exception_raised is False: print(resp) assert exception_raised + + +@pytest.mark.parametrize( + "sync_mode", + [True, False], +) +@pytest.mark.parametrize("streaming", [True, False]) +@pytest.mark.parametrize( + "provider, model, call_type", + [ + ("anthropic", "claude-3-haiku-20240307", "chat_completion"), + ], +) +@pytest.mark.asyncio +async def test_exception_with_headers_httpx( + sync_mode, provider, model, call_type, streaming +): + """ + User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds" + but Azure says to retry in at most 9s + + ``` + {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. 
Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"} + ``` + """ + print(f"Received args: {locals()}") + import openai + + if sync_mode: + client = HTTPHandler() + else: + client = AsyncHTTPHandler() + + data = {"model": model} + data, original_function, mapped_target = _pre_call_utils_httpx( + call_type=call_type, + data=data, + client=client, + sync_mode=sync_mode, + streaming=streaming, + ) + + cooldown_time = 30.0 + + def _return_exception(*args, **kwargs): + import datetime + + from httpx import Headers, HTTPStatusError, Request, Response + + # Create the Request object + request = Request("POST", "http://0.0.0.0:9000/chat/completions") + + # Create the Response object with the necessary headers and status code + response = Response( + status_code=429, + headers=Headers( + { + "date": "Sat, 21 Sep 2024 22:56:53 GMT", + "server": "uvicorn", + "retry-after": "30", + "content-length": "30", + "content-type": "application/json", + } + ), + request=request, + ) + + # Create and raise the HTTPStatusError exception + raise HTTPStatusError( + message="Error code: 429 - Rate Limit Error!", + request=request, + response=response, + ) + + with patch.object( + mapped_target, + "send", + side_effect=_return_exception, + ): + new_retry_after_mock_client = MagicMock(return_value=-1) + + litellm.utils._get_retry_after_from_exception_header = ( + new_retry_after_mock_client + ) + + exception_raised = False + try: + if sync_mode: + resp = original_function(**data, client=client) + if streaming: + for chunk in resp: + continue + else: + resp = await original_function(**data, client=client) + + if streaming: + async for chunk in resp: + continue + + except litellm.RateLimitError as e: + exception_raised = True + assert e.litellm_response_headers is not None + print("e.litellm_response_headers", e.litellm_response_headers) + assert int(e.litellm_response_headers["retry-after"]) == cooldown_time + + if exception_raised is False: + print(resp) + assert exception_raised diff --git a/litellm/tests/test_function_calling.py b/litellm/tests/test_function_calling.py index 67d4fe6c9..d323325f4 100644 --- a/litellm/tests/test_function_calling.py +++ b/litellm/tests/test_function_calling.py @@ -45,11 +45,12 @@ def get_current_weather(location, unit="fahrenheit"): @pytest.mark.parametrize( "model", [ - # "gpt-3.5-turbo-1106", + "gpt-3.5-turbo-1106", # "mistral/mistral-large-latest", # "claude-3-haiku-20240307", # "gemini/gemini-1.5-pro", "anthropic.claude-3-sonnet-20240229-v1:0", + "groq/llama3-8b-8192", ], ) @pytest.mark.flaky(retries=3, delay=1) @@ -154,6 +155,105 @@ def test_aaparallel_function_call(model): # test_parallel_function_call() +from litellm.types.utils import ChatCompletionMessageToolCall, Function, Message + + +@pytest.mark.parametrize( + "model, provider", + [ + ( + "anthropic.claude-3-sonnet-20240229-v1:0", + "bedrock", + ), + ("claude-3-haiku-20240307", "anthropic"), + ], +) +@pytest.mark.parametrize( + "messages, expected_error_msg", + [ + ( + [ + { + "role": "user", + "content": "What's the weather like in San Francisco, Tokyo, and Paris? 
- give me 3 responses", + }, + Message( + content="Here are the current weather conditions for San Francisco, Tokyo, and Paris:", + role="assistant", + tool_calls=[ + ChatCompletionMessageToolCall( + index=1, + function=Function( + arguments='{"location": "San Francisco, CA", "unit": "fahrenheit"}', + name="get_current_weather", + ), + id="tooluse_Jj98qn6xQlOP_PiQr-w9iA", + type="function", + ) + ], + function_call=None, + ), + { + "tool_call_id": "tooluse_Jj98qn6xQlOP_PiQr-w9iA", + "role": "tool", + "name": "get_current_weather", + "content": '{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}', + }, + ], + True, + ), + ( + [ + { + "role": "user", + "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses", + } + ], + False, + ), + ], +) +def test_parallel_function_call_anthropic_error_msg( + model, provider, messages, expected_error_msg +): + """ + Anthropic doesn't support tool calling without `tools=` param specified. + + Ensure this error is thrown when `tools=` param is not specified. But tool call requests are made. + + Reference Issue: https://github.com/BerriAI/litellm/issues/5747, https://github.com/BerriAI/litellm/issues/5388 + """ + try: + litellm.set_verbose = True + + messages = messages + + if expected_error_msg: + with pytest.raises(litellm.UnsupportedParamsError) as e: + second_response = litellm.completion( + model=model, + messages=messages, + temperature=0.2, + seed=22, + drop_params=True, + ) # get a new response from the model where it can see the function response + print("second response\n", second_response) + else: + second_response = litellm.completion( + model=model, + messages=messages, + temperature=0.2, + seed=22, + drop_params=True, + ) # get a new response from the model where it can see the function response + print("second response\n", second_response) + except litellm.InternalServerError as e: + print(e) + except litellm.RateLimitError as e: + print(e) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + def test_parallel_function_call_stream(): try: diff --git a/litellm/tests/test_get_model_info.py b/litellm/tests/test_get_model_info.py index 3a923bd1e..19c72ab32 100644 --- a/litellm/tests/test_get_model_info.py +++ b/litellm/tests/test_get_model_info.py @@ -62,3 +62,9 @@ def test_get_model_info_shows_supports_prompt_caching(): info = litellm.get_model_info("deepseek/deepseek-chat") print("info", info) assert info.get("supports_prompt_caching") is True + + +def test_get_model_info_finetuned_models(): + info = litellm.get_model_info("ft:gpt-3.5-turbo:my-org:custom_suffix:id") + print("info", info) + assert info["input_cost_per_token"] == 0.000003 diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py index 720abf8dd..9e5a48c53 100644 --- a/litellm/types/llms/anthropic.py +++ b/litellm/types/llms/anthropic.py @@ -18,13 +18,13 @@ class AnthropicMessagesTool(TypedDict, total=False): class AnthropicMessagesTextParam(TypedDict, total=False): - type: Literal["text"] - text: str + type: Required[Literal["text"]] + text: Required[str] cache_control: Optional[Union[dict, ChatCompletionCachedContent]] class AnthropicMessagesToolUseParam(TypedDict): - type: Literal["tool_use"] + type: Required[Literal["tool_use"]] id: str name: str input: dict @@ -58,8 +58,8 @@ class AnthropicImageParamSource(TypedDict): class AnthropicMessagesImageParam(TypedDict, total=False): - type: Literal["image"] - source: AnthropicImageParamSource + type: Required[Literal["image"]] + source: 
Required[AnthropicImageParamSource] cache_control: Optional[Union[dict, ChatCompletionCachedContent]] @@ -102,16 +102,13 @@ class AnthropicSystemMessageContent(TypedDict, total=False): cache_control: Optional[Union[dict, ChatCompletionCachedContent]] -class AnthropicMessagesRequest(TypedDict, total=False): - model: Required[str] - messages: Required[ - List[ - Union[ - AnthropicMessagesUserMessageParam, - AnthopicMessagesAssistantMessageParam, - ] - ] - ] +AllAnthropicMessageValues = Union[ + AnthropicMessagesUserMessageParam, AnthopicMessagesAssistantMessageParam +] + + +class AnthropicMessageRequestBase(TypedDict, total=False): + messages: Required[List[AllAnthropicMessageValues]] max_tokens: Required[int] metadata: AnthropicMetadata stop_sequences: List[str] @@ -123,6 +120,9 @@ class AnthropicMessagesRequest(TypedDict, total=False): top_k: int top_p: float + +class AnthropicMessagesRequest(AnthropicMessageRequestBase, total=False): + model: Required[str] # litellm param - used for tracking litellm proxy metadata in the request litellm_metadata: dict @@ -291,9 +291,9 @@ class AnthropicResponse(BaseModel): """Billing and rate-limit usage.""" -class AnthropicChatCompletionUsageBlock(TypedDict, total=False): - prompt_tokens: Required[int] - completion_tokens: Required[int] - total_tokens: Required[int] +from .openai import ChatCompletionUsageBlock + + +class AnthropicChatCompletionUsageBlock(ChatCompletionUsageBlock, total=False): cache_creation_input_tokens: int cache_read_input_tokens: int diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index b73b4bc3d..ee8336699 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -343,11 +343,14 @@ class ChatCompletionImageObject(TypedDict): image_url: Union[str, ChatCompletionImageUrlObject] +OpenAIMessageContent = Union[ + str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]] +] + + class OpenAIChatCompletionUserMessage(TypedDict): role: Literal["user"] - content: Union[ - str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]] - ] + content: OpenAIMessageContent class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False): diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 3dc644030..e21a883f3 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union from openai._models import BaseModel as OpenAIObject from openai.types.audio.transcription_create_params import FileTypes # type: ignore from openai.types.completion_usage import CompletionTokensDetails, CompletionUsage -from pydantic import ConfigDict, Field, PrivateAttr +from pydantic import ConfigDict, PrivateAttr from typing_extensions import Callable, Dict, Required, TypedDict, override from ..litellm_core_utils.core_helpers import map_finish_reason diff --git a/litellm/utils.py b/litellm/utils.py index fe3ef51f1..48b6f3e48 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -59,7 +59,12 @@ import litellm.litellm_core_utils.audio_utils.utils import litellm.litellm_core_utils.json_validation_rule from litellm.caching import DualCache from litellm.litellm_core_utils.core_helpers import map_finish_reason -from litellm.litellm_core_utils.exception_mapping_utils import get_error_message +from litellm.litellm_core_utils.exception_mapping_utils import ( + _get_litellm_response_headers, + _get_response_headers, + exception_type, + get_error_message, +) from 
litellm.litellm_core_utils.get_llm_provider_logic import ( _is_non_openai_azure_model, get_llm_provider, @@ -246,39 +251,6 @@ def print_verbose( pass -####### LOGGING ################### - - -def exception_logging( - additional_args={}, - logger_fn=None, - exception=None, -): - try: - model_call_details = {} - if exception: - model_call_details["exception"] = exception - model_call_details["additional_args"] = additional_args - # User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs - print_verbose( - f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}" - ) - if logger_fn and callable(logger_fn): - try: - logger_fn( - model_call_details - ) # Expectation: any logger function passed in by the user should accept a dict object - except Exception as e: - print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" - ) - except Exception as e: - print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" - ) - pass - - ####### RULES ################### @@ -2462,9 +2434,9 @@ def get_optional_params_transcription( if custom_llm_provider == "openai" or custom_llm_provider == "azure": optional_params = non_default_params elif custom_llm_provider == "groq": - supported_params = litellm.GroqConfig().get_supported_openai_params_stt() + supported_params = litellm.GroqSTTConfig().get_supported_openai_params_stt() _check_valid_arg(supported_params=supported_params) - optional_params = litellm.GroqConfig().map_openai_params_stt( + optional_params = litellm.GroqSTTConfig().map_openai_params_stt( non_default_params=non_default_params, optional_params=optional_params, model=model, @@ -2778,6 +2750,7 @@ def get_optional_params( parallel_tool_calls=None, drop_params=None, additional_drop_params=None, + messages: Optional[List[AllMessageValues]] = None, **kwargs, ): # retrieve all parameters passed to the function @@ -2857,6 +2830,7 @@ def get_optional_params( "parallel_tool_calls": None, "drop_params": None, "additional_drop_params": None, + "messages": None, } # filter out those parameters that were passed with non-default values @@ -2869,6 +2843,7 @@ def get_optional_params( and k != "api_version" and k != "drop_params" and k != "additional_drop_params" + and k != "messages" and k in default_params and v != default_params[k] and _should_drop_param(k=k, additional_drop_params=additional_drop_params) @@ -3033,7 +3008,9 @@ def get_optional_params( ) _check_valid_arg(supported_params=supported_params) optional_params = litellm.AnthropicConfig().map_openai_params( - non_default_params=non_default_params, optional_params=optional_params + non_default_params=non_default_params, + optional_params=optional_params, + messages=messages, ) elif custom_llm_provider == "cohere": ## check if unsupported param passed in @@ -3383,6 +3360,7 @@ def get_optional_params( if drop_params is not None and isinstance(drop_params, bool) else False ), + messages=messages, ) elif "ai21" in model: _check_valid_arg(supported_params=supported_params) @@ -4752,6 +4730,28 @@ def _strip_stable_vertex_version(model_name) -> str: return re.sub(r"-\d+$", "", model_name) +def _strip_openai_finetune_model_name(model_name: str) -> str: + """ + Strips the organization, custom suffix, and ID from an OpenAI fine-tuned model name. 
+ + input: ft:gpt-3.5-turbo:my-org:custom_suffix:id + output: ft:gpt-3.5-turbo + + Args: + model_name (str): The full model name + + Returns: + str: The stripped model name + """ + return re.sub(r"(:[^:]+){3}$", "", model_name) + + +def _strip_model_name(model: str) -> str: + strip_version = _strip_stable_vertex_version(model_name=model) + strip_finetune = _strip_openai_finetune_model_name(model_name=strip_version) + return strip_finetune + + def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo: """ Get a dict for the maximum tokens (context window), input_cost_per_token, output_cost_per_token for a given model. @@ -4857,14 +4857,14 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod except: pass combined_model_name = model - combined_stripped_model_name = _strip_stable_vertex_version( - model_name=model - ) + stripped_model_name = _strip_model_name(model=model) + combined_stripped_model_name = stripped_model_name else: split_model = model combined_model_name = "{}/{}".format(custom_llm_provider, model) + stripped_model_name = _strip_model_name(model=model) combined_stripped_model_name = "{}/{}".format( - custom_llm_provider, _strip_stable_vertex_version(model_name=model) + custom_llm_provider, _strip_model_name(model=model) ) ######################### @@ -4894,8 +4894,9 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod Check if: (in order of specificity) 1. 'custom_llm_provider/model' in litellm.model_cost. Checks "groq/llama3-8b-8192" if model="llama3-8b-8192" and custom_llm_provider="groq" 2. 'combined_stripped_model_name' in litellm.model_cost. Checks if 'gemini/gemini-1.5-flash' in model map, if 'gemini/gemini-1.5-flash-001' given. - 3. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" and custom_llm_provider=None - 4. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" + 3. 'stripped_model_name' in litellm.model_cost. Checks if 'ft:gpt-3.5-turbo' in model map, if 'ft:gpt-3.5-turbo:my-org:custom_suffix:id' given. + 4. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" and custom_llm_provider=None + 5. 'split_model' in litellm.model_cost. 
Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" """ if combined_model_name in litellm.model_cost: key = combined_model_name @@ -4912,7 +4913,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod else: raise Exception elif combined_stripped_model_name in litellm.model_cost: - key = model + key = combined_stripped_model_name _model_info = litellm.model_cost[combined_stripped_model_name] _model_info["supported_openai_params"] = supported_openai_params if ( @@ -4923,6 +4924,34 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod "litellm_provider" ].startswith("vertex_ai"): pass + elif custom_llm_provider == "fireworks_ai" and _model_info[ + "litellm_provider" + ].startswith("fireworks_ai"): + pass + else: + raise Exception( + "Got provider={}, Expected provider={}, for model={}".format( + _model_info["litellm_provider"], + custom_llm_provider, + model, + ) + ) + elif stripped_model_name in litellm.model_cost: + key = stripped_model_name + _model_info = litellm.model_cost[stripped_model_name] + _model_info["supported_openai_params"] = supported_openai_params + if ( + "litellm_provider" in _model_info + and _model_info["litellm_provider"] != custom_llm_provider + ): + if custom_llm_provider == "vertex_ai" and _model_info[ + "litellm_provider" + ].startswith("vertex_ai"): + pass + elif custom_llm_provider == "fireworks_ai" and _model_info[ + "litellm_provider" + ].startswith("fireworks_ai"): + pass else: raise Exception( "Got provider={}, Expected provider={}, for model={}".format( @@ -5052,7 +5081,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod "supports_prompt_caching", False ), ) - except Exception as e: + except Exception: raise Exception( "This model isn't mapped yet. model={}, custom_llm_provider={}. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json.".format( model, custom_llm_provider @@ -6275,1988 +6304,6 @@ def get_model_list(): ) -####### EXCEPTION MAPPING ################ -def _get_litellm_response_headers( - original_exception: Exception, -) -> Optional[httpx.Headers]: - """ - Extract and return the response headers from a mapped exception, if present. - - Used for accurate retry logic. - """ - _response_headers: Optional[httpx.Headers] = None - try: - _response_headers = getattr( - original_exception, "litellm_response_headers", None - ) - except Exception: - return None - - return _response_headers - - -def _get_response_headers(original_exception: Exception) -> Optional[httpx.Headers]: - """ - Extract and return the response headers from an exception, if present. - - Used for accurate retry logic. 
- """ - _response_headers: Optional[httpx.Headers] = None - try: - _response_headers = getattr(original_exception, "headers", None) - error_response = getattr(original_exception, "response", None) - if _response_headers is None and error_response: - _response_headers = getattr(error_response, "headers", None) - except Exception: - return None - - return _response_headers - - -def exception_type( - model, - original_exception, - custom_llm_provider, - completion_kwargs={}, - extra_kwargs={}, -): - global user_logger_fn, liteDebuggerClient - - if any( - isinstance(original_exception, exc_type) - for exc_type in litellm.LITELLM_EXCEPTION_TYPES - ): - return original_exception - exception_mapping_worked = False - exception_provider = custom_llm_provider - if litellm.suppress_debug_info is False: - print() # noqa - print( # noqa - "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa - ) # noqa - print( # noqa - "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." # noqa - ) # noqa - print() # noqa - - litellm_response_headers = _get_response_headers( - original_exception=original_exception - ) - try: - if model: - if hasattr(original_exception, "message"): - error_str = str(original_exception.message) - else: - error_str = str(original_exception) - if isinstance(original_exception, BaseException): - exception_type = type(original_exception).__name__ - else: - exception_type = "" - - ################################################################################ - # Common Extra information needed for all providers - # We pass num retries, api_base, vertex_deployment etc to the exception here - ################################################################################ - extra_information = "" - try: - _api_base = litellm.get_api_base( - model=model, optional_params=extra_kwargs - ) - messages = litellm.get_first_chars_messages(kwargs=completion_kwargs) - _vertex_project = extra_kwargs.get("vertex_project") - _vertex_location = extra_kwargs.get("vertex_location") - _metadata = extra_kwargs.get("metadata", {}) or {} - _model_group = _metadata.get("model_group") - _deployment = _metadata.get("deployment") - extra_information = f"\nModel: {model}" - - if ( - isinstance(custom_llm_provider, str) - and len(custom_llm_provider) > 0 - ): - exception_provider = ( - custom_llm_provider[0].upper() - + custom_llm_provider[1:] - + "Exception" - ) - - if _api_base: - extra_information += f"\nAPI Base: `{_api_base}`" - if ( - messages - and len(messages) > 0 - and litellm.redact_messages_in_exceptions is False - ): - extra_information += f"\nMessages: `{messages}`" - - if _model_group is not None: - extra_information += f"\nmodel_group: `{_model_group}`\n" - if _deployment is not None: - extra_information += f"\ndeployment: `{_deployment}`\n" - if _vertex_project is not None: - extra_information += f"\nvertex_project: `{_vertex_project}`\n" - if _vertex_location is not None: - extra_information += f"\nvertex_location: `{_vertex_location}`\n" - - # on litellm proxy add key name + team to exceptions - extra_information = _add_key_name_and_team_to_alert( - request_info=extra_information, metadata=_metadata - ) - except Exception: - # DO NOT LET this Block raising the original exception - pass - - ################################################################################ - # End of Common Extra information Needed for all providers - ################################################################################ - - 
################################################################################ - #################### Start of Provider Exception mapping #################### - ################################################################################ - - if "Request Timeout Error" in error_str or "Request timed out" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"APITimeoutError - Request timed out. \nerror_str: {error_str}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - - if ( - custom_llm_provider == "openai" - or custom_llm_provider == "text-completion-openai" - or custom_llm_provider == "custom_openai" - or custom_llm_provider in litellm.openai_compatible_providers - ): - # custom_llm_provider is openai, make it OpenAI - message = get_error_message(error_obj=original_exception) - if message is None: - if hasattr(original_exception, "message"): - message = original_exception.message - else: - message = str(original_exception) - - if message is not None and isinstance( - message, str - ): # done to prevent user-confusion. Relevant issue - https://github.com/BerriAI/litellm/issues/1414 - message = message.replace("OPENAI", custom_llm_provider.upper()) - message = message.replace( - "openai.OpenAIError", - "{}.{}Error".format(custom_llm_provider, custom_llm_provider), - ) - if custom_llm_provider == "openai": - exception_provider = "OpenAI" + "Exception" - else: - exception_provider = ( - custom_llm_provider[0].upper() - + custom_llm_provider[1:] - + "Exception" - ) - - if ( - "This model's maximum context length is" in error_str - or "string too long. Expected a string with maximum length" - in error_str - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"ContextWindowExceededError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "model_not_found" in error_str - ): - exception_mapping_worked = True - raise NotFoundError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif "A timeout occurred" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "content_policy_violation" in error_str - ): - exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"ContentPolicyViolationError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "Incorrect API key provided" not in error_str - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif "Web server is returning an unknown error" in error_str: - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - ) - elif "Request too large" in 
error_str: - exception_mapping_worked = True - raise RateLimitError( - message=f"RateLimitError: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" - in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"AuthenticationError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif "Mistral API raised a streaming error" in error_str: - exception_mapping_worked = True - _request = httpx.Request( - method="POST", url="https://api.openai.com/v1" - ) - raise APIError( - status_code=500, - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - request=_request, - litellm_debug_info=extra_information, - ) - elif hasattr(original_exception, "status_code"): - exception_mapping_worked = True - if original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AuthenticationError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"NotFoundError: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"Timeout Error: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=getattr(original_exception, "response", None), - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"RateLimitError: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=getattr(original_exception, "response", None), - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"ServiceUnavailableError: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=getattr(original_exception, "response", None), - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"Timeout Error: {exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - 
else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"APIError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - request=getattr(original_exception, "request", None), - litellm_debug_info=extra_information, - ) - else: - # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors - # exception_mapping_worked = True - raise APIConnectionError( - message=f"APIConnectionError: {exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - request=httpx.Request( - method="POST", url="https://api.openai.com/v1/" - ), - ) - elif custom_llm_provider == "anthropic": # one of the anthropics - if "prompt is too long" in error_str or "prompt: length" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message="AnthropicError - {}".format(error_str), - model=model, - llm_provider="anthropic", - ) - if "Invalid API Key" in error_str: - exception_mapping_worked = True - raise AuthenticationError( - message="AnthropicError - {}".format(error_str), - model=model, - llm_provider="anthropic", - ) - if "content filtering policy" in error_str: - exception_mapping_worked = True - raise ContentPolicyViolationError( - message="AnthropicError - {}".format(error_str), - model=model, - llm_provider="anthropic", - ) - if "Client error '400 Bad Request'" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message="AnthropicError - {}".format(error_str), - model=model, - llm_provider="anthropic", - ) - if hasattr(original_exception, "status_code"): - print_verbose(f"status_code: {original_exception.status_code}") - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AnthropicException - {error_str}", - llm_provider="anthropic", - model=model, - ) - elif ( - original_exception.status_code == 400 - or original_exception.status_code == 413 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"AnthropicException - {error_str}", - model=model, - llm_provider="anthropic", - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"AnthropicException - {error_str}", - model=model, - llm_provider="anthropic", - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"AnthropicException - {error_str}", - model=model, - llm_provider="anthropic", - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AnthropicException - {error_str}", - llm_provider="anthropic", - model=model, - ) - elif ( - original_exception.status_code == 500 - or original_exception.status_code == 529 - ): - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"AnthropicException - {error_str}. Handle with `litellm.InternalServerError`.", - llm_provider="anthropic", - model=model, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise litellm.ServiceUnavailableError( - message=f"AnthropicException - {error_str}. 
Handle with `litellm.ServiceUnavailableError`.", - llm_provider="anthropic", - model=model, - ) - elif custom_llm_provider == "replicate": - if "Incorrect authentication token" in error_str: - exception_mapping_worked = True - raise AuthenticationError( - message=f"ReplicateException - {error_str}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif "input is too long" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"ReplicateException - {error_str}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif exception_type == "ModelError": - exception_mapping_worked = True - raise BadRequestError( - message=f"ReplicateException - {error_str}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif "Request was throttled" in error_str: - exception_mapping_worked = True - raise RateLimitError( - message=f"ReplicateException - {error_str}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"ReplicateException - {original_exception.message}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 400 - or original_exception.status_code == 413 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise UnprocessableEntityError( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate", - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise UnprocessableEntityError( - message=f"ReplicateException - {original_exception.message}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"ReplicateException - {original_exception.message}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 500: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"ReplicateException - {original_exception.message}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - exception_mapping_worked = True - raise APIError( - status_code=500, - message=f"ReplicateException - {str(original_exception)}", - llm_provider="replicate", - model=model, - request=httpx.Request( - method="POST", - url="https://api.replicate.com/v1/deployments", - ), - ) - elif custom_llm_provider == "watsonx": - if "token_quota_reached" in error_str: - exception_mapping_worked = True - raise RateLimitError( - message=f"WatsonxException: Rate Limit Errror - {error_str}", - llm_provider="watsonx", - model=model, - response=original_exception.response, - ) - elif ( - 
custom_llm_provider == "predibase" - or custom_llm_provider == "databricks" - ): - if "authorization denied for" in error_str: - exception_mapping_worked = True - - # Predibase returns the raw API Key in the response - this block ensures it's not returned in the exception - if ( - error_str is not None - and isinstance(error_str, str) - and "bearer" in error_str.lower() - ): - # only keep the first 10 chars after the occurnence of "bearer" - _bearer_token_start_index = error_str.lower().find("bearer") - error_str = error_str[: _bearer_token_start_index + 14] - error_str += "XXXXXXX" + '"' - - raise AuthenticationError( - message=f"{custom_llm_provider}Exception: Authentication Error - {error_str}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif hasattr(original_exception, "status_code"): - if original_exception.status_code == 500: - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - elif ( - original_exception.status_code == 401 - or original_exception.status_code == 403 - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif ( - original_exception.status_code == 422 - or original_exception.status_code == 424 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"{custom_llm_provider}Exception - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif custom_llm_provider == "bedrock": - if ( - "too many tokens" in error_str - or "expected maxLength:" in error_str - or "Input is too long" in error_str - or "prompt: length: 1.." 
in error_str - or "Too many input tokens" in error_str - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"BedrockException: Context Window Error - {error_str}", - model=model, - llm_provider="bedrock", - ) - elif "Malformed input request" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"BedrockException - {error_str}", - model=model, - llm_provider="bedrock", - response=original_exception.response, - ) - elif "A conversation must start with a user message." in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"BedrockException - {error_str}\n. Pass in default user message via `completion(..,user_continue_message=)` or enable `litellm.modify_params=True`.\nFor Proxy: do via `litellm_settings::modify_params: True` or user_continue_message under `litellm_params`", - model=model, - llm_provider="bedrock", - response=original_exception.response, - ) - elif ( - "Unable to locate credentials" in error_str - or "The security token included in the request is invalid" - in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"BedrockException Invalid Authentication - {error_str}", - model=model, - llm_provider="bedrock", - response=original_exception.response, - ) - elif "AccessDeniedException" in error_str: - exception_mapping_worked = True - raise PermissionDeniedError( - message=f"BedrockException PermissionDeniedError - {error_str}", - model=model, - llm_provider="bedrock", - response=original_exception.response, - ) - elif ( - "throttlingException" in error_str - or "ThrottlingException" in error_str - ): - exception_mapping_worked = True - raise RateLimitError( - message=f"BedrockException: Rate Limit Error - {error_str}", - model=model, - llm_provider="bedrock", - response=original_exception.response, - ) - elif ( - "Connect timeout on endpoint URL" in error_str - or "timed out" in error_str - ): - exception_mapping_worked = True - raise Timeout( - message=f"BedrockException: Timeout Error - {error_str}", - model=model, - llm_provider="bedrock", - ) - elif "Could not process image" in error_str: - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"BedrockException - {error_str}", - model=model, - llm_provider="bedrock", - ) - elif hasattr(original_exception, "status_code"): - if original_exception.status_code == 500: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"BedrockException - {original_exception.message}", - llm_provider="bedrock", - model=model, - response=httpx.Response( - status_code=500, - request=httpx.Request( - method="POST", url="https://api.openai.com/v1/" - ), - ), - ) - elif original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"BedrockException - {original_exception.message}", - llm_provider="bedrock", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"BedrockException - {original_exception.message}", - llm_provider="bedrock", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"BedrockException - {original_exception.message}", - llm_provider="bedrock", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 408: - 
exception_mapping_worked = True - raise Timeout( - message=f"BedrockException - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"BedrockException - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"BedrockException - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"BedrockException - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"BedrockException - {original_exception.message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif custom_llm_provider == "sagemaker": - if "Unable to locate credentials" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"litellm.BadRequestError: SagemakerException - {error_str}", - model=model, - llm_provider="sagemaker", - response=original_exception.response, - ) - elif ( - "Input validation error: `best_of` must be > 0 and <= 2" - in error_str - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints", - model=model, - llm_provider="sagemaker", - response=original_exception.response, - ) - elif ( - "`inputs` tokens + `max_new_tokens` must be <=" in error_str - or "instance type with more CPU capacity or memory" in error_str - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"SagemakerException - {error_str}", - model=model, - llm_provider="sagemaker", - response=original_exception.response, - ) - elif ( - custom_llm_provider == "vertex_ai" - or custom_llm_provider == "vertex_ai_beta" - or custom_llm_provider == "gemini" - ): - if ( - "Vertex AI API has not been used in project" in error_str - or "Unable to find your project" in error_str - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"litellm.BadRequestError: VertexAIException - {error_str}", - model=model, - llm_provider="vertex_ai", - response=httpx.Response( - status_code=400, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - litellm_debug_info=extra_information, - ) - if "400 Request payload size exceeds" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"VertexException - {error_str}", - model=model, - llm_provider=custom_llm_provider, - ) - elif ( - "None Unknown Error." in error_str - or "Content has no parts." 
in error_str - ): - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"litellm.InternalServerError: VertexAIException - {error_str}", - model=model, - llm_provider="vertex_ai", - response=httpx.Response( - status_code=500, - content=str(original_exception), - request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore - ), - litellm_debug_info=extra_information, - ) - elif "API key not valid." in error_str: - exception_mapping_worked = True - raise AuthenticationError( - message=f"{custom_llm_provider}Exception - {error_str}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif "403" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"VertexAIException BadRequestError - {error_str}", - model=model, - llm_provider="vertex_ai", - response=httpx.Response( - status_code=403, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - litellm_debug_info=extra_information, - ) - elif ( - "The response was blocked." in error_str - or "Output blocked by content filtering policy" - in error_str # anthropic on vertex ai - ): - exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"VertexAIException ContentPolicyViolationError - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - response=httpx.Response( - status_code=400, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - ) - elif ( - "429 Quota exceeded" in error_str - or "Quota exceeded for" in error_str - or "IndexError: list index out of range" in error_str - or "429 Unable to submit request because the service is temporarily out of capacity." 
- in error_str - ): - exception_mapping_worked = True - raise RateLimitError( - message=f"litellm.RateLimitError: VertexAIException - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - response=httpx.Response( - status_code=429, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - ) - elif "500 Internal Server Error" in error_str: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"litellm.ServiceUnavailableError: VertexAIException - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - ) - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"VertexAIException BadRequestError - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - response=httpx.Response( - status_code=400, - request=httpx.Request( - method="POST", - url="https://cloud.google.com/vertex-ai/", - ), - ), - ) - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"VertexAIException - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - if original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"VertexAIException - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - if original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"VertexAIException - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - - if original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"litellm.RateLimitError: VertexAIException - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - response=httpx.Response( - status_code=429, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - ) - if original_exception.status_code == 500: - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"VertexAIException InternalServerError - {error_str}", - model=model, - llm_provider="vertex_ai", - litellm_debug_info=extra_information, - response=httpx.Response( - status_code=500, - content=str(original_exception), - request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore - ), - ) - if original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"VertexAIException - {original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - ) - elif custom_llm_provider == "palm" or custom_llm_provider == "gemini": - if "503 Getting metadata" in error_str: - # auth errors look like this - # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate. - exception_mapping_worked = True - raise BadRequestError( - message=f"GeminiException - Invalid api key", - model=model, - llm_provider="palm", - response=original_exception.response, - ) - if ( - "504 Deadline expired before operation could complete." 
in error_str - or "504 Deadline Exceeded" in error_str - ): - exception_mapping_worked = True - raise Timeout( - message=f"GeminiException - {original_exception.message}", - model=model, - llm_provider="palm", - ) - if "400 Request payload size exceeds" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"GeminiException - {error_str}", - model=model, - llm_provider="palm", - response=original_exception.response, - ) - if ( - "500 An internal error has occurred." in error_str - or "list index out of range" in error_str - ): - exception_mapping_worked = True - raise APIError( - status_code=getattr(original_exception, "status_code", 500), - message=f"GeminiException - {original_exception.message}", - llm_provider="palm", - model=model, - request=httpx.Response( - status_code=429, - request=httpx.Request( - method="POST", - url=" https://cloud.google.com/vertex-ai/", - ), - ), - ) - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"GeminiException - {error_str}", - model=model, - llm_provider="palm", - response=original_exception.response, - ) - # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes - elif custom_llm_provider == "cloudflare": - if "Authentication error" in error_str: - exception_mapping_worked = True - raise AuthenticationError( - message=f"Cloudflare Exception - {original_exception.message}", - llm_provider="cloudflare", - model=model, - response=original_exception.response, - ) - if "must have required property" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"Cloudflare Exception - {original_exception.message}", - llm_provider="cloudflare", - model=model, - response=original_exception.response, - ) - elif ( - custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat" - ): # Cohere - if ( - "invalid api token" in error_str - or "No API key provided." 
in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - elif "too many tokens" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"CohereException - {original_exception.message}", - model=model, - llm_provider="cohere", - response=original_exception.response, - ) - elif hasattr(original_exception, "status_code"): - if ( - original_exception.status_code == 400 - or original_exception.status_code == 498 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - ) - elif original_exception.status_code == 500: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - elif ( - "CohereConnectionError" in exception_type - ): # cohere seems to fire these errors when we load test it (1k+ messages / min) - exception_mapping_worked = True - raise RateLimitError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - elif "invalid type:" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - elif "Unexpected server error" in error_str: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - response=original_exception.response, - ) - else: - if hasattr(original_exception, "status_code"): - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"CohereException - {original_exception.message}", - llm_provider="cohere", - model=model, - request=original_exception.request, - ) - raise original_exception - elif custom_llm_provider == "huggingface": - if "length limit exceeded" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=error_str, - model=model, - llm_provider="huggingface", - response=original_exception.response, - ) - elif "A valid user token is required" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=error_str, - llm_provider="huggingface", - model=model, - response=original_exception.response, - ) - elif "Rate limit reached" in error_str: - exception_mapping_worked = True - raise RateLimitError( - message=error_str, - llm_provider="huggingface", - model=model, - response=original_exception.response, - ) - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"HuggingfaceException - {original_exception.message}", - llm_provider="huggingface", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise 
BadRequestError( - message=f"HuggingfaceException - {original_exception.message}", - model=model, - llm_provider="huggingface", - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"HuggingfaceException - {original_exception.message}", - model=model, - llm_provider="huggingface", - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"HuggingfaceException - {original_exception.message}", - llm_provider="huggingface", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"HuggingfaceException - {original_exception.message}", - llm_provider="huggingface", - model=model, - response=original_exception.response, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"HuggingfaceException - {original_exception.message}", - llm_provider="huggingface", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "ai21": - if hasattr(original_exception, "message"): - if "Prompt has too many tokens" in original_exception.message: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"AI21Exception - {original_exception.message}", - model=model, - llm_provider="ai21", - response=original_exception.response, - ) - if "Bad or missing API token." in original_exception.message: - exception_mapping_worked = True - raise BadRequestError( - message=f"AI21Exception - {original_exception.message}", - model=model, - llm_provider="ai21", - response=original_exception.response, - ) - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AI21Exception - {original_exception.message}", - llm_provider="ai21", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"AI21Exception - {original_exception.message}", - model=model, - llm_provider="ai21", - ) - if original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"AI21Exception - {original_exception.message}", - model=model, - llm_provider="ai21", - response=original_exception.response, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AI21Exception - {original_exception.message}", - llm_provider="ai21", - model=model, - response=original_exception.response, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"AI21Exception - {original_exception.message}", - llm_provider="ai21", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "nlp_cloud": - if "detail" in error_str: - if "Input text length should not exceed" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"NLPCloudException - {error_str}", - model=model, - llm_provider="nlp_cloud", - response=original_exception.response, - ) - elif "value is not a valid" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"NLPCloudException - {error_str}", - model=model, - llm_provider="nlp_cloud", - 
response=original_exception.response, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=500, - message=f"NLPCloudException - {error_str}", - model=model, - llm_provider="nlp_cloud", - request=original_exception.request, - ) - if hasattr( - original_exception, "status_code" - ): # https://docs.nlpcloud.com/?shell#errors - if ( - original_exception.status_code == 400 - or original_exception.status_code == 406 - or original_exception.status_code == 413 - or original_exception.status_code == 422 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"NLPCloudException - {original_exception.message}", - llm_provider="nlp_cloud", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 401 - or original_exception.status_code == 403 - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"NLPCloudException - {original_exception.message}", - llm_provider="nlp_cloud", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 522 - or original_exception.status_code == 524 - ): - exception_mapping_worked = True - raise Timeout( - message=f"NLPCloudException - {original_exception.message}", - model=model, - llm_provider="nlp_cloud", - ) - elif ( - original_exception.status_code == 429 - or original_exception.status_code == 402 - ): - exception_mapping_worked = True - raise RateLimitError( - message=f"NLPCloudException - {original_exception.message}", - llm_provider="nlp_cloud", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 500 - or original_exception.status_code == 503 - ): - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"NLPCloudException - {original_exception.message}", - llm_provider="nlp_cloud", - model=model, - request=original_exception.request, - ) - elif ( - original_exception.status_code == 504 - or original_exception.status_code == 520 - ): - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"NLPCloudException - {original_exception.message}", - model=model, - llm_provider="nlp_cloud", - response=original_exception.response, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"NLPCloudException - {original_exception.message}", - llm_provider="nlp_cloud", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "together_ai": - import json - - try: - error_response = json.loads(error_str) - except: - error_response = {"error": error_str} - if ( - "error" in error_response - and "`inputs` tokens + `max_new_tokens` must be <=" - in error_response["error"] - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"TogetherAIException - {error_response['error']}", - model=model, - llm_provider="together_ai", - response=original_exception.response, - ) - elif ( - "error" in error_response - and "invalid private key" in error_response["error"] - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"TogetherAIException - {error_response['error']}", - llm_provider="together_ai", - model=model, - response=original_exception.response, - ) - elif ( - "error" in error_response - and "INVALID_ARGUMENT" in error_response["error"] - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"TogetherAIException - {error_response['error']}", - 
model=model, - llm_provider="together_ai", - response=original_exception.response, - ) - elif "A timeout occurred" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"TogetherAIException - {error_str}", - model=model, - llm_provider="together_ai", - ) - elif ( - "error" in error_response - and "API key doesn't match expected format." - in error_response["error"] - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"TogetherAIException - {error_response['error']}", - model=model, - llm_provider="together_ai", - response=original_exception.response, - ) - elif ( - "error_type" in error_response - and error_response["error_type"] == "validation" - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"TogetherAIException - {error_response['error']}", - model=model, - llm_provider="together_ai", - response=original_exception.response, - ) - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"TogetherAIException - {original_exception.message}", - model=model, - llm_provider="together_ai", - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"TogetherAIException - {error_response['error']}", - model=model, - llm_provider="together_ai", - response=original_exception.response, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"TogetherAIException - {original_exception.message}", - llm_provider="together_ai", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 524: - exception_mapping_worked = True - raise Timeout( - message=f"TogetherAIException - {original_exception.message}", - llm_provider="together_ai", - model=model, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"TogetherAIException - {original_exception.message}", - llm_provider="together_ai", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "aleph_alpha": - if ( - "This is longer than the model's maximum context length" - in error_str - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"AlephAlphaException - {original_exception.message}", - llm_provider="aleph_alpha", - model=model, - response=original_exception.response, - ) - elif "InvalidToken" in error_str or "No token provided" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"AlephAlphaException - {original_exception.message}", - llm_provider="aleph_alpha", - model=model, - response=original_exception.response, - ) - elif hasattr(original_exception, "status_code"): - print_verbose(f"status code: {original_exception.status_code}") - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AlephAlphaException - {original_exception.message}", - llm_provider="aleph_alpha", - model=model, - ) - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"AlephAlphaException - {original_exception.message}", - llm_provider="aleph_alpha", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AlephAlphaException - 
{original_exception.message}", - llm_provider="aleph_alpha", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 500: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"AlephAlphaException - {original_exception.message}", - llm_provider="aleph_alpha", - model=model, - response=original_exception.response, - ) - raise original_exception - raise original_exception - elif ( - custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat" - ): - if isinstance(original_exception, dict): - error_str = original_exception.get("error", "") - else: - error_str = str(original_exception) - if "no such file or directory" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", - model=model, - llm_provider="ollama", - response=original_exception.response, - ) - elif "Failed to establish a new connection" in error_str: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - response=original_exception.response, - ) - elif "Invalid response object from API" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - response=original_exception.response, - ) - elif "Read timed out" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - ) - elif custom_llm_provider == "vllm": - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 0: - exception_mapping_worked = True - raise APIConnectionError( - message=f"VLLMException - {original_exception.message}", - llm_provider="vllm", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "azure" or custom_llm_provider == "azure_text": - message = get_error_message(error_obj=original_exception) - if message is None: - if hasattr(original_exception, "message"): - message = original_exception.message - else: - message = str(original_exception) - - if "Internal server error" in error_str: - exception_mapping_worked = True - raise litellm.InternalServerError( - message=f"AzureException Internal server error - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif "This model's maximum context length is" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"AzureException ContextWindowExceededError - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif "DeploymentNotFound" in error_str: - exception_mapping_worked = True - raise NotFoundError( - message=f"AzureException NotFoundError - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif ( - ( - "invalid_request_error" in error_str - and "content_policy_violation" in error_str - ) - or ( - "The response was filtered due to the prompt triggering Azure OpenAI's content management" - in error_str - ) - or "Your task failed as a result of our safety system" in error_str - or "The model produced invalid content" in error_str - or "content_filter_policy" in error_str - ): - exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"litellm.ContentPolicyViolationError: 
AzureException - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif "invalid_request_error" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"AzureException BadRequestError - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif ( - "The api_key client option must be set either by passing api_key to the client or by setting" - in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"{exception_provider} AuthenticationError - {message}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - ) - elif "Connection error" in error_str: - exception_mapping_worked = True - raise APIConnectionError( - message=f"{exception_provider} APIConnectionError - {message}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - ) - elif hasattr(original_exception, "status_code"): - exception_mapping_worked = True - if original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"AzureException - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AzureException AuthenticationError - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"AzureException Timeout - {message}", - model=model, - litellm_debug_info=extra_information, - llm_provider="azure", - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"AzureException BadRequestError - {message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AzureException RateLimitError - {message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"AzureException ServiceUnavailableError - {message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"AzureException Timeout - {message}", - model=model, - litellm_debug_info=extra_information, - llm_provider="azure", - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"AzureException APIError - {message}", - llm_provider="azure", - litellm_debug_info=extra_information, - model=model, - request=httpx.Request( - method="POST", url="https://openai.com/" - ), - ) - else: - # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors - raise APIConnectionError( - message=f"{exception_provider} APIConnectionError - {message}\n{traceback.format_exc()}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - request=httpx.Request(method="POST", url="https://openai.com/"), - ) - if custom_llm_provider == "openrouter": - if hasattr(original_exception, 
"status_code"): - exception_mapping_worked = True - if original_exception.status_code == 400: - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {error_str}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AuthenticationError: {exception_provider} - {error_str}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"NotFoundError: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"Timeout Error: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"BadRequestError: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"RateLimitError: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"ServiceUnavailableError: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"Timeout Error: {exception_provider} - {error_str}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"APIError: {exception_provider} - {error_str}", - llm_provider=custom_llm_provider, - model=model, - request=original_exception.request, - litellm_debug_info=extra_information, - ) - else: - # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors - raise APIConnectionError( - message=f"APIConnectionError: {exception_provider} - {error_str}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - request=httpx.Request( - method="POST", url="https://api.openai.com/v1/" - ), - ) - if ( - "BadRequestError.__init__() missing 1 required positional argument: 'param'" - in str(original_exception) - ): # deal with edge-case invalid request error bug in openai-python sdk - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} BadRequestError : This can happen due to missing AZURE_API_VERSION: {str(original_exception)}", - model=model, - llm_provider=custom_llm_provider, - 
-                    response=getattr(original_exception, "response", None),
-                )
-            else: # ensure generic errors always return APIConnectionError=
-                """
-                For unmapped exceptions - raise the exception with traceback - https://github.com/BerriAI/litellm/issues/4201
-                """
-                exception_mapping_worked = True
-                if hasattr(original_exception, "request"):
-                    raise APIConnectionError(
-                        message="{} - {}".format(exception_provider, error_str),
-                        llm_provider=custom_llm_provider,
-                        model=model,
-                        request=original_exception.request,
-                    )
-                else:
-                    raise APIConnectionError(
-                        message="{}\n{}".format(
-                            str(original_exception), traceback.format_exc()
-                        ),
-                        llm_provider=custom_llm_provider,
-                        model=model,
-                        request=httpx.Request(
-                            method="POST", url="https://api.openai.com/v1/"
-                        ), # stub the request
-                    )
-    except Exception as e:
-        # LOGGING
-        exception_logging(
-            logger_fn=user_logger_fn,
-            additional_args={
-                "exception_mapping_worked": exception_mapping_worked,
-                "original_exception": original_exception,
-            },
-            exception=e,
-        )
-        ## AUTH ERROR
-        if isinstance(e, AuthenticationError) and (
-            litellm.email or "LITELLM_EMAIL" in os.environ
-        ):
-            threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
-        # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
-        if exception_mapping_worked:
-            setattr(e, "litellm_response_headers", litellm_response_headers)
-            raise e
-        else:
-            for error_type in litellm.LITELLM_EXCEPTION_TYPES:
-                if isinstance(e, error_type):
-                    setattr(e, "litellm_response_headers", litellm_response_headers)
-                    raise e # it's already mapped
-            raised_exc = APIConnectionError(
-                message="{}\n{}".format(original_exception, traceback.format_exc()),
-                llm_provider="",
-                model="",
-            )
-            setattr(raised_exc, "litellm_response_headers", _response_headers)
-            raise raised_exc
-
 ######## Streaming Class ############################
 # wraps the completion stream to return the correct format for the model
 # replicate/anthropic/cohere
@@ -11166,3 +9213,15 @@ def is_base64_encoded(s: str) -> bool:
         return base64.b64encode(decoded_bytes).decode("utf-8") == s
     except Exception:
         return False
+
+
+def has_tool_call_blocks(messages: List[AllMessageValues]) -> bool:
+    """
+    Returns true, if messages has tool call blocks.
+
+    Used for anthropic/bedrock message validation.
+    """
+    for message in messages:
+        if message.get("tool_calls") is not None:
+            return True
+    return False
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 48b2a9322..e698fc5ba 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -1173,6 +1173,18 @@
         "supports_function_calling": true,
         "supports_assistant_prefill": true
     },
+    "mistral/pixtral-12b-2409": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000015,
+        "output_cost_per_token": 0.00000015,
+        "litellm_provider": "mistral",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_assistant_prefill": true,
+        "supports_vision": true
+    },
     "mistral/open-mistral-7b": {
         "max_tokens": 8191,
         "max_input_tokens": 32000,
diff --git a/litellm/tests/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
similarity index 98%
rename from litellm/tests/test_anthropic_completion.py
rename to tests/llm_translation/test_anthropic_completion.py
index b8ccf716e..2d5dd570a 100644
--- a/litellm/tests/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -25,7 +25,12 @@ from unittest.mock import MagicMock, patch
 import pytest
 
 import litellm
-from litellm import AnthropicConfig, Router, adapter_completion
+from litellm import (
+    AnthropicConfig,
+    Router,
+    adapter_completion,
+    AnthropicExperimentalPassThroughConfig,
+)
 from litellm.adapters.anthropic_adapter import anthropic_adapter
 from litellm.types.llms.anthropic import AnthropicResponse
 
@@ -33,7 +38,7 @@ from litellm.types.llms.anthropic import AnthropicResponse
 
 def test_anthropic_completion_messages_translation():
     messages = [{"role": "user", "content": "Hey, how's it going?"}]
-    translated_messages = AnthropicConfig().translate_anthropic_messages_to_openai(messages=messages) # type: ignore
+    translated_messages = AnthropicExperimentalPassThroughConfig().translate_anthropic_messages_to_openai(messages=messages) # type: ignore
 
     assert translated_messages == [{"role": "user", "content": "Hey, how's it going?"}]
diff --git a/tests/llm_translation/test_databricks.py b/tests/llm_translation/test_databricks.py
index b3bd92d8d..d5cd1135c 100644
--- a/tests/llm_translation/test_databricks.py
+++ b/tests/llm_translation/test_databricks.py
@@ -5,7 +5,11 @@ import pytest
 import sys
 from typing import Any, Dict, List
 from unittest.mock import MagicMock, Mock, patch
+import os
 
+sys.path.insert(
+    0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm.exceptions import BadRequestError
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler