Merge branch 'main' into litellm_fix_httpx_transport

Krish Dholakia 2024-07-02 17:17:43 -07:00 committed by GitHub
commit d38f01e956
189 changed files with 8377 additions and 1087 deletions


@ -50,12 +50,15 @@ from tokenizers import Tokenizer
import litellm
import litellm._service_logger # for storing API inputs, outputs, and metadata
import litellm.litellm_core_utils
import litellm.litellm_core_utils.json_validation_rule
from litellm.caching import DualCache
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.exception_mapping_utils import get_error_message
from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_logging,
)
from litellm.litellm_core_utils.token_counter import get_modified_max_tokens
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.utils import (
CallTypes,
@ -580,7 +583,7 @@ def client(original_function):
else:
return False
def post_call_processing(original_response, model):
def post_call_processing(original_response, model, optional_params: Optional[dict]):
try:
if original_response is None:
pass
@ -595,11 +598,47 @@ def client(original_function):
pass
else:
if isinstance(original_response, ModelResponse):
model_response = original_response.choices[
model_response: Optional[str] = original_response.choices[
0
].message.content
### POST-CALL RULES ###
rules_obj.post_call_rules(input=model_response, model=model)
].message.content # type: ignore
if model_response is not None:
### POST-CALL RULES ###
rules_obj.post_call_rules(
input=model_response, model=model
)
### JSON SCHEMA VALIDATION ###
if (
optional_params is not None
and "response_format" in optional_params
and isinstance(
optional_params["response_format"], dict
)
and "type" in optional_params["response_format"]
and optional_params["response_format"]["type"]
== "json_object"
and "response_schema"
in optional_params["response_format"]
and isinstance(
optional_params["response_format"][
"response_schema"
],
dict,
)
and "enforce_validation"
in optional_params["response_format"]
and optional_params["response_format"][
"enforce_validation"
]
is True
):
# schema given, json response expected, and validation enforced
litellm.litellm_core_utils.json_validation_rule.validate_schema(
schema=optional_params["response_format"][
"response_schema"
],
response=model_response,
)
except Exception as e:
raise e
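For context, a minimal sketch (not part of this diff) of the request shape that activates the new schema-validation branch above; the model name and schema are illustrative placeholders:

import litellm

# Illustrative schema; "enforce_validation": True is what opts into the new check.
response_format = {
    "type": "json_object",
    "response_schema": {
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
    "enforce_validation": True,
}

# Placeholder model; it should be one whose provider accepts 'response_schema'
# (see supports_response_schema further down in this diff).
response = litellm.completion(
    model="vertex_ai_beta/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Return a JSON object with a 'name' field."}],
    response_format=response_format,
)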
@ -815,7 +854,7 @@ def client(original_function):
kwargs.get("max_tokens", None) is not None
and model is not None
and litellm.modify_params
== True # user is okay with params being modified
is True # user is okay with params being modified
and (
call_type == CallTypes.acompletion.value
or call_type == CallTypes.completion.value
@ -825,28 +864,19 @@ def client(original_function):
base_model = model
if kwargs.get("hf_model_name", None) is not None:
base_model = f"huggingface/{kwargs.get('hf_model_name')}"
max_output_tokens = (
get_max_tokens(model=base_model) or 4096
) # assume min context window is 4k tokens
user_max_tokens = kwargs.get("max_tokens")
## Scenario 1: User limit + prompt > model limit
messages = None
if len(args) > 1:
messages = args[1]
elif kwargs.get("messages", None):
messages = kwargs["messages"]
input_tokens = token_counter(model=base_model, messages=messages)
input_tokens += max(
0.1 * input_tokens, 10
) # give at least a 10 token buffer. token counting can be imprecise.
if input_tokens > max_output_tokens:
pass # allow call to fail normally
elif user_max_tokens + input_tokens > max_output_tokens:
user_max_tokens = max_output_tokens - input_tokens
print_verbose(f"user_max_tokens: {user_max_tokens}")
kwargs["max_tokens"] = int(
round(user_max_tokens)
) # make sure max tokens is always an int
user_max_tokens = kwargs.get("max_tokens")
modified_max_tokens = get_modified_max_tokens(
model=model,
base_model=base_model,
messages=messages,
user_max_tokens=user_max_tokens,
)
kwargs["max_tokens"] = modified_max_tokens
except Exception as e:
print_verbose(f"Error while checking max token limit: {str(e)}")
# MODEL CALL
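As a rough sketch of what the extracted get_modified_max_tokens helper now takes over from the removed inline code above (names and exact rules here are approximations, not the library's implementation):

from typing import Optional

def clamp_user_max_tokens(
    user_max_tokens: Optional[int],
    input_tokens: int,
    max_output_tokens: int,
) -> Optional[int]:
    # Conceptual sketch of the behavior delegated to get_modified_max_tokens.
    if user_max_tokens is None:
        return None
    # give at least a 10 token buffer; token counting can be imprecise
    buffered_input = input_tokens + max(int(0.1 * input_tokens), 10)
    if buffered_input > max_output_tokens:
        return user_max_tokens  # allow the call to fail normally
    if user_max_tokens + buffered_input > max_output_tokens:
        return max_output_tokens - buffered_input
    return user_max_tokens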
@ -877,7 +907,11 @@ def client(original_function):
return result
### POST-CALL RULES ###
post_call_processing(original_response=result, model=model or None)
post_call_processing(
original_response=result,
model=model or None,
optional_params=kwargs,
)
# [OPTIONAL] ADD TO CACHE
if (
@ -901,6 +935,17 @@ def client(original_function):
model=model,
optional_params=getattr(logging_obj, "optional_params", {}),
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
result._response_ms = (
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
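With the change above, callers can read the pre-computed cost off the response; a hedged usage sketch (the model name is illustrative, and _hidden_params is an internal attribute):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",  # illustrative
    messages=[{"role": "user", "content": "hi"}],
)
# response_cost is attached by the wrapper shown above
print(response._hidden_params.get("response_cost"))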
@ -1294,6 +1339,17 @@ def client(original_function):
model=model,
optional_params=kwargs,
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
if (
isinstance(result, ModelResponse)
or isinstance(result, EmbeddingResponse)
@ -1304,7 +1360,9 @@ def client(original_function):
).total_seconds() * 1000 # return response latency in ms like openai
### POST-CALL RULES ###
post_call_processing(original_response=result, model=model)
post_call_processing(
original_response=result, model=model, optional_params=kwargs
)
# [OPTIONAL] ADD TO CACHE
if (
@ -1835,9 +1893,10 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
Parameters:
model (str): The model name to be checked.
custom_llm_provider (str): The provider to be checked.
Returns:
bool: True if the model supports function calling, False otherwise.
bool: True if the model supports system messages, False otherwise.
Raises:
Exception: If the given model is not found in model_prices_and_context_window.json.
@ -1855,6 +1914,43 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
)
def supports_response_schema(model: str, custom_llm_provider: Optional[str]) -> bool:
"""
Check if the given model + provider supports 'response_schema' as a param.
Parameters:
model (str): The model name to be checked.
custom_llm_provider (str): The provider to be checked.
Returns:
bool: True if the model supports response_schema, False otherwise.
Does not raise an error; defaults to False and logs the failure via verbose_logger.error.
"""
try:
## GET LLM PROVIDER ##
model, custom_llm_provider, _, _ = get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
if custom_llm_provider == "predibase": # predibase supports this globally
return True
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
if model_info.get("supports_response_schema", False) is True:
return True
return False
except Exception:
verbose_logger.error(
f"Model not in model_prices_and_context_window.json. You passed model={model}, custom_llm_provider={custom_llm_provider}."
)
return False
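A hypothetical usage of the new helper; the model/provider pair below is illustrative:

from litellm.utils import supports_response_schema

# Gemini on the vertex_ai_beta route is one example that advertises
# response_schema support; adjust for your deployment.
if supports_response_schema(model="gemini-1.5-pro", custom_llm_provider="vertex_ai_beta"):
    print("response_schema can be passed via response_format")
else:
    print("fall back to plain json_object responses")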
def supports_function_calling(model: str) -> bool:
"""
Check if the given model supports function calling and return a boolean value.
@ -2312,7 +2408,9 @@ def get_optional_params(
elif k == "hf_model_name" and custom_llm_provider != "sagemaker":
continue
elif (
k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
k.startswith("vertex_")
and custom_llm_provider != "vertex_ai"
and custom_llm_provider != "vertex_ai_beta"
): # allow dynamically setting vertex ai init logic
continue
passed_params[k] = v
@ -2415,6 +2513,7 @@ def get_optional_params(
and custom_llm_provider != "together_ai"
and custom_llm_provider != "groq"
and custom_llm_provider != "nvidia_nim"
and custom_llm_provider != "volcengine"
and custom_llm_provider != "deepseek"
and custom_llm_provider != "codestral"
and custom_llm_provider != "mistral"
@ -2743,6 +2842,11 @@ def get_optional_params(
non_default_params=non_default_params,
optional_params=optional_params,
model=model,
drop_params=(
drop_params
if drop_params is not None and isinstance(drop_params, bool)
else False
),
)
elif (
custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models
@ -2811,12 +2915,7 @@ def get_optional_params(
optional_params=optional_params,
)
)
else:
optional_params = litellm.AmazonAnthropicConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
else: # bedrock httpx route
elif model in litellm.BEDROCK_CONVERSE_MODELS:
optional_params = litellm.AmazonConverseConfig().map_openai_params(
model=model,
non_default_params=non_default_params,
@ -2827,6 +2926,11 @@ def get_optional_params(
else False
),
)
else:
optional_params = litellm.AmazonAnthropicConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif "amazon" in model: # amazon titan llms
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
@ -3091,6 +3195,17 @@ def get_optional_params(
optional_params=optional_params,
model=model,
)
elif custom_llm_provider == "volcengine":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.VolcEngineConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
model=model,
)
elif custom_llm_provider == "groq":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
@ -3661,6 +3776,8 @@ def get_supported_openai_params(
return litellm.FireworksAIConfig().get_supported_openai_params()
elif custom_llm_provider == "nvidia_nim":
return litellm.NvidiaNimConfig().get_supported_openai_params()
elif custom_llm_provider == "volcengine":
return litellm.VolcEngineConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "groq":
return [
"temperature",
@ -3672,6 +3789,8 @@ def get_supported_openai_params(
"tool_choice",
"response_format",
"seed",
"extra_headers",
"extra_body",
]
elif custom_llm_provider == "deepseek":
return [
@ -3727,23 +3846,18 @@ def get_supported_openai_params(
return litellm.AzureOpenAIConfig().get_supported_openai_params()
elif custom_llm_provider == "openrouter":
return [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"presence_penalty",
"repetition_penalty",
"seed",
"tools",
"tool_choice",
"max_retries",
"max_tokens",
"logit_bias",
"logprobs",
"top_logprobs",
"response_format",
"stop",
]
elif custom_llm_provider == "mistral" or custom_llm_provider == "codestral":
# mistral and codestral apis have the exact same params
@ -3761,6 +3875,10 @@ def get_supported_openai_params(
"top_p",
"stop",
"seed",
"tools",
"tool_choice",
"functions",
"function_call",
]
elif custom_llm_provider == "huggingface":
return litellm.HuggingfaceConfig().get_supported_openai_params()
@ -4025,6 +4143,10 @@ def get_llm_provider(
# nvidia_nim is openai compatible, we just need to set this to custom_openai and have the api_base be https://integrate.api.nvidia.com/v1
api_base = "https://integrate.api.nvidia.com/v1"
dynamic_api_key = get_secret("NVIDIA_NIM_API_KEY")
elif custom_llm_provider == "volcengine":
# volcengine is openai compatible, we just need to set this to custom_openai and have the api_base be https://ark.cn-beijing.volces.com/api/v3
api_base = "https://ark.cn-beijing.volces.com/api/v3"
dynamic_api_key = get_secret("VOLCENGINE_API_KEY")
elif custom_llm_provider == "codestral":
# codestral is openai compatible, we just need to set this to custom_openai and have the api_base be https://codestral.mistral.ai/v1
api_base = "https://codestral.mistral.ai/v1"
@ -4334,7 +4456,7 @@ def get_utc_datetime():
return datetime.utcnow() # type: ignore
def get_max_tokens(model: str):
def get_max_tokens(model: str) -> Optional[int]:
"""
Get the maximum number of output tokens allowed for a given model.
@ -4388,7 +4510,8 @@ def get_max_tokens(model: str):
return litellm.model_cost[model]["max_tokens"]
else:
raise Exception()
except:
return None
except Exception:
raise Exception(
f"Model {model} isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
)
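Since get_max_tokens can now return None, callers should keep a fallback; this mirrors the pattern removed earlier in this diff (model name illustrative):

import litellm

# Fall back to a conservative default when the model has no mapped limit,
# as the removed inline code did with `or 4096`.
max_output_tokens = litellm.get_max_tokens("gpt-3.5-turbo") or 4096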
@ -4396,8 +4519,7 @@ def get_max_tokens(model: str):
def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
"""
Get a dict for the maximum tokens (context window),
input_cost_per_token, output_cost_per_token for a given model.
Get a dict for the maximum tokens (context window), input_cost_per_token, output_cost_per_token for a given model.
Parameters:
- model (str): The name of the model.
@ -4482,6 +4604,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
mode="chat",
supported_openai_params=supported_openai_params,
supports_system_messages=None,
supports_response_schema=None,
)
else:
"""
@ -4503,36 +4626,6 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
elif model in litellm.model_cost:
_model_info = litellm.model_cost[model]
_model_info["supported_openai_params"] = supported_openai_params
@ -4546,36 +4639,6 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
elif split_model in litellm.model_cost:
_model_info = litellm.model_cost[split_model]
_model_info["supported_openai_params"] = supported_openai_params
@ -4589,40 +4652,48 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
else:
raise ValueError(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
)
## PROVIDER-SPECIFIC INFORMATION
if custom_llm_provider == "predibase":
_model_info["supports_response_schema"] = True
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
)
except Exception:
raise Exception(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
@ -4746,6 +4817,12 @@ def function_to_dict(input_function): # noqa: C901
return result
def modify_url(original_url, new_path):
url = httpx.URL(original_url)
modified_url = url.copy_with(path=new_path)
return str(modified_url)
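A quick, self-contained check of what the new helper does (URLs are illustrative; the function is restated here only so the snippet runs on its own):

import httpx

def modify_url(original_url, new_path):
    url = httpx.URL(original_url)
    return str(url.copy_with(path=new_path))

print(modify_url("https://api.example.com/v1/chat/completions", "/v1/embeddings"))
# prints: https://api.example.com/v1/embeddings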
def load_test_model(
model: str,
custom_llm_provider: str = "",
@ -4975,6 +5052,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
keys_in_environment = True
else:
missing_keys.append("NVIDIA_NIM_API_KEY")
elif custom_llm_provider == "volcengine":
if "VOLCENGINE_API_KEY" in os.environ:
keys_in_environment = True
else:
missing_keys.append("VOLCENGINE_API_KEY")
elif (
custom_llm_provider == "codestral"
or custom_llm_provider == "text-completion-codestral"
@ -5263,6 +5345,27 @@ def convert_to_model_response_object(
hidden_params: Optional[dict] = None,
):
received_args = locals()
### CHECK IF ERROR IN RESPONSE ### - openrouter returns errors inside the response dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
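For reference, an illustrative openrouter-style error payload of the kind the check above converts into a raised exception (values are made up):

response_object = {
    "error": {
        "code": 429,                       # becomes status_code on the raised exception
        "message": "Rate limit exceeded",  # becomes the exception's message
    }
}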
try:
if response_type == "completion" and (
model_response_object is None
@ -5718,7 +5821,10 @@ def exception_type(
print() # noqa
try:
if model:
error_str = str(original_exception)
if hasattr(original_exception, "message"):
error_str = str(original_exception.message)
else:
error_str = str(original_exception)
if isinstance(original_exception, BaseException):
exception_type = type(original_exception).__name__
else:
@ -5740,6 +5846,18 @@ def exception_type(
_model_group = _metadata.get("model_group")
_deployment = _metadata.get("deployment")
extra_information = f"\nModel: {model}"
exception_provider = "Unknown"
if (
isinstance(custom_llm_provider, str)
and len(custom_llm_provider) > 0
):
exception_provider = (
custom_llm_provider[0].upper()
+ custom_llm_provider[1:]
+ "Exception"
)
if _api_base:
extra_information += f"\nAPI Base: `{_api_base}`"
if (
@ -5790,10 +5908,13 @@ def exception_type(
or custom_llm_provider in litellm.openai_compatible_providers
):
# custom_llm_provider is openai, make it OpenAI
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
message = get_error_message(error_obj=original_exception)
if message is None:
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
if message is not None and isinstance(message, str):
message = message.replace("OPENAI", custom_llm_provider.upper())
message = message.replace("openai", custom_llm_provider)
@ -6126,7 +6247,6 @@ def exception_type(
)
elif (
original_exception.status_code == 400
or original_exception.status_code == 422
or original_exception.status_code == 413
):
exception_mapping_worked = True
@ -6136,6 +6256,14 @@ def exception_type(
llm_provider="replicate",
response=original_exception.response,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise UnprocessableEntityError(
message=f"ReplicateException - {original_exception.message}",
model=model,
llm_provider="replicate",
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
@ -7239,10 +7367,17 @@ def exception_type(
request=original_exception.request,
)
elif custom_llm_provider == "azure":
message = get_error_message(error_obj=original_exception)
if message is None:
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
if "Internal server error" in error_str:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"AzureException Internal server error - {original_exception.message}",
message=f"AzureException Internal server error - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7255,7 +7390,7 @@ def exception_type(
elif "This model's maximum context length is" in error_str:
exception_mapping_worked = True
raise ContextWindowExceededError(
message=f"AzureException ContextWindowExceededError - {original_exception.message}",
message=f"AzureException ContextWindowExceededError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7264,7 +7399,7 @@ def exception_type(
elif "DeploymentNotFound" in error_str:
exception_mapping_worked = True
raise NotFoundError(
message=f"AzureException NotFoundError - {original_exception.message}",
message=f"AzureException NotFoundError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7284,7 +7419,7 @@ def exception_type(
):
exception_mapping_worked = True
raise ContentPolicyViolationError(
message=f"litellm.ContentPolicyViolationError: AzureException - {original_exception.message}",
message=f"litellm.ContentPolicyViolationError: AzureException - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7293,7 +7428,7 @@ def exception_type(
elif "invalid_request_error" in error_str:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException BadRequestError - {original_exception.message}",
message=f"AzureException BadRequestError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7305,7 +7440,7 @@ def exception_type(
):
exception_mapping_worked = True
raise AuthenticationError(
message=f"{exception_provider} AuthenticationError - {original_exception.message}",
message=f"{exception_provider} AuthenticationError - {message}",
llm_provider=custom_llm_provider,
model=model,
litellm_debug_info=extra_information,
@ -7316,7 +7451,7 @@ def exception_type(
if original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException - {original_exception.message}",
message=f"AzureException - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7325,7 +7460,7 @@ def exception_type(
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"AzureException AuthenticationError - {original_exception.message}",
message=f"AzureException AuthenticationError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7334,7 +7469,7 @@ def exception_type(
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"AzureException Timeout - {original_exception.message}",
message=f"AzureException Timeout - {message}",
model=model,
litellm_debug_info=extra_information,
llm_provider="azure",
@ -7342,7 +7477,7 @@ def exception_type(
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException BadRequestError - {original_exception.message}",
message=f"AzureException BadRequestError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7351,7 +7486,7 @@ def exception_type(
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"AzureException RateLimitError - {original_exception.message}",
message=f"AzureException RateLimitError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7360,7 +7495,7 @@ def exception_type(
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"AzureException ServiceUnavailableError - {original_exception.message}",
message=f"AzureException ServiceUnavailableError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7369,7 +7504,7 @@ def exception_type(
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"AzureException Timeout - {original_exception.message}",
message=f"AzureException Timeout - {message}",
model=model,
litellm_debug_info=extra_information,
llm_provider="azure",
@ -7378,7 +7513,7 @@ def exception_type(
exception_mapping_worked = True
raise APIError(
status_code=original_exception.status_code,
message=f"AzureException APIError - {original_exception.message}",
message=f"AzureException APIError - {message}",
llm_provider="azure",
litellm_debug_info=extra_information,
model=model,
@ -7810,6 +7945,7 @@ class CustomStreamWrapper:
"<s>",
"</s>",
"<|im_end|>",
"<|im_start|>",
]
self.holding_chunk = ""
self.complete_response = ""