diff --git a/litellm/tests/log.txt b/litellm/tests/log.txt index 79aef9819..aa8ad3c36 100644 --- a/litellm/tests/log.txt +++ b/litellm/tests/log.txt @@ -1,6 +1,5314 @@ - -chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage()) -extracted chunk: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the -chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage()) -extracted chunk: -completion_response: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the +============================= test session starts ============================== +platform darwin -- Python 3.11.6, pytest-7.3.1, pluggy-1.3.0 +rootdir: /Users/krrishdholakia/Documents/litellm/litellm/tests +plugins: timeout-2.2.0, asyncio-0.23.2, anyio-3.7.1, xdist-3.3.1 +asyncio: mode=Mode.STRICT +collected 1 item + +test_completion.py F [100%] + +=================================== FAILURES =================================== +______________________ test_replicate_custom_prompt_dict _______________________ + +model = 'meta/llama-2-7b-chat' +messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}] +timeout = 600.0, temperature = None, top_p = None, n = None, stream = None +stop = None, max_tokens = None, presence_penalty = None +frequency_penalty = None, logit_bias = None, user = None, response_format = None +seed = None, tools = None, tool_choice = None, logprobs = None +top_logprobs = None, deployment_id = None, extra_headers = None +functions = None, function_call = None, base_url = None, api_version = None +api_key = None, model_list = None +kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': , 'num_retries': 3, 'repetition_penalty': 0.1} +args = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...} +api_base = 'https://api.replicate.com/v1', mock_response = None +force_timeout = 600, logger_fn = None, verbose = False +custom_llm_provider = 'replicate' + + @client + def completion( + model: str, + # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create + messages: List = [], + timeout: Optional[Union[float, int]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + n: Optional[int] = None, + stream: Optional[bool] = None, + stop=None, + max_tokens: Optional[int] = None, + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, + logit_bias: Optional[dict] = None, + user: Optional[str] = None, + # openai v1.0+ new params + response_format: Optional[dict] 
= None, + seed: Optional[int] = None, + tools: Optional[List] = None, + tool_choice: Optional[str] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + deployment_id=None, + extra_headers: Optional[dict] = None, + # soon to be deprecated params by OpenAI + functions: Optional[List] = None, + function_call: Optional[str] = None, + # set api_base, api_version, api_key + base_url: Optional[str] = None, + api_version: Optional[str] = None, + api_key: Optional[str] = None, + model_list: Optional[list] = None, # pass in a list of api_base,keys, etc. + # Optional liteLLM function params + **kwargs, + ) -> Union[ModelResponse, CustomStreamWrapper]: + """ + Perform a completion() using any of litellm supported llms (example gpt-4, gpt-3.5-turbo, claude-2, command-nightly) + Parameters: + model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/ + messages (List): A list of message objects representing the conversation context (default is an empty list). + + OPTIONAL PARAMS + functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list). + function_call (str, optional): The name of the function to call within the conversation (default is an empty string). + temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0). + top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). + n (int, optional): The number of completions to generate (default is 1). + stream (bool, optional): If True, return a streaming response (default is False). + stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. + max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). + presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. + frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. + logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. + user (str, optional): A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. + logprobs (bool, optional): Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message + top_logprobs (int, optional): An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used. + metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. + api_base (str, optional): Base URL for the API (default is None). + api_version (str, optional): API version (default is None). + api_key (str, optional): API key (default is None). + model_list (list, optional): List of api base, version, keys + extra_headers (dict, optional): Additional headers to include in the request. + + LITELLM Specific Params + mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None). 
+ custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock" + max_retries (int, optional): The number of retries to attempt (default is 0). + Returns: + ModelResponse: A response object containing the generated completion and associated metadata. + + Note: + - This function is used to perform completions() using the specified language model. + - It supports various optional parameters for customizing the completion behavior. + - If 'mock_response' is provided, a mock completion response is returned for testing or debugging. + """ + ######### unpacking kwargs ##################### + args = locals() + api_base = kwargs.get("api_base", None) + mock_response = kwargs.get("mock_response", None) + force_timeout = kwargs.get("force_timeout", 600) ## deprecated + logger_fn = kwargs.get("logger_fn", None) + verbose = kwargs.get("verbose", False) + custom_llm_provider = kwargs.get("custom_llm_provider", None) + litellm_logging_obj = kwargs.get("litellm_logging_obj", None) + id = kwargs.get("id", None) + metadata = kwargs.get("metadata", None) + model_info = kwargs.get("model_info", None) + proxy_server_request = kwargs.get("proxy_server_request", None) + fallbacks = kwargs.get("fallbacks", None) + headers = kwargs.get("headers", None) + num_retries = kwargs.get("num_retries", None) ## deprecated + max_retries = kwargs.get("max_retries", None) + context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None) + organization = kwargs.get("organization", None) + ### CUSTOM MODEL COST ### + input_cost_per_token = kwargs.get("input_cost_per_token", None) + output_cost_per_token = kwargs.get("output_cost_per_token", None) + input_cost_per_second = kwargs.get("input_cost_per_second", None) + output_cost_per_second = kwargs.get("output_cost_per_second", None) + ### CUSTOM PROMPT TEMPLATE ### + initial_prompt_value = kwargs.get("initial_prompt_value", None) + roles = kwargs.get("roles", None) + final_prompt_value = kwargs.get("final_prompt_value", None) + bos_token = kwargs.get("bos_token", None) + eos_token = kwargs.get("eos_token", None) + preset_cache_key = kwargs.get("preset_cache_key", None) + hf_model_name = kwargs.get("hf_model_name", None) + ### TEXT COMPLETION CALLS ### + text_completion = kwargs.get("text_completion", False) + atext_completion = kwargs.get("atext_completion", False) + ### ASYNC CALLS ### + acompletion = kwargs.get("acompletion", False) + client = kwargs.get("client", None) + ### Admin Controls ### + no_log = kwargs.get("no-log", False) + ######## end of unpacking kwargs ########### + openai_params = [ + "functions", + "function_call", + "temperature", + "temperature", + "top_p", + "n", + "stream", + "stop", + "max_tokens", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + "request_timeout", + "api_base", + "api_version", + "api_key", + "deployment_id", + "organization", + "base_url", + "default_headers", + "timeout", + "response_format", + "seed", + "tools", + "tool_choice", + "max_retries", + "logprobs", + "top_logprobs", + "extra_headers", + ] + litellm_params = [ + "metadata", + "acompletion", + "atext_completion", + "text_completion", + "caching", + "mock_response", + "api_key", + "api_version", + "api_base", + "force_timeout", + "logger_fn", + "verbose", + "custom_llm_provider", + "litellm_logging_obj", + "litellm_call_id", + "use_client", + "id", + "fallbacks", + "azure", + "headers", + "model_list", + "num_retries", + 
"context_window_fallback_dict", + "roles", + "final_prompt_value", + "bos_token", + "eos_token", + "request_timeout", + "complete_response", + "self", + "client", + "rpm", + "tpm", + "input_cost_per_token", + "output_cost_per_token", + "input_cost_per_second", + "output_cost_per_second", + "hf_model_name", + "model_info", + "proxy_server_request", + "preset_cache_key", + "caching_groups", + "ttl", + "cache", + "no-log", + "base_model", + "stream_timeout", + ] + default_params = openai_params + litellm_params + non_default_params = { + k: v for k, v in kwargs.items() if k not in default_params + } # model-specific params - pass them straight to the model/provider + if timeout is None: + timeout = ( + kwargs.get("request_timeout", None) or 600 + ) # set timeout for 10 minutes by default + timeout = float(timeout) + try: + if base_url is not None: + api_base = base_url + if max_retries is not None: # openai allows openai.OpenAI(max_retries=3) + num_retries = max_retries + logging = litellm_logging_obj + fallbacks = fallbacks or litellm.model_fallbacks + if fallbacks is not None: + return completion_with_fallbacks(**args) + if model_list is not None: + deployments = [ + m["litellm_params"] for m in model_list if m["model_name"] == model + ] + return batch_completion_models(deployments=deployments, **args) + if litellm.model_alias_map and model in litellm.model_alias_map: + model = litellm.model_alias_map[ + model + ] # update the model to the actual value if an alias has been passed in + model_response = ModelResponse() + if ( + kwargs.get("azure", False) == True + ): # don't remove flag check, to remain backwards compatible for repos like Codium + custom_llm_provider = "azure" + if deployment_id != None: # azure llms + model = deployment_id + custom_llm_provider = "azure" + model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider( + model=model, + custom_llm_provider=custom_llm_provider, + api_base=api_base, + api_key=api_key, + ) + if model_response is not None and hasattr(model_response, "_hidden_params"): + model_response._hidden_params["custom_llm_provider"] = custom_llm_provider + model_response._hidden_params["region_name"] = kwargs.get( + "aws_region_name", None + ) # support region-based pricing for bedrock + + ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### + if input_cost_per_token is not None and output_cost_per_token is not None: + print_verbose(f"Registering model={model} in model cost map") + litellm.register_model( + { + f"{custom_llm_provider}/{model}": { + "input_cost_per_token": input_cost_per_token, + "output_cost_per_token": output_cost_per_token, + "litellm_provider": custom_llm_provider, + }, + model: { + "input_cost_per_token": input_cost_per_token, + "output_cost_per_token": output_cost_per_token, + "litellm_provider": custom_llm_provider, + }, + } + ) + elif ( + input_cost_per_second is not None + ): # time based pricing just needs cost in place + output_cost_per_second = output_cost_per_second + litellm.register_model( + { + f"{custom_llm_provider}/{model}": { + "input_cost_per_second": input_cost_per_second, + "output_cost_per_second": output_cost_per_second, + "litellm_provider": custom_llm_provider, + }, + model: { + "input_cost_per_second": input_cost_per_second, + "output_cost_per_second": output_cost_per_second, + "litellm_provider": custom_llm_provider, + }, + } + ) + ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ### + custom_prompt_dict = {} # type: ignore + if ( + initial_prompt_value + or roles + or final_prompt_value + or bos_token + or 
eos_token + ): + custom_prompt_dict = {model: {}} + if initial_prompt_value: + custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value + if roles: + custom_prompt_dict[model]["roles"] = roles + if final_prompt_value: + custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value + if bos_token: + custom_prompt_dict[model]["bos_token"] = bos_token + if eos_token: + custom_prompt_dict[model]["eos_token"] = eos_token + model_api_key = get_api_key( + llm_provider=custom_llm_provider, dynamic_api_key=api_key + ) # get the api key from the environment if required for the model + + if dynamic_api_key is not None: + api_key = dynamic_api_key + # check if user passed in any of the OpenAI optional params + optional_params = get_optional_params( + functions=functions, + function_call=function_call, + temperature=temperature, + top_p=top_p, + n=n, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + user=user, + # params to identify the model + model=model, + custom_llm_provider=custom_llm_provider, + response_format=response_format, + seed=seed, + tools=tools, + tool_choice=tool_choice, + max_retries=max_retries, + logprobs=logprobs, + top_logprobs=top_logprobs, + extra_headers=extra_headers, + **non_default_params, + ) + + if litellm.add_function_to_prompt and optional_params.get( + "functions_unsupported_model", None + ): # if user opts to add it to prompt, when API doesn't support function calling + functions_unsupported_model = optional_params.pop( + "functions_unsupported_model" + ) + messages = function_call_prompt( + messages=messages, functions=functions_unsupported_model + ) + + # For logging - save the values of the litellm-specific params passed in + litellm_params = get_litellm_params( + acompletion=acompletion, + api_key=api_key, + force_timeout=force_timeout, + logger_fn=logger_fn, + verbose=verbose, + custom_llm_provider=custom_llm_provider, + api_base=api_base, + litellm_call_id=kwargs.get("litellm_call_id", None), + model_alias_map=litellm.model_alias_map, + completion_call_id=id, + metadata=metadata, + model_info=model_info, + proxy_server_request=proxy_server_request, + preset_cache_key=preset_cache_key, + no_log=no_log, + ) + logging.update_environment_variables( + model=model, + user=user, + optional_params=optional_params, + litellm_params=litellm_params, + ) + if mock_response: + return mock_completion( + model, + messages, + stream=stream, + mock_response=mock_response, + logging=logging, + acompletion=acompletion, + ) + if custom_llm_provider == "azure": + # azure configs + api_type = get_secret("AZURE_API_TYPE") or "azure" + + api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") + + api_version = ( + api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + ) + + api_key = ( + api_key + or litellm.api_key + or litellm.azure_key + or get_secret("AZURE_OPENAI_API_KEY") + or get_secret("AZURE_API_KEY") + ) + + azure_ad_token = optional_params.get("extra_body", {}).pop( + "azure_ad_token", None + ) or get_secret("AZURE_AD_TOKEN") + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.AzureOpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + response = azure_chat_completions.completion( + model=model, + 
messages=messages, + headers=headers, + api_key=api_key, + api_base=api_base, + api_version=api_version, + api_type=api_type, + azure_ad_token=azure_ad_token, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + logging_obj=logging, + acompletion=acompletion, + timeout=timeout, + client=client, # pass AsyncAzureOpenAI, AzureOpenAI client + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={ + "headers": headers, + "api_version": api_version, + "api_base": api_base, + }, + ) + elif custom_llm_provider == "azure_text": + # azure configs + api_type = get_secret("AZURE_API_TYPE") or "azure" + + api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") + + api_version = ( + api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + ) + + api_key = ( + api_key + or litellm.api_key + or litellm.azure_key + or get_secret("AZURE_OPENAI_API_KEY") + or get_secret("AZURE_API_KEY") + ) + + azure_ad_token = optional_params.get("extra_body", {}).pop( + "azure_ad_token", None + ) or get_secret("AZURE_AD_TOKEN") + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.AzureOpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + response = azure_text_completions.completion( + model=model, + messages=messages, + headers=headers, + api_key=api_key, + api_base=api_base, + api_version=api_version, + api_type=api_type, + azure_ad_token=azure_ad_token, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + logging_obj=logging, + acompletion=acompletion, + timeout=timeout, + client=client, # pass AsyncAzureOpenAI, AzureOpenAI client + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={ + "headers": headers, + "api_version": api_version, + "api_base": api_base, + }, + ) + elif ( + model in litellm.open_ai_chat_completion_models + or custom_llm_provider == "custom_openai" + or custom_llm_provider == "deepinfra" + or custom_llm_provider == "perplexity" + or custom_llm_provider == "groq" + or custom_llm_provider == "anyscale" + or custom_llm_provider == "mistral" + or custom_llm_provider == "openai" + or custom_llm_provider == "together_ai" + or custom_llm_provider in litellm.openai_compatible_providers + or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo + ): # allow user to make an openai call with a custom base + # note: if a user sets a custom base - we should ensure this works + # allow for the setting of dynamic and stateful api-bases + api_base = ( + api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there + or litellm.api_base + or get_secret("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + openai.organization = ( + organization + or litellm.organization + or get_secret("OPENAI_ORGANIZATION") + or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105 + ) + # 
set API KEY + api_key = ( + api_key + or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there + or litellm.openai_key + or get_secret("OPENAI_API_KEY") + ) + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.OpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + try: + response = openai_chat_completions.completion( + model=model, + messages=messages, + headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + organization=organization, + custom_llm_provider=custom_llm_provider, + ) + except Exception as e: + ## LOGGING - log the original exception returned + logging.post_call( + input=messages, + api_key=api_key, + original_response=str(e), + additional_args={"headers": headers}, + ) + raise e + + if optional_params.get("stream", False): + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={"headers": headers}, + ) + elif ( + custom_llm_provider == "text-completion-openai" + or "ft:babbage-002" in model + or "ft:davinci-002" in model # support for finetuned completion models + ): + openai.api_type = "openai" + + api_base = ( + api_base + or litellm.api_base + or get_secret("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + + openai.api_version = None + # set API KEY + + api_key = ( + api_key + or litellm.api_key + or litellm.openai_key + or get_secret("OPENAI_API_KEY") + ) + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.OpenAITextCompletionConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + if litellm.organization: + openai.organization = litellm.organization + + if ( + len(messages) > 0 + and "content" in messages[0] + and type(messages[0]["content"]) == list + ): + # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content'] + # https://platform.openai.com/docs/api-reference/completions/create + prompt = messages[0]["content"] + else: + prompt = " ".join([message["content"] for message in messages]) # type: ignore + + ## COMPLETION CALL + _response = openai_text_completions.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + client=client, # pass AsyncOpenAI, OpenAI client + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, + ) + + if ( + optional_params.get("stream", False) == False + and acompletion == False + and text_completion == False + ): + # convert to chat completion response + _response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object( + response_object=_response, model_response_object=model_response + ) + + if 
optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=_response, + additional_args={"headers": headers}, + ) + response = _response + elif ( + "replicate" in model + or custom_llm_provider == "replicate" + or model in litellm.replicate_models + ): + # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") + replicate_key = None + replicate_key = ( + api_key + or litellm.replicate_key + or litellm.api_key + or get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("REPLICATE_API_BASE") + or "https://api.replicate.com/v1" + ) + + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + +> model_response = replicate.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, # for calculating input/output tokens + api_key=replicate_key, + logging_obj=logging, + custom_prompt_dict=custom_prompt_dict, + ) + +../main.py:1123: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +model = 'meta/llama-2-7b-chat' +messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}] +api_base = 'https://api.replicate.com/v1' +model_response = ModelResponse(id='chatcmpl-64c87434-83ce-4436-ac82-065d03e85dbd', choices=[Choices(finish_reason='stop', index=0, mess... role='assistant'))], created=1712723703, model=None, object='chat.completion', system_fingerprint=None, usage=Usage()) +print_verbose = +logging_obj = +api_key = 'r8_KkH9pMk1MOj0GTBijCFEGx5RpcDWd6K2jGKQK' +encoding = +custom_prompt_dict = {'meta/llama-2-7b-chat': {'final_prompt_value': 'Now answer as best you can:', 'initial_prompt_value': 'You are a good...S>>\n [/INST]\n', 'pre_message': '[INST] <>\n'}, 'user': {'post_message': ' [/INST]', 'pre_message': '[INST] '}}}} +optional_params = {'repetition_penalty': 0.1} +litellm_params = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1/models/meta/llama-2-7b-chat', 'api_key': None, 'completion_call_id': None, ...} +logger_fn = None + + def completion( + model: str, + messages: list, + api_base: str, + model_response: ModelResponse, + print_verbose: Callable, + logging_obj, + api_key, + encoding, + custom_prompt_dict={}, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): + # Start a prediction and get the prediction URL + version_id = model_to_version_id(model) + ## Load Config + config = litellm.ReplicateConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > replicate_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + system_prompt = None + if optional_params is not None and "supports_system_prompt" in optional_params: + supports_sys_prompt = optional_params.pop("supports_system_prompt") + else: + supports_sys_prompt = False + + if supports_sys_prompt: + for i in range(len(messages)): + if messages[i]["role"] == "system": + first_sys_message = messages.pop(i) + system_prompt = first_sys_message["content"] + break + + if model in custom_prompt_dict: + # check if the model has a registered custom prompt + model_prompt_details = custom_prompt_dict[model] + prompt = custom_prompt( + 
role_dict=model_prompt_details.get("roles", {}), + initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""), + final_prompt_value=model_prompt_details.get("final_prompt_value", ""), + bos_token=model_prompt_details.get("bos_token", ""), + eos_token=model_prompt_details.get("eos_token", ""), + messages=messages, + ) + else: + prompt = prompt_factory(model=model, messages=messages) + + # If system prompt is supported, and a system prompt is provided, use it + if system_prompt is not None: + input_data = { + "prompt": prompt, + "system_prompt": system_prompt, + **optional_params, + } + # Otherwise, use the prompt as is + else: + input_data = {"prompt": prompt, **optional_params} + + ## COMPLETION CALL + ## Replicate Compeltion calls have 2 steps + ## Step1: Start Prediction: gets a prediction url + ## Step2: Poll prediction url for response + ## Step2: is handled with and without streaming + model_response["created"] = int( + time.time() + ) # for pricing this must remain right before calling api + prediction_url = start_prediction( + version_id, + input_data, + api_key, + api_base, + logging_obj=logging_obj, + print_verbose=print_verbose, + ) + print_verbose(prediction_url) + + # Handle the prediction response (streaming or non-streaming) + if "stream" in optional_params and optional_params["stream"] == True: + print_verbose("streaming request") + return handle_prediction_response_streaming( + prediction_url, api_key, print_verbose + ) + else: +> result, logs = handle_prediction_response( + prediction_url, api_key, print_verbose + ) + +../llms/replicate.py:307: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +prediction_url = 'https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg' +api_token = 'r8_KkH9pMk1MOj0GTBijCFEGx5RpcDWd6K2jGKQK' +print_verbose = + + def handle_prediction_response(prediction_url, api_token, print_verbose): + output_string = "" + headers = { + "Authorization": f"Token {api_token}", + "Content-Type": "application/json", + } + + status = "" + logs = "" + while True and (status not in ["succeeded", "failed", "canceled"]): + print_verbose(f"replicate: polling endpoint: {prediction_url}") + time.sleep(0.5) + response = requests.get(prediction_url, headers=headers) + if response.status_code == 200: + response_data = response.json() + if "output" in response_data: + output_string = "".join(response_data["output"]) + print_verbose(f"Non-streamed output:{output_string}") + status = response_data.get("status", None) + logs = response_data.get("logs", "") + if status == "failed": + replicate_error = response_data.get("error", "") +> raise ReplicateError( + status_code=400, + message=f"Error: {replicate_error}, \nReplicate logs:{logs}", + ) +E litellm.llms.replicate.ReplicateError: Error: Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive 
number!, +E Replicate logs:MLC is currently not using any LoRAs. +E MLC: True +E Your formatted prompt is: +E [INST] <<SYS>> +E You are a helpful, respectful and honest assistant. +E <</SYS>> +E You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST] +E Not using LoRA +E Traceback (most recent call last): +E File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict +E for r in result: +E File "/src/predict.py", line 198, in predict +E for decoded_token in self.engine( +E File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__ +E for val in gen: +E File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__ +E self.cm.reset_chat(chat_config) +E File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat +E self._load_json_override_func(user_chat_config_json_str, True) +E File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__ +E File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall +E File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3 +E File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL +E File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error +E raise py_err +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool) +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E tvm._ffi.base.TVMError: Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number! + +../llms/replicate.py:165: ReplicateError + +During handling of the above exception, another exception occurred: + +args = () +kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': +call_type = 'completion', model = 'replicate/meta/llama-2-7b-chat' +k = 'litellm_logging_obj' + + @wraps(original_function) + def wrapper(*args, **kwargs): + # DO NOT MOVE THIS. It always needs to run first + # Check if this is an async function. 
If so only execute the async function + if ( + kwargs.get("acompletion", False) == True + or kwargs.get("aembedding", False) == True + or kwargs.get("aimg_generation", False) == True + or kwargs.get("amoderation", False) == True + or kwargs.get("atext_completion", False) == True + or kwargs.get("atranscription", False) == True + ): + # [OPTIONAL] CHECK MAX RETRIES / REQUEST + if litellm.num_retries_per_request is not None: + # check if previous_models passed in as ['litellm_params']['metadata]['previous_models'] + previous_models = kwargs.get("metadata", {}).get( + "previous_models", None + ) + if previous_models is not None: + if litellm.num_retries_per_request <= len(previous_models): + raise Exception(f"Max retries per request hit!") + + # MODEL CALL + result = original_function(*args, **kwargs) + if "stream" in kwargs and kwargs["stream"] == True: + if ( + "complete_response" in kwargs + and kwargs["complete_response"] == True + ): + chunks = [] + for idx, chunk in enumerate(result): + chunks.append(chunk) + return litellm.stream_chunk_builder( + chunks, messages=kwargs.get("messages", None) + ) + else: + return result + return result + + # Prints Exactly what was passed to litellm function - don't execute any logic here - it should just print + print_args_passed_to_litellm(original_function, args, kwargs) + start_time = datetime.datetime.now() + result = None + logging_obj = kwargs.get("litellm_logging_obj", None) + + # only set litellm_call_id if its not in kwargs + call_type = original_function.__name__ + if "litellm_call_id" not in kwargs: + kwargs["litellm_call_id"] = str(uuid.uuid4()) + try: + model = args[0] if len(args) > 0 else kwargs["model"] + except: + model = None + if ( + call_type != CallTypes.image_generation.value + and call_type != CallTypes.text_completion.value + ): + raise ValueError("model param not passed in.") + + try: + if logging_obj is None: + logging_obj, kwargs = function_setup(start_time, *args, **kwargs) + kwargs["litellm_logging_obj"] = logging_obj + + # CHECK FOR 'os.environ/' in kwargs + for k, v in kwargs.items(): + if v is not None and isinstance(v, str) and v.startswith("os.environ/"): + kwargs[k] = litellm.get_secret(v) + # [OPTIONAL] CHECK BUDGET + if litellm.max_budget: + if litellm._current_cost > litellm.max_budget: + raise BudgetExceededError( + current_cost=litellm._current_cost, + max_budget=litellm.max_budget, + ) + + # [OPTIONAL] CHECK MAX RETRIES / REQUEST + if litellm.num_retries_per_request is not None: + # check if previous_models passed in as ['litellm_params']['metadata]['previous_models'] + previous_models = kwargs.get("metadata", {}).get( + "previous_models", None + ) + if previous_models is not None: + if litellm.num_retries_per_request <= len(previous_models): + raise Exception(f"Max retries per request hit!") + + # [OPTIONAL] CHECK CACHE + print_verbose( + f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}" + ) + # if caching is false or cache["no-cache"]==True, don't run this + if ( + ( + ( + kwargs.get("caching", None) is None + and kwargs.get("cache", None) is None + and litellm.cache is not None + ) + or kwargs.get("caching", False) == True + or ( + kwargs.get("cache", None) is not None + and kwargs.get("cache", {}).get("no-cache", False) != True + ) + ) + and kwargs.get("aembedding", False) != True + and kwargs.get("acompletion", False) != True + and kwargs.get("aimg_generation", False) != True + and kwargs.get("atranscription", False) != True + ): # allow users to control returning cached 
responses from the completion function + # checking cache + print_verbose(f"INSIDE CHECKING CACHE") + if ( + litellm.cache is not None + and str(original_function.__name__) + in litellm.cache.supported_call_types + ): + print_verbose(f"Checking Cache") + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs["preset_cache_key"] = ( + preset_cache_key # for streaming calls, we need to pass the preset_cache_key + ) + cached_result = litellm.cache.get_cache(*args, **kwargs) + if cached_result != None: + if "detail" in cached_result: + # implies an error occurred + pass + else: + call_type = original_function.__name__ + print_verbose( + f"Cache Response Object routing: call_type - {call_type}; cached_result instace: {type(cached_result)}" + ) + if call_type == CallTypes.completion.value and isinstance( + cached_result, dict + ): + cached_result = convert_to_model_response_object( + response_object=cached_result, + model_response_object=ModelResponse(), + stream=kwargs.get("stream", False), + ) + if kwargs.get("stream", False) == True: + cached_result = CustomStreamWrapper( + completion_stream=cached_result, + model=model, + custom_llm_provider="cached_response", + logging_obj=logging_obj, + ) + elif call_type == CallTypes.embedding.value and isinstance( + cached_result, dict + ): + cached_result = convert_to_model_response_object( + response_object=cached_result, + response_type="embedding", + ) + + # LOG SUCCESS + cache_hit = True + end_time = datetime.datetime.now() + ( + model, + custom_llm_provider, + dynamic_api_key, + api_base, + ) = litellm.get_llm_provider( + model=model, + custom_llm_provider=kwargs.get( + "custom_llm_provider", None + ), + api_base=kwargs.get("api_base", None), + api_key=kwargs.get("api_key", None), + ) + print_verbose( + f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}" + ) + logging_obj.update_environment_variables( + model=model, + user=kwargs.get("user", None), + optional_params={}, + litellm_params={ + "logger_fn": kwargs.get("logger_fn", None), + "acompletion": False, + "metadata": kwargs.get("metadata", {}), + "model_info": kwargs.get("model_info", {}), + "proxy_server_request": kwargs.get( + "proxy_server_request", None + ), + "preset_cache_key": kwargs.get( + "preset_cache_key", None + ), + "stream_response": kwargs.get( + "stream_response", {} + ), + }, + input=kwargs.get("messages", ""), + api_key=kwargs.get("api_key", None), + original_response=str(cached_result), + additional_args=None, + stream=kwargs.get("stream", False), + ) + threading.Thread( + target=logging_obj.success_handler, + args=(cached_result, start_time, end_time, cache_hit), + ).start() + return cached_result + + # CHECK MAX TOKENS + if ( + kwargs.get("max_tokens", None) is not None + and model is not None + and litellm.modify_params + == True # user is okay with params being modified + and ( + call_type == CallTypes.acompletion.value + or call_type == CallTypes.completion.value + ) + ): + try: + base_model = model + if kwargs.get("hf_model_name", None) is not None: + base_model = f"huggingface/{kwargs.get('hf_model_name')}" + max_output_tokens = ( + get_max_tokens(model=base_model) or 4096 + ) # assume min context window is 4k tokens + user_max_tokens = kwargs.get("max_tokens") + ## Scenario 1: User limit + prompt > model limit + messages = None + if len(args) > 1: + messages = args[1] + elif kwargs.get("messages", None): + messages = kwargs["messages"] + input_tokens = token_counter(model=base_model, messages=messages) 
+ input_tokens += max( + 0.1 * input_tokens, 10 + ) # give at least a 10 token buffer. token counting can be imprecise. + if input_tokens > max_output_tokens: + pass # allow call to fail normally + elif user_max_tokens + input_tokens > max_output_tokens: + user_max_tokens = max_output_tokens - input_tokens + print_verbose(f"user_max_tokens: {user_max_tokens}") + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int + except Exception as e: + print_verbose(f"Error while checking max token limit: {str(e)}") + # MODEL CALL +> result = original_function(*args, **kwargs) + +../utils.py:2846: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +model = 'meta/llama-2-7b-chat' +messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}] +timeout = 600.0, temperature = None, top_p = None, n = None, stream = None +stop = None, max_tokens = None, presence_penalty = None +frequency_penalty = None, logit_bias = None, user = None, response_format = None +seed = None, tools = None, tool_choice = None, logprobs = None +top_logprobs = None, deployment_id = None, extra_headers = None +functions = None, function_call = None, base_url = None, api_version = None +api_key = None, model_list = None +kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': , 'num_retries': 3, 'repetition_penalty': 0.1} +args = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...} +api_base = 'https://api.replicate.com/v1', mock_response = None +force_timeout = 600, logger_fn = None, verbose = False +custom_llm_provider = 'replicate' + + @client + def completion( + model: str, + # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create + messages: List = [], + timeout: Optional[Union[float, int]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + n: Optional[int] = None, + stream: Optional[bool] = None, + stop=None, + max_tokens: Optional[int] = None, + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, + logit_bias: Optional[dict] = None, + user: Optional[str] = None, + # openai v1.0+ new params + response_format: Optional[dict] = None, + seed: Optional[int] = None, + tools: Optional[List] = None, + tool_choice: Optional[str] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + deployment_id=None, + extra_headers: Optional[dict] = None, + # soon to be deprecated params by OpenAI + functions: Optional[List] = None, + function_call: Optional[str] = None, + # set api_base, api_version, api_key + base_url: Optional[str] = None, + api_version: Optional[str] = None, + api_key: Optional[str] = None, + model_list: Optional[list] = None, # pass in a list of api_base,keys, etc. + # Optional liteLLM function params + **kwargs, + ) -> Union[ModelResponse, CustomStreamWrapper]: + """ + Perform a completion() using any of litellm supported llms (example gpt-4, gpt-3.5-turbo, claude-2, command-nightly) + Parameters: + model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/ + messages (List): A list of message objects representing the conversation context (default is an empty list). + + OPTIONAL PARAMS + functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list). 
+ function_call (str, optional): The name of the function to call within the conversation (default is an empty string). + temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0). + top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). + n (int, optional): The number of completions to generate (default is 1). + stream (bool, optional): If True, return a streaming response (default is False). + stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. + max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). + presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. + frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. + logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. + user (str, optional): A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. + logprobs (bool, optional): Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message + top_logprobs (int, optional): An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used. + metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. + api_base (str, optional): Base URL for the API (default is None). + api_version (str, optional): API version (default is None). + api_key (str, optional): API key (default is None). + model_list (list, optional): List of api base, version, keys + extra_headers (dict, optional): Additional headers to include in the request. + + LITELLM Specific Params + mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None). + custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock" + max_retries (int, optional): The number of retries to attempt (default is 0). + Returns: + ModelResponse: A response object containing the generated completion and associated metadata. + + Note: + - This function is used to perform completions() using the specified language model. + - It supports various optional parameters for customizing the completion behavior. + - If 'mock_response' is provided, a mock completion response is returned for testing or debugging. 
+ """ + ######### unpacking kwargs ##################### + args = locals() + api_base = kwargs.get("api_base", None) + mock_response = kwargs.get("mock_response", None) + force_timeout = kwargs.get("force_timeout", 600) ## deprecated + logger_fn = kwargs.get("logger_fn", None) + verbose = kwargs.get("verbose", False) + custom_llm_provider = kwargs.get("custom_llm_provider", None) + litellm_logging_obj = kwargs.get("litellm_logging_obj", None) + id = kwargs.get("id", None) + metadata = kwargs.get("metadata", None) + model_info = kwargs.get("model_info", None) + proxy_server_request = kwargs.get("proxy_server_request", None) + fallbacks = kwargs.get("fallbacks", None) + headers = kwargs.get("headers", None) + num_retries = kwargs.get("num_retries", None) ## deprecated + max_retries = kwargs.get("max_retries", None) + context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None) + organization = kwargs.get("organization", None) + ### CUSTOM MODEL COST ### + input_cost_per_token = kwargs.get("input_cost_per_token", None) + output_cost_per_token = kwargs.get("output_cost_per_token", None) + input_cost_per_second = kwargs.get("input_cost_per_second", None) + output_cost_per_second = kwargs.get("output_cost_per_second", None) + ### CUSTOM PROMPT TEMPLATE ### + initial_prompt_value = kwargs.get("initial_prompt_value", None) + roles = kwargs.get("roles", None) + final_prompt_value = kwargs.get("final_prompt_value", None) + bos_token = kwargs.get("bos_token", None) + eos_token = kwargs.get("eos_token", None) + preset_cache_key = kwargs.get("preset_cache_key", None) + hf_model_name = kwargs.get("hf_model_name", None) + ### TEXT COMPLETION CALLS ### + text_completion = kwargs.get("text_completion", False) + atext_completion = kwargs.get("atext_completion", False) + ### ASYNC CALLS ### + acompletion = kwargs.get("acompletion", False) + client = kwargs.get("client", None) + ### Admin Controls ### + no_log = kwargs.get("no-log", False) + ######## end of unpacking kwargs ########### + openai_params = [ + "functions", + "function_call", + "temperature", + "temperature", + "top_p", + "n", + "stream", + "stop", + "max_tokens", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + "request_timeout", + "api_base", + "api_version", + "api_key", + "deployment_id", + "organization", + "base_url", + "default_headers", + "timeout", + "response_format", + "seed", + "tools", + "tool_choice", + "max_retries", + "logprobs", + "top_logprobs", + "extra_headers", + ] + litellm_params = [ + "metadata", + "acompletion", + "atext_completion", + "text_completion", + "caching", + "mock_response", + "api_key", + "api_version", + "api_base", + "force_timeout", + "logger_fn", + "verbose", + "custom_llm_provider", + "litellm_logging_obj", + "litellm_call_id", + "use_client", + "id", + "fallbacks", + "azure", + "headers", + "model_list", + "num_retries", + "context_window_fallback_dict", + "roles", + "final_prompt_value", + "bos_token", + "eos_token", + "request_timeout", + "complete_response", + "self", + "client", + "rpm", + "tpm", + "input_cost_per_token", + "output_cost_per_token", + "input_cost_per_second", + "output_cost_per_second", + "hf_model_name", + "model_info", + "proxy_server_request", + "preset_cache_key", + "caching_groups", + "ttl", + "cache", + "no-log", + "base_model", + "stream_timeout", + ] + default_params = openai_params + litellm_params + non_default_params = { + k: v for k, v in kwargs.items() if k not in default_params + } # model-specific params - pass them straight to 
the model/provider + if timeout is None: + timeout = ( + kwargs.get("request_timeout", None) or 600 + ) # set timeout for 10 minutes by default + timeout = float(timeout) + try: + if base_url is not None: + api_base = base_url + if max_retries is not None: # openai allows openai.OpenAI(max_retries=3) + num_retries = max_retries + logging = litellm_logging_obj + fallbacks = fallbacks or litellm.model_fallbacks + if fallbacks is not None: + return completion_with_fallbacks(**args) + if model_list is not None: + deployments = [ + m["litellm_params"] for m in model_list if m["model_name"] == model + ] + return batch_completion_models(deployments=deployments, **args) + if litellm.model_alias_map and model in litellm.model_alias_map: + model = litellm.model_alias_map[ + model + ] # update the model to the actual value if an alias has been passed in + model_response = ModelResponse() + if ( + kwargs.get("azure", False) == True + ): # don't remove flag check, to remain backwards compatible for repos like Codium + custom_llm_provider = "azure" + if deployment_id != None: # azure llms + model = deployment_id + custom_llm_provider = "azure" + model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider( + model=model, + custom_llm_provider=custom_llm_provider, + api_base=api_base, + api_key=api_key, + ) + if model_response is not None and hasattr(model_response, "_hidden_params"): + model_response._hidden_params["custom_llm_provider"] = custom_llm_provider + model_response._hidden_params["region_name"] = kwargs.get( + "aws_region_name", None + ) # support region-based pricing for bedrock + + ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### + if input_cost_per_token is not None and output_cost_per_token is not None: + print_verbose(f"Registering model={model} in model cost map") + litellm.register_model( + { + f"{custom_llm_provider}/{model}": { + "input_cost_per_token": input_cost_per_token, + "output_cost_per_token": output_cost_per_token, + "litellm_provider": custom_llm_provider, + }, + model: { + "input_cost_per_token": input_cost_per_token, + "output_cost_per_token": output_cost_per_token, + "litellm_provider": custom_llm_provider, + }, + } + ) + elif ( + input_cost_per_second is not None + ): # time based pricing just needs cost in place + output_cost_per_second = output_cost_per_second + litellm.register_model( + { + f"{custom_llm_provider}/{model}": { + "input_cost_per_second": input_cost_per_second, + "output_cost_per_second": output_cost_per_second, + "litellm_provider": custom_llm_provider, + }, + model: { + "input_cost_per_second": input_cost_per_second, + "output_cost_per_second": output_cost_per_second, + "litellm_provider": custom_llm_provider, + }, + } + ) + ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ### + custom_prompt_dict = {} # type: ignore + if ( + initial_prompt_value + or roles + or final_prompt_value + or bos_token + or eos_token + ): + custom_prompt_dict = {model: {}} + if initial_prompt_value: + custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value + if roles: + custom_prompt_dict[model]["roles"] = roles + if final_prompt_value: + custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value + if bos_token: + custom_prompt_dict[model]["bos_token"] = bos_token + if eos_token: + custom_prompt_dict[model]["eos_token"] = eos_token + model_api_key = get_api_key( + llm_provider=custom_llm_provider, dynamic_api_key=api_key + ) # get the api key from the environment if required for the model + + if dynamic_api_key is not None: + api_key = 
dynamic_api_key + # check if user passed in any of the OpenAI optional params + optional_params = get_optional_params( + functions=functions, + function_call=function_call, + temperature=temperature, + top_p=top_p, + n=n, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + user=user, + # params to identify the model + model=model, + custom_llm_provider=custom_llm_provider, + response_format=response_format, + seed=seed, + tools=tools, + tool_choice=tool_choice, + max_retries=max_retries, + logprobs=logprobs, + top_logprobs=top_logprobs, + extra_headers=extra_headers, + **non_default_params, + ) + + if litellm.add_function_to_prompt and optional_params.get( + "functions_unsupported_model", None + ): # if user opts to add it to prompt, when API doesn't support function calling + functions_unsupported_model = optional_params.pop( + "functions_unsupported_model" + ) + messages = function_call_prompt( + messages=messages, functions=functions_unsupported_model + ) + + # For logging - save the values of the litellm-specific params passed in + litellm_params = get_litellm_params( + acompletion=acompletion, + api_key=api_key, + force_timeout=force_timeout, + logger_fn=logger_fn, + verbose=verbose, + custom_llm_provider=custom_llm_provider, + api_base=api_base, + litellm_call_id=kwargs.get("litellm_call_id", None), + model_alias_map=litellm.model_alias_map, + completion_call_id=id, + metadata=metadata, + model_info=model_info, + proxy_server_request=proxy_server_request, + preset_cache_key=preset_cache_key, + no_log=no_log, + ) + logging.update_environment_variables( + model=model, + user=user, + optional_params=optional_params, + litellm_params=litellm_params, + ) + if mock_response: + return mock_completion( + model, + messages, + stream=stream, + mock_response=mock_response, + logging=logging, + acompletion=acompletion, + ) + if custom_llm_provider == "azure": + # azure configs + api_type = get_secret("AZURE_API_TYPE") or "azure" + + api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") + + api_version = ( + api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + ) + + api_key = ( + api_key + or litellm.api_key + or litellm.azure_key + or get_secret("AZURE_OPENAI_API_KEY") + or get_secret("AZURE_API_KEY") + ) + + azure_ad_token = optional_params.get("extra_body", {}).pop( + "azure_ad_token", None + ) or get_secret("AZURE_AD_TOKEN") + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.AzureOpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + response = azure_chat_completions.completion( + model=model, + messages=messages, + headers=headers, + api_key=api_key, + api_base=api_base, + api_version=api_version, + api_type=api_type, + azure_ad_token=azure_ad_token, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + logging_obj=logging, + acompletion=acompletion, + timeout=timeout, + client=client, # pass AsyncAzureOpenAI, AzureOpenAI client + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={ + "headers": 
headers, + "api_version": api_version, + "api_base": api_base, + }, + ) + elif custom_llm_provider == "azure_text": + # azure configs + api_type = get_secret("AZURE_API_TYPE") or "azure" + + api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") + + api_version = ( + api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + ) + + api_key = ( + api_key + or litellm.api_key + or litellm.azure_key + or get_secret("AZURE_OPENAI_API_KEY") + or get_secret("AZURE_API_KEY") + ) + + azure_ad_token = optional_params.get("extra_body", {}).pop( + "azure_ad_token", None + ) or get_secret("AZURE_AD_TOKEN") + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.AzureOpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + response = azure_text_completions.completion( + model=model, + messages=messages, + headers=headers, + api_key=api_key, + api_base=api_base, + api_version=api_version, + api_type=api_type, + azure_ad_token=azure_ad_token, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + logging_obj=logging, + acompletion=acompletion, + timeout=timeout, + client=client, # pass AsyncAzureOpenAI, AzureOpenAI client + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={ + "headers": headers, + "api_version": api_version, + "api_base": api_base, + }, + ) + elif ( + model in litellm.open_ai_chat_completion_models + or custom_llm_provider == "custom_openai" + or custom_llm_provider == "deepinfra" + or custom_llm_provider == "perplexity" + or custom_llm_provider == "groq" + or custom_llm_provider == "anyscale" + or custom_llm_provider == "mistral" + or custom_llm_provider == "openai" + or custom_llm_provider == "together_ai" + or custom_llm_provider in litellm.openai_compatible_providers + or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo + ): # allow user to make an openai call with a custom base + # note: if a user sets a custom base - we should ensure this works + # allow for the setting of dynamic and stateful api-bases + api_base = ( + api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there + or litellm.api_base + or get_secret("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + openai.organization = ( + organization + or litellm.organization + or get_secret("OPENAI_ORGANIZATION") + or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105 + ) + # set API KEY + api_key = ( + api_key + or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there + or litellm.openai_key + or get_secret("OPENAI_API_KEY") + ) + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.OpenAIConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ## COMPLETION CALL + try: + response = openai_chat_completions.completion( + model=model, + messages=messages, + 
headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + organization=organization, + custom_llm_provider=custom_llm_provider, + ) + except Exception as e: + ## LOGGING - log the original exception returned + logging.post_call( + input=messages, + api_key=api_key, + original_response=str(e), + additional_args={"headers": headers}, + ) + raise e + + if optional_params.get("stream", False): + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + additional_args={"headers": headers}, + ) + elif ( + custom_llm_provider == "text-completion-openai" + or "ft:babbage-002" in model + or "ft:davinci-002" in model # support for finetuned completion models + ): + openai.api_type = "openai" + + api_base = ( + api_base + or litellm.api_base + or get_secret("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + + openai.api_version = None + # set API KEY + + api_key = ( + api_key + or litellm.api_key + or litellm.openai_key + or get_secret("OPENAI_API_KEY") + ) + + headers = headers or litellm.headers + + ## LOAD CONFIG - if set + config = litellm.OpenAITextCompletionConfig.get_config() + for k, v in config.items(): + if ( + k not in optional_params + ): # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + if litellm.organization: + openai.organization = litellm.organization + + if ( + len(messages) > 0 + and "content" in messages[0] + and type(messages[0]["content"]) == list + ): + # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content'] + # https://platform.openai.com/docs/api-reference/completions/create + prompt = messages[0]["content"] + else: + prompt = " ".join([message["content"] for message in messages]) # type: ignore + + ## COMPLETION CALL + _response = openai_text_completions.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + client=client, # pass AsyncOpenAI, OpenAI client + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, + ) + + if ( + optional_params.get("stream", False) == False + and acompletion == False + and text_completion == False + ): + # convert to chat completion response + _response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object( + response_object=_response, model_response_object=model_response + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=_response, + additional_args={"headers": headers}, + ) + response = _response + elif ( + "replicate" in model + or custom_llm_provider == "replicate" + or model in litellm.replicate_models + ): + # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") + replicate_key = None + replicate_key = ( + api_key + or litellm.replicate_key + or litellm.api_key + or get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") + ) + + api_base = 
( + api_base + or litellm.api_base + or get_secret("REPLICATE_API_BASE") + or "https://api.replicate.com/v1" + ) + + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + + model_response = replicate.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, # for calculating input/output tokens + api_key=replicate_key, + logging_obj=logging, + custom_prompt_dict=custom_prompt_dict, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + model_response = CustomStreamWrapper(model_response, model, logging_obj=logging, custom_llm_provider="replicate") # type: ignore + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=replicate_key, + original_response=model_response, + ) + + response = model_response + + elif custom_llm_provider == "anthropic": + api_key = ( + api_key + or litellm.anthropic_key + or litellm.api_key + or os.environ.get("ANTHROPIC_API_KEY") + ) + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + + if (model == "claude-2") or (model == "claude-instant-1"): + # call anthropic /completion, only use this route for claude-2, claude-instant-1 + api_base = ( + api_base + or litellm.api_base + or get_secret("ANTHROPIC_API_BASE") + or "https://api.anthropic.com/v1/complete" + ) + response = anthropic_text.completion( + model=model, + messages=messages, + api_base=api_base, + custom_prompt_dict=litellm.custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, # for calculating input/output tokens + api_key=api_key, + logging_obj=logging, + headers=headers, + ) + else: + # call /messages + # default route for all anthropic models + api_base = ( + api_base + or litellm.api_base + or get_secret("ANTHROPIC_API_BASE") + or "https://api.anthropic.com/v1/messages" + ) + response = anthropic_chat_completions.completion( + model=model, + messages=messages, + api_base=api_base, + acompletion=acompletion, + custom_prompt_dict=litellm.custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, # for calculating input/output tokens + api_key=api_key, + logging_obj=logging, + headers=headers, + ) + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + ) + response = response + elif custom_llm_provider == "nlp_cloud": + nlp_cloud_key = ( + api_key + or litellm.nlp_cloud_key + or get_secret("NLP_CLOUD_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("NLP_CLOUD_API_BASE") + or "https://api.nlpcloud.io/v1/gpu/" + ) + + response = nlp_cloud.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=nlp_cloud_key, + logging_obj=logging, + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't 
try to access stream object, + response = CustomStreamWrapper( + response, + model, + custom_llm_provider="nlp_cloud", + logging_obj=logging, + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + ) + + response = response + elif custom_llm_provider == "aleph_alpha": + aleph_alpha_key = ( + api_key + or litellm.aleph_alpha_key + or get_secret("ALEPH_ALPHA_API_KEY") + or get_secret("ALEPHALPHA_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("ALEPH_ALPHA_API_BASE") + or "https://api.aleph-alpha.com/complete" + ) + + model_response = aleph_alpha.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + default_max_tokens_to_sample=litellm.max_tokens, + api_key=aleph_alpha_key, + logging_obj=logging, # model call logging done inside the class as we may need to modify I/O to fit aleph alpha's requirements + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="aleph_alpha", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "cohere": + cohere_key = ( + api_key + or litellm.cohere_key + or get_secret("COHERE_API_KEY") + or get_secret("CO_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("COHERE_API_BASE") + or "https://api.cohere.ai/v1/generate" + ) + + model_response = cohere.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=cohere_key, + logging_obj=logging, # model call logging done inside the class as we may need to modify I/O to fit cohere's requirements + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="cohere", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "cohere_chat": + cohere_key = ( + api_key + or litellm.cohere_key + or get_secret("COHERE_API_KEY") + or get_secret("CO_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("COHERE_API_BASE") + or "https://api.cohere.ai/v1/chat" + ) + + model_response = cohere_chat.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=cohere_key, + logging_obj=logging, # model call logging done inside the class as we may need to modify I/O to fit cohere's requirements + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="cohere_chat", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "maritalk": +
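# maritalk (Maritaca AI): resolve the MariTalk API key and api_base, then call maritalk.completion and wrap streamed output in CustomStreamWrapper +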
maritalk_key = ( + api_key + or litellm.maritalk_key + or get_secret("MARITALK_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("MARITALK_API_BASE") + or "https://chat.maritaca.ai/api/chat/inference" + ) + + model_response = maritalk.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=maritalk_key, + logging_obj=logging, + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="maritalk", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "huggingface": + custom_llm_provider = "huggingface" + huggingface_key = ( + api_key + or litellm.huggingface_key + or os.environ.get("HF_TOKEN") + or os.environ.get("HUGGINGFACE_API_KEY") + or litellm.api_key + ) + hf_headers = headers or litellm.headers + + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + model_response = huggingface.completion( + model=model, + messages=messages, + api_base=api_base, # type: ignore + headers=hf_headers, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=huggingface_key, + acompletion=acompletion, + logging_obj=logging, + custom_prompt_dict=custom_prompt_dict, + timeout=timeout, + ) + if ( + "stream" in optional_params + and optional_params["stream"] == True + and acompletion is False + ): + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="huggingface", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "oobabooga": + custom_llm_provider = "oobabooga" + model_response = oobabooga.completion( + model=model, + messages=messages, + model_response=model_response, + api_base=api_base, # type: ignore + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + api_key=None, + logger_fn=logger_fn, + encoding=encoding, + logging_obj=logging, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="oobabooga", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "openrouter": + api_base = api_base or litellm.api_base or "https://openrouter.ai/api/v1" + + api_key = ( + api_key + or litellm.api_key + or litellm.openrouter_key + or get_secret("OPENROUTER_API_KEY") + or get_secret("OR_API_KEY") + ) + + openrouter_site_url = get_secret("OR_SITE_URL") or "https://litellm.ai" + + openrouter_app_name = get_secret("OR_APP_NAME") or "liteLLM" + + headers = ( + headers + or litellm.headers + or { + "HTTP-Referer": openrouter_site_url, + "X-Title": openrouter_app_name, + } + ) + + ## Load Config + config = openrouter.OpenrouterConfig.get_config() + for k, v in config.items(): + if k == "extra_body": + # we use openai 'extra_body' to pass openrouter specific params - transforms, route, models + if "extra_body" in optional_params: + optional_params[k].update(v) + else: + optional_params[k] = v + 
elif k not in optional_params: + optional_params[k] = v + + data = {"model": model, "messages": messages, **optional_params} + + ## COMPLETION CALL + response = openai_chat_completions.completion( + model=model, + messages=messages, + headers=headers, + api_key=api_key, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + logging_obj=logging, + acompletion=acompletion, + timeout=timeout, + ) + ## LOGGING + logging.post_call( + input=messages, api_key=openai.api_key, original_response=response + ) + elif ( + custom_llm_provider == "together_ai" + or ("togethercomputer" in model) + or (model in litellm.together_ai_models) + ): + """ + Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility + """ + custom_llm_provider = "together_ai" + together_ai_key = ( + api_key + or litellm.togetherai_api_key + or get_secret("TOGETHER_AI_TOKEN") + or get_secret("TOGETHERAI_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("TOGETHERAI_API_BASE") + or "https://api.together.xyz/inference" + ) + + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + + model_response = together_ai.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=together_ai_key, + logging_obj=logging, + custom_prompt_dict=custom_prompt_dict, + ) + if ( + "stream_tokens" in optional_params + and optional_params["stream_tokens"] == True + ): + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="together_ai", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "palm": + palm_api_key = api_key or get_secret("PALM_API_KEY") or litellm.api_key + + # palm does not support streaming as yet :( + model_response = palm.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=palm_api_key, + logging_obj=logging, + ) + # fake palm streaming + if "stream" in optional_params and optional_params["stream"] == True: + # fake streaming for palm + resp_string = model_response["choices"][0]["message"]["content"] + response = CustomStreamWrapper( + resp_string, model, custom_llm_provider="palm", logging_obj=logging + ) + return response + response = model_response + elif custom_llm_provider == "gemini": + gemini_api_key = ( + api_key + or get_secret("GEMINI_API_KEY") + or get_secret("PALM_API_KEY") # older palm api key should also work + or litellm.api_key + ) + + # palm does not support streaming as yet :( + model_response = gemini.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=gemini_api_key, + logging_obj=logging, + acompletion=acompletion, + custom_prompt_dict=custom_prompt_dict, + ) + if ( + "stream" in optional_params + and optional_params["stream"] == True + and acompletion == False + ): + response = CustomStreamWrapper( + 
iter(model_response), + model, + custom_llm_provider="gemini", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "vertex_ai": + vertex_ai_project = ( + optional_params.pop("vertex_project", None) + or optional_params.pop("vertex_ai_project", None) + or litellm.vertex_project + or get_secret("VERTEXAI_PROJECT") + ) + vertex_ai_location = ( + optional_params.pop("vertex_location", None) + or optional_params.pop("vertex_ai_location", None) + or litellm.vertex_location + or get_secret("VERTEXAI_LOCATION") + ) + + if "claude-3" in model: + model_response = vertex_ai_anthropic.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + vertex_location=vertex_ai_location, + vertex_project=vertex_ai_project, + logging_obj=logging, + acompletion=acompletion, + ) + else: + model_response = vertex_ai.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + vertex_location=vertex_ai_location, + vertex_project=vertex_ai_project, + logging_obj=logging, + acompletion=acompletion, + ) + + if ( + "stream" in optional_params + and optional_params["stream"] == True + and acompletion == False + ): + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="vertex_ai", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "ai21": + custom_llm_provider = "ai21" + ai21_key = ( + api_key + or litellm.ai21_key + or os.environ.get("AI21_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("AI21_API_BASE") + or "https://api.ai21.com/studio/v1/" + ) + + model_response = ai21.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=ai21_key, + logging_obj=logging, + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="ai21", + logging_obj=logging, + ) + return response + + ## RESPONSE OBJECT + response = model_response + elif custom_llm_provider == "sagemaker": + # boto3 reads keys from .env + model_response = sagemaker.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + custom_prompt_dict=custom_prompt_dict, + hf_model_name=hf_model_name, + logger_fn=logger_fn, + encoding=encoding, + logging_obj=logging, + acompletion=acompletion, + ) + if ( + "stream" in optional_params and optional_params["stream"] == True + ): ## [BETA] + print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER") + from .llms.sagemaker import TokenIterator + + tokenIterator = TokenIterator(model_response, acompletion=acompletion) + response = CustomStreamWrapper( + completion_stream=tokenIterator, + model=model, + custom_llm_provider="sagemaker", + logging_obj=logging, + ) + ## LOGGING + logging.post_call( + input=messages, + api_key=None, + original_response=response, + ) + return 
response + + ## RESPONSE OBJECT + response = model_response + elif custom_llm_provider == "bedrock": + # boto3 reads keys from .env + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + response = bedrock.completion( + model=model, + messages=messages, + custom_prompt_dict=litellm.custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + logging_obj=logging, + timeout=timeout, + ) + + if ( + "stream" in optional_params + and optional_params["stream"] == True + and not isinstance(response, CustomStreamWrapper) + ): + # don't try to access stream object, + if "ai21" in model: + response = CustomStreamWrapper( + response, + model, + custom_llm_provider="bedrock", + logging_obj=logging, + ) + else: + response = CustomStreamWrapper( + iter(response), + model, + custom_llm_provider="bedrock", + logging_obj=logging, + ) + + if optional_params.get("stream", False): + ## LOGGING + logging.post_call( + input=messages, + api_key=None, + original_response=response, + ) + + ## RESPONSE OBJECT + response = response + elif custom_llm_provider == "vllm": + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + model_response = vllm.completion( + model=model, + messages=messages, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + logging_obj=logging, + ) + + if ( + "stream" in optional_params and optional_params["stream"] == True + ): ## [BETA] + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="vllm", + logging_obj=logging, + ) + return response + + ## RESPONSE OBJECT + response = model_response + elif custom_llm_provider == "ollama": + api_base = ( + litellm.api_base + or api_base + or get_secret("OLLAMA_API_BASE") + or "http://localhost:11434" + ) + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + if model in custom_prompt_dict: + # check if the model has a registered custom prompt + model_prompt_details = custom_prompt_dict[model] + prompt = custom_prompt( + role_dict=model_prompt_details["roles"], + initial_prompt_value=model_prompt_details["initial_prompt_value"], + final_prompt_value=model_prompt_details["final_prompt_value"], + messages=messages, + ) + else: + prompt = prompt_factory( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + ) + if isinstance(prompt, dict): + # for multimode models - ollama/llava prompt_factory returns a dict { + # "prompt": prompt, + # "images": images + # } + prompt, images = prompt["prompt"], prompt["images"] + optional_params["images"] = images + + ## LOGGING + generator = ollama.get_ollama_response( + api_base, + model, + prompt, + optional_params, + logging_obj=logging, + acompletion=acompletion, + model_response=model_response, + encoding=encoding, + ) + if acompletion is True or optional_params.get("stream", False) == True: + return generator + + response = generator + elif custom_llm_provider == "ollama_chat": + api_base = ( + litellm.api_base + or api_base + or get_secret("OLLAMA_API_BASE") + or "http://localhost:11434" + ) + + ## LOGGING + generator = ollama_chat.get_ollama_response( + api_base, + model, + messages, + optional_params, + logging_obj=logging, + acompletion=acompletion, + 
model_response=model_response, + encoding=encoding, + ) + if acompletion is True or optional_params.get("stream", False) == True: + return generator + + response = generator + elif custom_llm_provider == "cloudflare": + api_key = ( + api_key + or litellm.cloudflare_api_key + or litellm.api_key + or get_secret("CLOUDFLARE_API_KEY") + ) + account_id = get_secret("CLOUDFLARE_ACCOUNT_ID") + api_base = ( + api_base + or litellm.api_base + or get_secret("CLOUDFLARE_API_BASE") + or f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/" + ) + + custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict + response = cloudflare.completion( + model=model, + messages=messages, + api_base=api_base, + custom_prompt_dict=litellm.custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, # for calculating input/output tokens + api_key=api_key, + logging_obj=logging, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + response, + model, + custom_llm_provider="cloudflare", + logging_obj=logging, + ) + + if optional_params.get("stream", False) or acompletion == True: + ## LOGGING + logging.post_call( + input=messages, + api_key=api_key, + original_response=response, + ) + response = response + elif ( + custom_llm_provider == "baseten" + or litellm.api_base == "https://app.baseten.co" + ): + custom_llm_provider = "baseten" + baseten_key = ( + api_key + or litellm.baseten_key + or os.environ.get("BASETEN_API_KEY") + or litellm.api_key + ) + + model_response = baseten.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=baseten_key, + logging_obj=logging, + ) + if inspect.isgenerator(model_response) or ( + "stream" in optional_params and optional_params["stream"] == True + ): + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, + model, + custom_llm_provider="baseten", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "petals" or model in litellm.petals_models: + api_base = api_base or litellm.api_base + + custom_llm_provider = "petals" + stream = optional_params.pop("stream", False) + model_response = petals.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + logging_obj=logging, + ) + if stream == True: ## [BETA] + # Fake streaming for petals + resp_string = model_response["choices"][0]["message"]["content"] + response = CustomStreamWrapper( + resp_string, + model, + custom_llm_provider="petals", + logging_obj=logging, + ) + return response + response = model_response + elif custom_llm_provider == "custom": + import requests + + url = litellm.api_base or api_base or "" + if url == None or url == "": + raise ValueError( + "api_base not set. 
Set api_base or litellm.api_base for custom endpoints" + ) + + """ + assume input to custom LLM api bases follow this format: + resp = requests.post( + api_base, + json={ + 'model': 'meta-llama/Llama-2-13b-hf', # model name + 'params': { + 'prompt': ["The capital of France is P"], + 'max_tokens': 32, + 'temperature': 0.7, + 'top_p': 1.0, + 'top_k': 40, + } + } + ) + + """ + prompt = " ".join([message["content"] for message in messages]) # type: ignore + resp = requests.post( + url, + json={ + "model": model, + "params": { + "prompt": [prompt], + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p, + "top_k": kwargs.get("top_k", 40), + }, + }, + ) + response_json = resp.json() + """ + assume all responses from custom api_bases of this format: + { + 'data': [ + { + 'prompt': 'The capital of France is P', + 'output': ['The capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France'], + 'params': {'temperature': 0.7, 'top_k': 40, 'top_p': 1}}], + 'message': 'ok' + } + ] + } + """ + string_response = response_json["data"][0]["output"][0] + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = string_response + model_response["created"] = int(time.time()) + model_response["model"] = model + response = model_response + else: + raise ValueError( + f"Unable to map your input to a model. Check your input - {args}" + ) + return response + except Exception as e: + ## Map to OpenAI Exception +> raise exception_type( + model=model, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=args, + ) + +../main.py:2126: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +model = 'meta/llama-2-7b-chat' +original_exception = ReplicateError('Error: Traceback (most recent call last):\n 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n') +custom_llm_provider = 'replicate' +completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...} + + def exception_type( + model, + original_exception, + custom_llm_provider, + completion_kwargs={}, + ): + global user_logger_fn, liteDebuggerClient + exception_mapping_worked = False + if litellm.suppress_debug_info is False: + print() # noqa + print( # noqa + "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa + ) # noqa + print( # noqa + "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." 
# noqa + ) # noqa + print() # noqa + try: + if model: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + + if "Request Timeout Error" in error_str or "Request timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"APITimeoutError - Request timed out", + model=model, + llm_provider=custom_llm_provider, + ) + + if ( + custom_llm_provider == "openai" + or custom_llm_provider == "text-completion-openai" + or custom_llm_provider == "custom_openai" + or custom_llm_provider in litellm.openai_compatible_providers + ): + # custom_llm_provider is openai, make it OpenAI + if hasattr(original_exception, "message"): + message = original_exception.message + else: + message = str(original_exception) + if message is not None and isinstance(message, str): + message = message.replace("OPENAI", custom_llm_provider.upper()) + message = message.replace("openai", custom_llm_provider) + message = message.replace("OpenAI", custom_llm_provider) + if custom_llm_provider == "openai": + exception_provider = "OpenAI" + "Exception" + else: + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + + if ( + "This model's maximum context length is" in error_str + or "Request too large" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "model_not_found" in error_str + ): + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "Incorrect API key provided" not in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif "Mistral API raised a streaming error" in error_str: + exception_mapping_worked = True + _request = httpx.Request( + method="POST", url="https://api.openai.com/v1" + ) + raise APIError( + status_code=500, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=_request, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) 
+ elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ) + elif custom_llm_provider == "anthropic": # one of the anthropics + if hasattr(original_exception, "message"): + if ( + "prompt is too long" in original_exception.message + or "prompt: length" in original_exception.message + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if "Invalid API Key" in original_exception.message: + exception_mapping_worked = True + raise AuthenticationError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + print_verbose(f"status_code: {original_exception.status_code}") + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + 
request=original_exception.request, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=( + original_exception.response + if hasattr(original_exception, "response") + else httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", + url="https://docs.anthropic.com/claude/reference/messages_post", + ), + ) + ), + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.", + llm_provider="anthropic", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "replicate": + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif "input is too long" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 422 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + request=original_exception.request, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + exception_mapping_worked = True + raise APIError( + 
status_code=500, + message=f"ReplicateException - {str(original_exception)}", + llm_provider="replicate", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "bedrock": + if ( + "too many tokens" in error_str + or "expected maxLength:" in error_str + or "Input is too long" in error_str + or "prompt: length: 1.." in error_str + or "Too many input tokens" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"BedrockException: Context Window Error - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + if "Malformed input request" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + if ( + "Unable to locate credentials" in error_str + or "The security token included in the request is invalid" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"BedrockException Invalid Authentication - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + if "AccessDeniedException" in error_str: + exception_mapping_worked = True + raise PermissionDeniedError( + message=f"BedrockException PermissionDeniedError - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + if ( + "throttlingException" in error_str + or "ThrottlingException" in error_str + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"BedrockException: Rate Limit Error - {error_str}", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) + if "Connect timeout on endpoint URL" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"BedrockException: Timeout Error - {error_str}", + model=model, + llm_provider="bedrock", + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ), + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model, + response=original_exception.response, + ) + elif custom_llm_provider == "sagemaker": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif ( + "Input validation error: `best_of` must be > 0 and <= 2" + in error_str + ): + exception_mapping_worked = True + raise 
BadRequestError( + message=f"SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif ( + "`inputs` tokens + `max_new_tokens` must be <=" in error_str + or "instance type with more CPU capacity or memory" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker", + response=original_exception.response, + ) + elif custom_llm_provider == "vertex_ai": + if ( + "Vertex AI API has not been used in project" in error_str + or "Unable to find your project" in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=original_exception.response, + ) + elif "403" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=original_exception.response, + ) + elif "The response was blocked." in error_str: + exception_mapping_worked = True + raise UnprocessableEntityError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + elif ( + "429 Quota exceeded" in error_str + or "IndexError: list index out of range" in error_str + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai", + response=original_exception.response, + ) + if original_exception.status_code == 500: + exception_mapping_worked = True + raise APIError( + message=f"VertexAIException - {error_str}", + status_code=500, + model=model, + llm_provider="vertex_ai", + request=original_exception.request, + ) + elif custom_llm_provider == "palm" or custom_llm_provider == "gemini": + if "503 Getting metadata" in error_str: + # auth errors look like this + # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate. + exception_mapping_worked = True + raise BadRequestError( + message=f"PalmException - Invalid api key", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + if ( + "504 Deadline expired before operation could complete." in error_str + or "504 Deadline Exceeded" in error_str + ): + exception_mapping_worked = True + raise Timeout( + message=f"PalmException - {original_exception.message}", + model=model, + llm_provider="palm", + ) + if "400 Request payload size exceeds" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"PalmException - {error_str}", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + if "500 An internal error has occurred."
in error_str: + exception_mapping_worked = True + raise APIError( + status_code=getattr(original_exception, "status_code", 500), + message=f"PalmException - {original_exception.message}", + llm_provider="palm", + model=model, + request=original_exception.request, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"PalmException - {error_str}", + model=model, + llm_provider="palm", + response=original_exception.response, + ) + # Failed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes + elif custom_llm_provider == "cloudflare": + if "Authentication error" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"Cloudflare Exception - {original_exception.message}", + llm_provider="cloudflare", + model=model, + response=original_exception.response, + ) + if "must have required property" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"Cloudflare Exception - {original_exception.message}", + llm_provider="cloudflare", + model=model, + response=original_exception.response, + ) + elif ( + custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat" + ): # Cohere + if ( + "invalid api token" in error_str + or "No API key provided." in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "too many tokens" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"CohereException - {original_exception.message}", + model=model, + llm_provider="cohere", + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if ( + original_exception.status_code == 400 + or original_exception.status_code == 498 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif ( + "CohereConnectionError" in exception_type + ): # cohere seems to fire these errors when we load test it (1k+ messages / min) + exception_mapping_worked = True + raise RateLimitError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "invalid type:" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + elif "Unexpected server error" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, + response=original_exception.response, + ) + else: + if hasattr(original_exception, "status_code"): + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + model=model, +
request=original_exception.request, + ) + raise original_exception + elif custom_llm_provider == "huggingface": + if "length limit exceeded" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=error_str, + model=model, + llm_provider="huggingface", + response=original_exception.response, + ) + elif "A valid user token is required" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=error_str, + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + request=original_exception.request, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "ai21": + if hasattr(original_exception, "message"): + if "Prompt has too many tokens" in original_exception.message: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + if "Bad or missing API token." 
in original_exception.message: + exception_mapping_worked = True + raise BadRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + request=original_exception.request, + ) + if original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21", + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AI21Exception - {original_exception.message}", + llm_provider="ai21", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "nlp_cloud": + if "detail" in error_str: + if "Input text length should not exceed" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + response=original_exception.response, + ) + elif "value is not a valid" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=500, + message=f"NLPCloudException - {error_str}", + model=model, + llm_provider="nlp_cloud", + request=original_exception.request, + ) + if hasattr( + original_exception, "status_code" + ): # https://docs.nlpcloud.com/?shell#errors + if ( + original_exception.status_code == 400 + or original_exception.status_code == 406 + or original_exception.status_code == 413 + or original_exception.status_code == 422 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 401 + or original_exception.status_code == 403 + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 522 + or original_exception.status_code == 524 + ): + exception_mapping_worked = True + raise Timeout( + message=f"NLPCloudException - {original_exception.message}", + model=model, + llm_provider="nlp_cloud", + request=original_exception.request, + ) + elif ( + original_exception.status_code == 429 + or original_exception.status_code == 402 + ): + exception_mapping_worked = True + raise RateLimitError( + message=f"NLPCloudException - 
{original_exception.message}", + llm_provider="nlp_cloud", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 500 + or original_exception.status_code == 503 + ): + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + request=original_exception.request, + ) + elif ( + original_exception.status_code == 504 + or original_exception.status_code == 520 + ): + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"NLPCloudException - {original_exception.message}", + model=model, + llm_provider="nlp_cloud", + response=original_exception.response, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"NLPCloudException - {original_exception.message}", + llm_provider="nlp_cloud", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "together_ai": + import json + + try: + error_response = json.loads(error_str) + except: + error_response = {"error": error_str} + if ( + "error" in error_response + and "`inputs` tokens + `max_new_tokens` must be <=" + in error_response["error"] + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif ( + "error" in error_response + and "invalid private key" in error_response["error"] + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"TogetherAIException - {error_response['error']}", + llm_provider="together_ai", + model=model, + response=original_exception.response, + ) + elif ( + "error" in error_response + and "INVALID_ARGUMENT" in error_response["error"] + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + + elif ( + "error" in error_response + and "API key doesn't match expected format." 
+ in error_response["error"] + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif ( + "error_type" in error_response + and error_response["error_type"] == "validation" + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"TogetherAIException - {original_exception.message}", + model=model, + llm_provider="together_ai", + request=original_exception.request, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai", + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 524: + exception_mapping_worked = True + raise Timeout( + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"TogetherAIException - {original_exception.message}", + llm_provider="together_ai", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "aleph_alpha": + if ( + "This is longer than the model's maximum context length" + in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif "InvalidToken" in error_str or "No token provided" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + print_verbose(f"status code: {original_exception.status_code}") + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model, + 
response=original_exception.response, + ) + raise original_exception + raise original_exception + elif ( + custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat" + ): + if isinstance(original_exception, dict): + error_str = original_exception.get("error", "") + else: + error_str = str(original_exception) + if "no such file or directory" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", + model=model, + llm_provider="ollama", + response=original_exception.response, + ) + elif "Failed to establish a new connection" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + response=original_exception.response, + ) + elif "Invalid response object from API" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + response=original_exception.response, + ) + elif "Read timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model, + ) + elif custom_llm_provider == "vllm": + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 0: + exception_mapping_worked = True + raise APIConnectionError( + message=f"VLLMException - {original_exception.message}", + llm_provider="vllm", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "azure": + if "This model's maximum context length is" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + response=original_exception.response, + ) + elif "DeploymentNotFound" in error_str: + exception_mapping_worked = True + raise NotFoundError( + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + response=original_exception.response, + ) + elif "invalid_request_error" in error_str: + exception_mapping_worked = True + raise BadRequestError( + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + response=original_exception.response, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AzureException - 
{original_exception.message}", + model=model, + llm_provider="azure", + request=original_exception.request, + ) + if original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"AzureException - {original_exception.message}", + model=model, + llm_provider="azure", + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AzureException - {original_exception.message}", + model=model, + llm_provider="azure", + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AzureException - {original_exception.message}", + model=model, + llm_provider="azure", + response=original_exception.response, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"AzureException - {original_exception.message}", + model=model, + llm_provider="azure", + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AzureException - {original_exception.message}", + llm_provider="azure", + model=model, + request=httpx.Request( + method="POST", url="https://openai.com/" + ), + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"{exception_provider} - {message}", + llm_provider="azure", + model=model, + request=httpx.Request(method="POST", url="https://openai.com/"), + ) + if ( + "BadRequestError.__init__() missing 1 required positional argument: 'param'" + in str(original_exception) + ): # deal with edge-case invalid request error bug in openai-python sdk + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider}: This can happen due to missing AZURE_API_VERSION: {str(original_exception)}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + else: # ensure generic errors always return APIConnectionError= + exception_mapping_worked = True + if hasattr(original_exception, "request"): + raise APIConnectionError( + message=f"{str(original_exception)}", + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + ) + else: + raise APIConnectionError( + message=f"{str(original_exception)}", + llm_provider=custom_llm_provider, + model=model, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), # stub the request + ) + except Exception as e: + # LOGGING + exception_logging( + logger_fn=user_logger_fn, + additional_args={ + "exception_mapping_worked": exception_mapping_worked, + "original_exception": original_exception, + }, + exception=e, + ) + ## AUTH ERROR + if isinstance(e, AuthenticationError) and ( + litellm.email or "LITELLM_EMAIL" in os.environ + ): + threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start() + # don't let an error with mapping interrupt the user from receiving an error from the llm api calls + if exception_mapping_worked: +> raise e + +../utils.py:8533: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +model = 'meta/llama-2-7b-chat' +original_exception = ReplicateError('Error: Traceback (most recent call last):\n 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: 
(this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n') +custom_llm_provider = 'replicate' +completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...} + + def exception_type( + model, + original_exception, + custom_llm_provider, + completion_kwargs={}, + ): + global user_logger_fn, liteDebuggerClient + exception_mapping_worked = False + if litellm.suppress_debug_info is False: + print() # noqa + print( # noqa + "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa + ) # noqa + print( # noqa + "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." # noqa + ) # noqa + print() # noqa + try: + if model: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + + if "Request Timeout Error" in error_str or "Request timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"APITimeoutError - Request timed out", + model=model, + llm_provider=custom_llm_provider, + ) + + if ( + custom_llm_provider == "openai" + or custom_llm_provider == "text-completion-openai" + or custom_llm_provider == "custom_openai" + or custom_llm_provider in litellm.openai_compatible_providers + ): + # custom_llm_provider is openai, make it OpenAI + if hasattr(original_exception, "message"): + message = original_exception.message + else: + message = str(original_exception) + if message is not None and isinstance(message, str): + message = message.replace("OPENAI", custom_llm_provider.upper()) + message = message.replace("openai", custom_llm_provider) + message = message.replace("OpenAI", custom_llm_provider) + if custom_llm_provider == "openai": + exception_provider = "OpenAI" + "Exception" + else: + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + + if ( + "This model's maximum context length is" in error_str + or "Request too large" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "model_not_found" in error_str + ): + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "Incorrect API key provided" not in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + 
llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif "Mistral API raised a streaming error" in error_str: + exception_mapping_worked = True + _request = httpx.Request( + method="POST", url="https://api.openai.com/v1" + ) + raise APIError( + status_code=500, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=_request, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ) + elif custom_llm_provider == "anthropic": # one of the anthropics + if hasattr(original_exception, "message"): + if ( + "prompt is too long" in original_exception.message + or "prompt: length" in original_exception.message + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if "Invalid API Key" in original_exception.message: + exception_mapping_worked = True + raise AuthenticationError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + print_verbose(f"status_code: {original_exception.status_code}") + if 
original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + request=original_exception.request, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=( + original_exception.response + if hasattr(original_exception, "response") + else httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", + url="https://docs.anthropic.com/claude/reference/messages_post", + ), + ) + ), + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.", + llm_provider="anthropic", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "replicate": + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif "input is too long" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 422 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True +> raise BadRequestError( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + response=original_exception.response, +E litellm.exceptions.BadRequestError: ReplicateException - Error: 
Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!, +E Replicate logs:MLC is currently not using any LoRAs. +E MLC: True +E Your formatted prompt is: +E [INST] <> +E You are a helpful, respectful and honest assistant. +E <> +E You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST] +E Not using LoRA +E Traceback (most recent call last): +E File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict +E for r in result: +E File "/src/predict.py", line 198, in predict +E for decoded_token in self.engine( +E File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__ +E for val in gen: +E File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__ +E self.cm.reset_chat(chat_config) +E File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat +E self._load_json_override_func(user_chat_config_json_str, True) +E File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__ +E File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall +E File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3 +E File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL +E File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error +E raise py_err +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E tvm._ffi.base.TVMError: Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number! 
+ +../utils.py:7547: BadRequestError + +During handling of the above exception, another exception occurred: + + def test_replicate_custom_prompt_dict(): + litellm.set_verbose = True + model_name = "replicate/meta/llama-2-7b-chat" + litellm.register_prompt_template( + model="replicate/meta/llama-2-7b-chat", + initial_prompt_value="You are a good assistant", # [OPTIONAL] + roles={ + "system": { + "pre_message": "[INST] <>\n", # [OPTIONAL] + "post_message": "\n<>\n [/INST]\n", # [OPTIONAL] + }, + "user": { + "pre_message": "[INST] ", # [OPTIONAL] + "post_message": " [/INST]", # [OPTIONAL] + }, + "assistant": { + "pre_message": "\n", # [OPTIONAL] + "post_message": "\n", # [OPTIONAL] + }, + }, + final_prompt_value="Now answer as best you can:", # [OPTIONAL] + ) +> response = completion( + model=model_name, + messages=[ + { + "role": "user", + "content": "what is yc write 1 paragraph", + } + ], + repetition_penalty=0.1, + num_retries=3, + ) + +test_completion.py:1655: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../utils.py:2920: in wrapper + return litellm.completion_with_retries(*args, **kwargs) +../main.py:2158: in completion_with_retries + return retryer(original_function, *args, **kwargs) +/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:379: in __call__ + do = self.iter(retry_state=retry_state) +/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:325: in iter + raise retry_exc.reraise() +/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:158: in reraise + raise self.last_attempt.result() +/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py:449: in result + return self.__get_result() +/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py:401: in __get_result + raise self._exception +/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:382: in __call__ + result = fn(*args, **kwargs) +../utils.py:2948: in wrapper + raise e +../utils.py:2846: in wrapper + result = original_function(*args, **kwargs) +../main.py:2126: in completion + raise exception_type( +../utils.py:8533: in exception_type + raise e +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +model = 'meta/llama-2-7b-chat' +original_exception = ReplicateError('Error: Traceback (most recent call last):\n 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n') +custom_llm_provider = 'replicate' +completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...} + + def exception_type( + model, + original_exception, + custom_llm_provider, + completion_kwargs={}, + ): + global user_logger_fn, liteDebuggerClient + exception_mapping_worked = False + if litellm.suppress_debug_info is False: + print() # noqa + print( # noqa + "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa + ) # noqa + print( # noqa + "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." 
# noqa + ) # noqa + print() # noqa + try: + if model: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + + if "Request Timeout Error" in error_str or "Request timed out" in error_str: + exception_mapping_worked = True + raise Timeout( + message=f"APITimeoutError - Request timed out", + model=model, + llm_provider=custom_llm_provider, + ) + + if ( + custom_llm_provider == "openai" + or custom_llm_provider == "text-completion-openai" + or custom_llm_provider == "custom_openai" + or custom_llm_provider in litellm.openai_compatible_providers + ): + # custom_llm_provider is openai, make it OpenAI + if hasattr(original_exception, "message"): + message = original_exception.message + else: + message = str(original_exception) + if message is not None and isinstance(message, str): + message = message.replace("OPENAI", custom_llm_provider.upper()) + message = message.replace("openai", custom_llm_provider) + message = message.replace("OpenAI", custom_llm_provider) + if custom_llm_provider == "openai": + exception_provider = "OpenAI" + "Exception" + else: + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + + if ( + "This model's maximum context length is" in error_str + or "Request too large" in error_str + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "model_not_found" in error_str + ): + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "content_policy_violation" in error_str + ): + exception_mapping_worked = True + raise ContentPolicyViolationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "invalid_request_error" in error_str + and "Incorrect API key provided" not in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif ( + "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif "Mistral API raised a streaming error" in error_str: + exception_mapping_worked = True + _request = httpx.Request( + method="POST", url="https://api.openai.com/v1" + ) + raise APIError( + status_code=500, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=_request, + ) + elif hasattr(original_exception, "status_code"): + exception_mapping_worked = True + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) 
+ elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"{exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=original_exception.request, + ) + else: + # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors + raise APIConnectionError( + message=f"{exception_provider} - {message}", + llm_provider=custom_llm_provider, + model=model, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ) + elif custom_llm_provider == "anthropic": # one of the anthropics + if hasattr(original_exception, "message"): + if ( + "prompt is too long" in original_exception.message + or "prompt: length" in original_exception.message + ): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if "Invalid API Key" in original_exception.message: + exception_mapping_worked = True + raise AuthenticationError( + message=original_exception.message, + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + if hasattr(original_exception, "status_code"): + print_verbose(f"status_code: {original_exception.status_code}") + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + 
request=original_exception.request, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + model=model, + response=( + original_exception.response + if hasattr(original_exception, "response") + else httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", + url="https://docs.anthropic.com/claude/reference/messages_post", + ), + ) + ), + ) + else: + exception_mapping_worked = True + raise APIError( + status_code=original_exception.status_code, + message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.", + llm_provider="anthropic", + model=model, + request=original_exception.request, + ) + elif custom_llm_provider == "replicate": + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif "input is too long" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise BadRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + response=original_exception.response, + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {original_exception.message}", + llm_provider="replicate", + model=model, + response=original_exception.response, + ) + elif ( + original_exception.status_code == 400 + or original_exception.status_code == 422 + or original_exception.status_code == 413 + ): + exception_mapping_worked = True +> raise BadRequestError( + message=f"ReplicateException - {original_exception.message}", + model=model, + llm_provider="replicate", + response=original_exception.response, +E litellm.exceptions.BadRequestError: ReplicateException - Error: Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!, +E Replicate 
logs:MLC is currently not using any LoRAs. +E MLC: True +E Your formatted prompt is: +E [INST] <> +E You are a helpful, respectful and honest assistant. +E <> +E You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST] +E Not using LoRA +E Traceback (most recent call last): +E File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict +E for r in result: +E File "/src/predict.py", line 198, in predict +E for decoded_token in self.engine( +E File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__ +E for val in gen: +E File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__ +E self.cm.reset_chat(chat_config) +E File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat +E self._load_json_override_func(user_chat_config_json_str, True) +E File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__ +E File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall +E File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3 +E File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL +E File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error +E raise py_err +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E tvm._ffi.base.TVMError: Traceback (most recent call last): +E 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const +E at /workspace/mlc-llm/cpp/llm_chat.cc:1545 +E 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string, std::allocator > const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:483 +E 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool) +E at /workspace/mlc-llm/cpp/llm_chat.cc:387 +E File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387 +E TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number! 
+ +../utils.py:7547: BadRequestError +---------------------------- Captured stdout setup ----------------------------- + + +pytest fixture - resetting callbacks +----------------------------- Captured stdout call ----------------------------- + + +Request to litellm: +litellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, num_retries=3) + + +self.optional_params: {} +kwargs[caching]: False; litellm.cache: None +Final returned optional params: {'repetition_penalty': 0.1} +self.optional_params: {'repetition_penalty': 0.1} + + +POST Request Sent from LiteLLM: +curl -X POST \ +https://api.replicate.com/v1/models/meta/llama-2-7b-chat \ +-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \ +-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}' + + +https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg +replicate: polling endpoint: https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg +Non-streamed output: + +Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new +LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'. + +Logging Details: logger_fn - None | callable(logger_fn) - False + + +Request to litellm: +litellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=) + + +kwargs[caching]: False; litellm.cache: None +Final returned optional params: {'repetition_penalty': 0.1} +self.optional_params: {'repetition_penalty': 0.1} + + +POST Request Sent from LiteLLM: +curl -X POST \ +https://api.replicate.com/v1/models/meta/llama-2-7b-chat \ +-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \ +-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}' + + +https://api.replicate.com/v1/predictions/5a7rh5dx6xrgm0cers4t4gad2m +replicate: polling endpoint: https://api.replicate.com/v1/predictions/5a7rh5dx6xrgm0cers4t4gad2m +Non-streamed output: + +Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new +LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'. 
+ +Logging Details: logger_fn - None | callable(logger_fn) - False +Logging Details LiteLLM-Failure Call +self.failure_callback: [] + + +Request to litellm: +litellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=) + + +kwargs[caching]: False; litellm.cache: None +Final returned optional params: {'repetition_penalty': 0.1} +self.optional_params: {'repetition_penalty': 0.1} + + +POST Request Sent from LiteLLM: +curl -X POST \ +https://api.replicate.com/v1/models/meta/llama-2-7b-chat \ +-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \ +-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}' + + +https://api.replicate.com/v1/predictions/fdx5mgp0tnrgj0cers4r6taf9c +replicate: polling endpoint: https://api.replicate.com/v1/predictions/fdx5mgp0tnrgj0cers4r6taf9c +Non-streamed output: + +Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new +LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'. + +Logging Details: logger_fn - None | callable(logger_fn) - False +Logging Details LiteLLM-Failure Call +self.failure_callback: [] + + +Request to litellm: +litellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=) + + +kwargs[caching]: False; litellm.cache: None +Final returned optional params: {'repetition_penalty': 0.1} +self.optional_params: {'repetition_penalty': 0.1} + + +POST Request Sent from LiteLLM: +curl -X POST \ +https://api.replicate.com/v1/models/meta/llama-2-7b-chat \ +-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \ +-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}' + + +https://api.replicate.com/v1/predictions/1772b6y4qxrgp0cers4s0adhpr +replicate: polling endpoint: https://api.replicate.com/v1/predictions/1772b6y4qxrgp0cers4s0adhpr +Non-streamed output: + +Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new +LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'. + +Logging Details: logger_fn - None | callable(logger_fn) - False +Logging Details LiteLLM-Failure Call +self.failure_callback: [] +=============================== warnings summary =============================== +../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: 18 warnings + /opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning) + +../proxy/_types.py:167 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:167: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. 
You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:254 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:254: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + extra = Extra.allow # Allow extra fields + +../proxy/_types.py:257 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:257: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:286 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:286: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:333 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:333: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:399 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:399: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:411 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:411: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:451 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:451: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:477 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:477: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:740 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:740: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:763 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:763: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../proxy/_types.py:782 + /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:782: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ + @root_validator(pre=True) + +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121 + /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API + warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning) + +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: 10 warnings + /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`. + Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages + declare_namespace(pkg) + +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870 +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870 +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870 +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870 +../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870 + /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`. + Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. 
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API
+    warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: 10 warnings
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(parent)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.logging')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.iam')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('mpl_toolkits')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('sphinxcontrib')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
+  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('zope')`.
+  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+    declare_namespace(pkg)
+
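The `pkg_resources` warnings above come from installed third-party distributions (`google.*`, `zope`, `mpl_toolkits`, `sphinxcontrib`), not from litellm code; they recommend PEP 420 implicit namespace packages over `declare_namespace`. A rough orientation-only sketch of the difference, expressed as comments:

# Illustrative only -- not a litellm change.
#
# Legacy pkg_resources-style namespace package (what triggers the warning):
#   google/__init__.py containing:
#       __import__("pkg_resources").declare_namespace(__name__)
#
# PEP 420 implicit namespace package (preferred): ship the portions with no
# google/__init__.py at all, and Python 3.3+ merges them automatically:
#   google/cloud/...    (installed by one distribution)
#   google/logging/...  (installed by another)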
+../llms/prompt_templates/factory.py:6
+  /Users/krrishdholakia/Documents/litellm/litellm/llms/prompt_templates/factory.py:6: DeprecationWarning: 'imghdr' is deprecated and slated for removal in Python 3.13
+    import imghdr, base64
+
+-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
+=========================== short test summary info ============================
+FAILED test_completion.py::test_replicate_custom_prompt_dict - litellm.except...
+======================== 1 failed, 56 warnings in 4.33s ========================
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 8ea9ca760..abfb238e0 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1632,9 +1632,9 @@ def test_completion_replicate_vicuna():
 
 def test_replicate_custom_prompt_dict():
     litellm.set_verbose = True
-    model_name = "replicate/meta/llama-2-7b-chat"
+    model_name = "replicate/meta/llama-2-70b-chat"
     litellm.register_prompt_template(
-        model="replicate/meta/llama-2-7b-chat",
+        model="replicate/meta/llama-2-70b-chat",
         initial_prompt_value="You are a good assistant",  # [OPTIONAL]
         roles={
             "system": {
@@ -1660,6 +1660,7 @@ def test_replicate_custom_prompt_dict():
                 "content": "what is yc write 1 paragraph",
             }
         ],
+        repetition_penalty=0.1,
         num_retries=3,
     )
     print(f"response: {response}")
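The `factory.py:6` warning flags the stdlib `imghdr` module, which is slated for removal in Python 3.13. A hedged sketch of a magic-byte sniffer that could stand in for `imghdr.what()` on common base64 image payloads; the helper name and format coverage are assumptions, not the actual litellm fix:

# Hedged sketch only: a small replacement for imghdr.what(), limited to the
# image formats typically seen in chat payloads.
def sniff_image_type(data: bytes) -> str | None:
    if data.startswith(b"\xff\xd8\xff"):
        return "jpeg"
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "webp"
    return None  # unknown format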
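The `test_completion.py` hunks switch the Replicate test to `meta/llama-2-70b-chat` and add `repetition_penalty=0.1`, which litellm forwards to the provider as an extra parameter. A sketch of the full call the updated test makes; the `roles` dict is truncated in the diff, so its contents below (pre/post message wrappers in Llama-2 chat format) are an assumption:

# Sketch of the updated test's calls, assembled from the hunks above.
import litellm

litellm.register_prompt_template(
    model="replicate/meta/llama-2-70b-chat",
    initial_prompt_value="You are a good assistant",  # [OPTIONAL]
    roles={
        # assumed shape: per-role pre/post message wrappers
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
    },
)

response = litellm.completion(
    model="replicate/meta/llama-2-70b-chat",
    messages=[{"role": "user", "content": "what is yc write 1 paragraph"}],
    repetition_penalty=0.1,  # provider-specific kwarg, passed through to Replicate
    num_retries=3,
)
print(f"response: {response}")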