litellm-mirror/litellm/tests/log.txt

============================= test session starts ==============================
platform darwin -- Python 3.11.6, pytest-7.3.1, pluggy-1.3.0
rootdir: /Users/krrishdholakia/Documents/litellm/litellm/tests
plugins: timeout-2.2.0, asyncio-0.23.2, anyio-3.7.1, xdist-3.3.1
asyncio: mode=Mode.STRICT
collected 1 item

test_completion.py F                                                     [100%]

=================================== FAILURES ===================================
______________________ test_replicate_custom_prompt_dict _______________________

model = 'meta/llama-2-7b-chat'
messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}]
timeout = 600.0, temperature = None, top_p = None, n = None, stream = None
stop = None, max_tokens = None, presence_penalty = None
frequency_penalty = None, logit_bias = None, user = None, response_format = None
seed = None, tools = None, tool_choice = None, logprobs = None
top_logprobs = None, deployment_id = None, extra_headers = None
functions = None, function_call = None, base_url = None, api_version = None
api_key = None, model_list = None
kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': <litellm.utils.Logging object at 0x1043e1550>, 'num_retries': 3, 'repetition_penalty': 0.1}
args = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...}
api_base = 'https://api.replicate.com/v1', mock_response = None
force_timeout = 600, logger_fn = None, verbose = False
custom_llm_provider = 'replicate'

    @client
    def completion(
        model: str,
        # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
        messages: List = [],
        timeout: Optional[Union[float, int]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        n: Optional[int] = None,
        stream: Optional[bool] = None,
        stop=None,
        max_tokens: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[dict] = None,
        user: Optional[str] = None,
        # openai v1.0+ new params
        response_format: Optional[dict] = None,
        seed: Optional[int] = None,
        tools: Optional[List] = None,
        tool_choice: Optional[str] = None,
        logprobs: Optional[bool] = None,
        top_logprobs: Optional[int] = None,
        deployment_id=None,
        extra_headers: Optional[dict] = None,
        # soon to be deprecated params by OpenAI
        functions: Optional[List] = None,
        function_call: Optional[str] = None,
        # set api_base, api_version, api_key
        base_url: Optional[str] = None,
        api_version: Optional[str] = None,
        api_key: Optional[str] = None,
        model_list: Optional[list] = None,  # pass in a list of api_base,keys, etc.
        # Optional liteLLM function params
        **kwargs,
    ) -> Union[ModelResponse, CustomStreamWrapper]:
        """
        Perform a completion() using any of litellm supported llms (example gpt-4, gpt-3.5-turbo, claude-2, command-nightly)
        Parameters:
            model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/
            messages (List): A list of message objects representing the conversation context (default is an empty list).

            OPTIONAL PARAMS
            functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list).
            function_call (str, optional): The name of the function to call within the conversation (default is an empty string).
            temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0).
            top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
            n (int, optional): The number of completions to generate (default is 1).
            stream (bool, optional): If True, return a streaming response (default is False).
            stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
            max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
            presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
            frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
            logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
            user (str, optional):  A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse.
            logprobs (bool, optional): Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message
            top_logprobs (int, optional): An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.
            metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc.
            api_base (str, optional): Base URL for the API (default is None).
            api_version (str, optional): API version (default is None).
            api_key (str, optional): API key (default is None).
            model_list (list, optional): List of api base, version, keys
            extra_headers (dict, optional): Additional headers to include in the request.

            LITELLM Specific Params
            mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None).
            custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock"
            max_retries (int, optional): The number of retries to attempt (default is 0).
        Returns:
            ModelResponse: A response object containing the generated completion and associated metadata.

        Note:
            - This function is used to perform completions() using the specified language model.
            - It supports various optional parameters for customizing the completion behavior.
            - If 'mock_response' is provided, a mock completion response is returned for testing or debugging.
        """
        ######### unpacking kwargs #####################
        args = locals()
        api_base = kwargs.get("api_base", None)
        mock_response = kwargs.get("mock_response", None)
        force_timeout = kwargs.get("force_timeout", 600)  ## deprecated
        logger_fn = kwargs.get("logger_fn", None)
        verbose = kwargs.get("verbose", False)
        custom_llm_provider = kwargs.get("custom_llm_provider", None)
        litellm_logging_obj = kwargs.get("litellm_logging_obj", None)
        id = kwargs.get("id", None)
        metadata = kwargs.get("metadata", None)
        model_info = kwargs.get("model_info", None)
        proxy_server_request = kwargs.get("proxy_server_request", None)
        fallbacks = kwargs.get("fallbacks", None)
        headers = kwargs.get("headers", None)
        num_retries = kwargs.get("num_retries", None)  ## deprecated
        max_retries = kwargs.get("max_retries", None)
        context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
        organization = kwargs.get("organization", None)
        ### CUSTOM MODEL COST ###
        input_cost_per_token = kwargs.get("input_cost_per_token", None)
        output_cost_per_token = kwargs.get("output_cost_per_token", None)
        input_cost_per_second = kwargs.get("input_cost_per_second", None)
        output_cost_per_second = kwargs.get("output_cost_per_second", None)
        ### CUSTOM PROMPT TEMPLATE ###
        initial_prompt_value = kwargs.get("initial_prompt_value", None)
        roles = kwargs.get("roles", None)
        final_prompt_value = kwargs.get("final_prompt_value", None)
        bos_token = kwargs.get("bos_token", None)
        eos_token = kwargs.get("eos_token", None)
        preset_cache_key = kwargs.get("preset_cache_key", None)
        hf_model_name = kwargs.get("hf_model_name", None)
        ### TEXT COMPLETION CALLS ###
        text_completion = kwargs.get("text_completion", False)
        atext_completion = kwargs.get("atext_completion", False)
        ### ASYNC CALLS ###
        acompletion = kwargs.get("acompletion", False)
        client = kwargs.get("client", None)
        ### Admin Controls ###
        no_log = kwargs.get("no-log", False)
        ######## end of unpacking kwargs ###########
        openai_params = [
            "functions",
            "function_call",
            "temperature",
            "temperature",
            "top_p",
            "n",
            "stream",
            "stop",
            "max_tokens",
            "presence_penalty",
            "frequency_penalty",
            "logit_bias",
            "user",
            "request_timeout",
            "api_base",
            "api_version",
            "api_key",
            "deployment_id",
            "organization",
            "base_url",
            "default_headers",
            "timeout",
            "response_format",
            "seed",
            "tools",
            "tool_choice",
            "max_retries",
            "logprobs",
            "top_logprobs",
            "extra_headers",
        ]
        litellm_params = [
            "metadata",
            "acompletion",
            "atext_completion",
            "text_completion",
            "caching",
            "mock_response",
            "api_key",
            "api_version",
            "api_base",
            "force_timeout",
            "logger_fn",
            "verbose",
            "custom_llm_provider",
            "litellm_logging_obj",
            "litellm_call_id",
            "use_client",
            "id",
            "fallbacks",
            "azure",
            "headers",
            "model_list",
            "num_retries",
            "context_window_fallback_dict",
            "roles",
            "final_prompt_value",
            "bos_token",
            "eos_token",
            "request_timeout",
            "complete_response",
            "self",
            "client",
            "rpm",
            "tpm",
            "input_cost_per_token",
            "output_cost_per_token",
            "input_cost_per_second",
            "output_cost_per_second",
            "hf_model_name",
            "model_info",
            "proxy_server_request",
            "preset_cache_key",
            "caching_groups",
            "ttl",
            "cache",
            "no-log",
            "base_model",
            "stream_timeout",
        ]
        default_params = openai_params + litellm_params
        non_default_params = {
            k: v for k, v in kwargs.items() if k not in default_params
        }  # model-specific params - pass them straight to the model/provider
        if timeout is None:
            timeout = (
                kwargs.get("request_timeout", None) or 600
            )  # set timeout for 10 minutes by default
        timeout = float(timeout)
        try:
            if base_url is not None:
                api_base = base_url
            if max_retries is not None:  # openai allows openai.OpenAI(max_retries=3)
                num_retries = max_retries
            logging = litellm_logging_obj
            fallbacks = fallbacks or litellm.model_fallbacks
            if fallbacks is not None:
                return completion_with_fallbacks(**args)
            if model_list is not None:
                deployments = [
                    m["litellm_params"] for m in model_list if m["model_name"] == model
                ]
                return batch_completion_models(deployments=deployments, **args)
            if litellm.model_alias_map and model in litellm.model_alias_map:
                model = litellm.model_alias_map[
                    model
                ]  # update the model to the actual value if an alias has been passed in
            model_response = ModelResponse()
            if (
                kwargs.get("azure", False) == True
            ):  # don't remove flag check, to remain backwards compatible for repos like Codium
                custom_llm_provider = "azure"
            if deployment_id != None:  # azure llms
                model = deployment_id
                custom_llm_provider = "azure"
            model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(
                model=model,
                custom_llm_provider=custom_llm_provider,
                api_base=api_base,
                api_key=api_key,
            )
            if model_response is not None and hasattr(model_response, "_hidden_params"):
                model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
                model_response._hidden_params["region_name"] = kwargs.get(
                    "aws_region_name", None
                )  # support region-based pricing for bedrock

            ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
            if input_cost_per_token is not None and output_cost_per_token is not None:
                print_verbose(f"Registering model={model} in model cost map")
                litellm.register_model(
                    {
                        f"{custom_llm_provider}/{model}": {
                            "input_cost_per_token": input_cost_per_token,
                            "output_cost_per_token": output_cost_per_token,
                            "litellm_provider": custom_llm_provider,
                        },
                        model: {
                            "input_cost_per_token": input_cost_per_token,
                            "output_cost_per_token": output_cost_per_token,
                            "litellm_provider": custom_llm_provider,
                        },
                    }
                )
            elif (
                input_cost_per_second is not None
            ):  # time based pricing just needs cost in place
                output_cost_per_second = output_cost_per_second
                litellm.register_model(
                    {
                        f"{custom_llm_provider}/{model}": {
                            "input_cost_per_second": input_cost_per_second,
                            "output_cost_per_second": output_cost_per_second,
                            "litellm_provider": custom_llm_provider,
                        },
                        model: {
                            "input_cost_per_second": input_cost_per_second,
                            "output_cost_per_second": output_cost_per_second,
                            "litellm_provider": custom_llm_provider,
                        },
                    }
                )
            ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
            custom_prompt_dict = {}  # type: ignore
            if (
                initial_prompt_value
                or roles
                or final_prompt_value
                or bos_token
                or eos_token
            ):
                custom_prompt_dict = {model: {}}
                if initial_prompt_value:
                    custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value
                if roles:
                    custom_prompt_dict[model]["roles"] = roles
                if final_prompt_value:
                    custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value
                if bos_token:
                    custom_prompt_dict[model]["bos_token"] = bos_token
                if eos_token:
                    custom_prompt_dict[model]["eos_token"] = eos_token
            model_api_key = get_api_key(
                llm_provider=custom_llm_provider, dynamic_api_key=api_key
            )  # get the api key from the environment if required for the model

            if dynamic_api_key is not None:
                api_key = dynamic_api_key
            # check if user passed in any of the OpenAI optional params
            optional_params = get_optional_params(
                functions=functions,
                function_call=function_call,
                temperature=temperature,
                top_p=top_p,
                n=n,
                stream=stream,
                stop=stop,
                max_tokens=max_tokens,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                logit_bias=logit_bias,
                user=user,
                # params to identify the model
                model=model,
                custom_llm_provider=custom_llm_provider,
                response_format=response_format,
                seed=seed,
                tools=tools,
                tool_choice=tool_choice,
                max_retries=max_retries,
                logprobs=logprobs,
                top_logprobs=top_logprobs,
                extra_headers=extra_headers,
                **non_default_params,
            )

            if litellm.add_function_to_prompt and optional_params.get(
                "functions_unsupported_model", None
            ):  # if user opts to add it to prompt, when API doesn't support function calling
                functions_unsupported_model = optional_params.pop(
                    "functions_unsupported_model"
                )
                messages = function_call_prompt(
                    messages=messages, functions=functions_unsupported_model
                )

            # For logging - save the values of the litellm-specific params passed in
            litellm_params = get_litellm_params(
                acompletion=acompletion,
                api_key=api_key,
                force_timeout=force_timeout,
                logger_fn=logger_fn,
                verbose=verbose,
                custom_llm_provider=custom_llm_provider,
                api_base=api_base,
                litellm_call_id=kwargs.get("litellm_call_id", None),
                model_alias_map=litellm.model_alias_map,
                completion_call_id=id,
                metadata=metadata,
                model_info=model_info,
                proxy_server_request=proxy_server_request,
                preset_cache_key=preset_cache_key,
                no_log=no_log,
            )
            logging.update_environment_variables(
                model=model,
                user=user,
                optional_params=optional_params,
                litellm_params=litellm_params,
            )
            if mock_response:
                return mock_completion(
                    model,
                    messages,
                    stream=stream,
                    mock_response=mock_response,
                    logging=logging,
                    acompletion=acompletion,
                )
            if custom_llm_provider == "azure":
                # azure configs
                api_type = get_secret("AZURE_API_TYPE") or "azure"

                api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")

                api_version = (
                    api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
                )

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.azure_key
                    or get_secret("AZURE_OPENAI_API_KEY")
                    or get_secret("AZURE_API_KEY")
                )

                azure_ad_token = optional_params.get("extra_body", {}).pop(
                    "azure_ad_token", None
                ) or get_secret("AZURE_AD_TOKEN")

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.AzureOpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                response = azure_chat_completions.completion(
                    model=model,
                    messages=messages,
                    headers=headers,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    api_type=api_type,
                    azure_ad_token=azure_ad_token,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    logging_obj=logging,
                    acompletion=acompletion,
                    timeout=timeout,
                    client=client,  # pass AsyncAzureOpenAI, AzureOpenAI client
                )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={
                            "headers": headers,
                            "api_version": api_version,
                            "api_base": api_base,
                        },
                    )
            elif custom_llm_provider == "azure_text":
                # azure configs
                api_type = get_secret("AZURE_API_TYPE") or "azure"

                api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")

                api_version = (
                    api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
                )

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.azure_key
                    or get_secret("AZURE_OPENAI_API_KEY")
                    or get_secret("AZURE_API_KEY")
                )

                azure_ad_token = optional_params.get("extra_body", {}).pop(
                    "azure_ad_token", None
                ) or get_secret("AZURE_AD_TOKEN")

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.AzureOpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                response = azure_text_completions.completion(
                    model=model,
                    messages=messages,
                    headers=headers,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    api_type=api_type,
                    azure_ad_token=azure_ad_token,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    logging_obj=logging,
                    acompletion=acompletion,
                    timeout=timeout,
                    client=client,  # pass AsyncAzureOpenAI, AzureOpenAI client
                )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={
                            "headers": headers,
                            "api_version": api_version,
                            "api_base": api_base,
                        },
                    )
            elif (
                model in litellm.open_ai_chat_completion_models
                or custom_llm_provider == "custom_openai"
                or custom_llm_provider == "deepinfra"
                or custom_llm_provider == "perplexity"
                or custom_llm_provider == "groq"
                or custom_llm_provider == "anyscale"
                or custom_llm_provider == "mistral"
                or custom_llm_provider == "openai"
                or custom_llm_provider == "together_ai"
                or custom_llm_provider in litellm.openai_compatible_providers
                or "ft:gpt-3.5-turbo" in model  # finetune gpt-3.5-turbo
            ):  # allow user to make an openai call with a custom base
                # note: if a user sets a custom base - we should ensure this works
                # allow for the setting of dynamic and stateful api-bases
                api_base = (
                    api_base  # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
                    or litellm.api_base
                    or get_secret("OPENAI_API_BASE")
                    or "https://api.openai.com/v1"
                )
                openai.organization = (
                    organization
                    or litellm.organization
                    or get_secret("OPENAI_ORGANIZATION")
                    or None  # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
                )
                # set API KEY
                api_key = (
                    api_key
                    or litellm.api_key  # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
                    or litellm.openai_key
                    or get_secret("OPENAI_API_KEY")
                )

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.OpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                try:
                    response = openai_chat_completions.completion(
                        model=model,
                        messages=messages,
                        headers=headers,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        api_key=api_key,
                        api_base=api_base,
                        acompletion=acompletion,
                        logging_obj=logging,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        timeout=timeout,
                        custom_prompt_dict=custom_prompt_dict,
                        client=client,  # pass AsyncOpenAI, OpenAI client
                        organization=organization,
                        custom_llm_provider=custom_llm_provider,
                    )
                except Exception as e:
                    ## LOGGING - log the original exception returned
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=str(e),
                        additional_args={"headers": headers},
                    )
                    raise e

                if optional_params.get("stream", False):
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={"headers": headers},
                    )
            elif (
                custom_llm_provider == "text-completion-openai"
                or "ft:babbage-002" in model
                or "ft:davinci-002" in model  # support for finetuned completion models
            ):
                openai.api_type = "openai"

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("OPENAI_API_BASE")
                    or "https://api.openai.com/v1"
                )

                openai.api_version = None
                # set API KEY

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.openai_key
                    or get_secret("OPENAI_API_KEY")
                )

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.OpenAITextCompletionConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v
                if litellm.organization:
                    openai.organization = litellm.organization

                if (
                    len(messages) > 0
                    and "content" in messages[0]
                    and type(messages[0]["content"]) == list
                ):
                    # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
                    # https://platform.openai.com/docs/api-reference/completions/create
                    prompt = messages[0]["content"]
                else:
                    prompt = " ".join([message["content"] for message in messages])  # type: ignore

                ## COMPLETION CALL
                _response = openai_text_completions.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    api_key=api_key,
                    api_base=api_base,
                    acompletion=acompletion,
                    client=client,  # pass AsyncOpenAI, OpenAI client
                    logging_obj=logging,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    timeout=timeout,
                )

                if (
                    optional_params.get("stream", False) == False
                    and acompletion == False
                    and text_completion == False
                ):
                    # convert to chat completion response
                    _response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
                        response_object=_response, model_response_object=model_response
                    )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=_response,
                        additional_args={"headers": headers},
                    )
                response = _response
            elif (
                "replicate" in model
                or custom_llm_provider == "replicate"
                or model in litellm.replicate_models
            ):
                # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
                replicate_key = None
                replicate_key = (
                    api_key
                    or litellm.replicate_key
                    or litellm.api_key
                    or get_secret("REPLICATE_API_KEY")
                    or get_secret("REPLICATE_API_TOKEN")
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("REPLICATE_API_BASE")
                    or "https://api.replicate.com/v1"
                )

                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict

>               model_response = replicate.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,  # for calculating input/output tokens
                    api_key=replicate_key,
                    logging_obj=logging,
                    custom_prompt_dict=custom_prompt_dict,
                )

../main.py:1123:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'meta/llama-2-7b-chat'
messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}]
api_base = 'https://api.replicate.com/v1'
model_response = ModelResponse(id='chatcmpl-64c87434-83ce-4436-ac82-065d03e85dbd', choices=[Choices(finish_reason='stop', index=0, mess... role='assistant'))], created=1712723703, model=None, object='chat.completion', system_fingerprint=None, usage=Usage())
print_verbose = <function print_verbose at 0x108fa1440>
logging_obj = <litellm.utils.Logging object at 0x1043e1550>
api_key = 'r8_KkH9pMk1MOj0GTBijCFEGx5RpcDWd6K2jGKQK'
encoding = <Encoding 'cl100k_base'>
custom_prompt_dict = {'meta/llama-2-7b-chat': {'final_prompt_value': 'Now answer as best you can:', 'initial_prompt_value': 'You are a good...S>>\n [/INST]\n', 'pre_message': '[INST] <<SYS>>\n'}, 'user': {'post_message': ' [/INST]', 'pre_message': '[INST] '}}}}
optional_params = {'repetition_penalty': 0.1}
litellm_params = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1/models/meta/llama-2-7b-chat', 'api_key': None, 'completion_call_id': None, ...}
logger_fn = None

    def completion(
        model: str,
        messages: list,
        api_base: str,
        model_response: ModelResponse,
        print_verbose: Callable,
        logging_obj,
        api_key,
        encoding,
        custom_prompt_dict={},
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):
        # Start a prediction and get the prediction URL
        version_id = model_to_version_id(model)
        ## Load Config
        config = litellm.ReplicateConfig.get_config()
        for k, v in config.items():
            if (
                k not in optional_params
            ):  # completion(top_k=3) > replicate_config(top_k=3) <- allows for dynamic variables to be passed in
                optional_params[k] = v

        system_prompt = None
        if optional_params is not None and "supports_system_prompt" in optional_params:
            supports_sys_prompt = optional_params.pop("supports_system_prompt")
        else:
            supports_sys_prompt = False

        if supports_sys_prompt:
            for i in range(len(messages)):
                if messages[i]["role"] == "system":
                    first_sys_message = messages.pop(i)
                    system_prompt = first_sys_message["content"]
                    break

        if model in custom_prompt_dict:
            # check if the model has a registered custom prompt
            model_prompt_details = custom_prompt_dict[model]
            prompt = custom_prompt(
                role_dict=model_prompt_details.get("roles", {}),
                initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
                final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
                bos_token=model_prompt_details.get("bos_token", ""),
                eos_token=model_prompt_details.get("eos_token", ""),
                messages=messages,
            )
        else:
            prompt = prompt_factory(model=model, messages=messages)

        # If system prompt is supported, and a system prompt is provided, use it
        if system_prompt is not None:
            input_data = {
                "prompt": prompt,
                "system_prompt": system_prompt,
                **optional_params,
            }
        # Otherwise, use the prompt as is
        else:
            input_data = {"prompt": prompt, **optional_params}

        ## COMPLETION CALL
        ## Replicate Compeltion calls have 2 steps
        ## Step1: Start Prediction: gets a prediction url
        ## Step2: Poll prediction url for response
        ## Step2: is handled with and without streaming
        model_response["created"] = int(
            time.time()
        )  # for pricing this must remain right before calling api
        prediction_url = start_prediction(
            version_id,
            input_data,
            api_key,
            api_base,
            logging_obj=logging_obj,
            print_verbose=print_verbose,
        )
        print_verbose(prediction_url)

        # Handle the prediction response (streaming or non-streaming)
        if "stream" in optional_params and optional_params["stream"] == True:
            print_verbose("streaming request")
            return handle_prediction_response_streaming(
                prediction_url, api_key, print_verbose
            )
        else:
>           result, logs = handle_prediction_response(
                prediction_url, api_key, print_verbose
            )

../llms/replicate.py:307:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

prediction_url = 'https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg'
api_token = 'r8_KkH9pMk1MOj0GTBijCFEGx5RpcDWd6K2jGKQK'
print_verbose = <function print_verbose at 0x108fa1440>

    def handle_prediction_response(prediction_url, api_token, print_verbose):
        output_string = ""
        headers = {
            "Authorization": f"Token {api_token}",
            "Content-Type": "application/json",
        }

        status = ""
        logs = ""
        while True and (status not in ["succeeded", "failed", "canceled"]):
            print_verbose(f"replicate: polling endpoint: {prediction_url}")
            time.sleep(0.5)
            response = requests.get(prediction_url, headers=headers)
            if response.status_code == 200:
                response_data = response.json()
                if "output" in response_data:
                    output_string = "".join(response_data["output"])
                    print_verbose(f"Non-streamed output:{output_string}")
                status = response_data.get("status", None)
                logs = response_data.get("logs", "")
                if status == "failed":
                    replicate_error = response_data.get("error", "")
>                   raise ReplicateError(
                        status_code=400,
                        message=f"Error: {replicate_error}, \nReplicate logs:{logs}",
                    )
E                   litellm.llms.replicate.ReplicateError: Error: Traceback (most recent call last):
E                     2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                           at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                     1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                           at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                     0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                           at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                     File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                   TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!,
E                   Replicate logs:MLC is currently not using any LoRAs.
E                   MLC: True
E                   Your formatted prompt is:
E                   [INST] <<SYS>>
E                   You are a helpful, respectful and honest assistant.
E                   <</SYS>>
E                   You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST]
E                   Not using LoRA
E                   Traceback (most recent call last):
E                   File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict
E                   for r in result:
E                   File "/src/predict.py", line 198, in predict
E                   for decoded_token in self.engine(
E                   File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__
E                   for val in gen:
E                   File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__
E                   self.cm.reset_chat(chat_config)
E                   File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat
E                   self._load_json_override_func(user_chat_config_json_str, True)
E                   File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__
E                   File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
E                   File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
E                   File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
E                   File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
E                   raise py_err
E                   File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                   File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                   File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                   tvm._ffi.base.TVMError: Traceback (most recent call last):
E                   2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                   at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                   1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                   at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                   0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                   at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                   File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                   TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!

../llms/replicate.py:165: ReplicateError

During handling of the above exception, another exception occurred:

args = ()
kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': <litellm.utils.Logging object at 0x...ssages': [{'content': 'what is yc write 1 paragraph', 'role': 'user'}], 'model': 'replicate/meta/llama-2-7b-chat', ...}
result = None, start_time = datetime.datetime(2024, 4, 9, 21, 35, 3, 984661)
logging_obj = <litellm.utils.Logging object at 0x1043e1550>
call_type = 'completion', model = 'replicate/meta/llama-2-7b-chat'
k = 'litellm_logging_obj'

    @wraps(original_function)
    def wrapper(*args, **kwargs):
        # DO NOT MOVE THIS. It always needs to run first
        # Check if this is an async function. If so only execute the async function
        if (
            kwargs.get("acompletion", False) == True
            or kwargs.get("aembedding", False) == True
            or kwargs.get("aimg_generation", False) == True
            or kwargs.get("amoderation", False) == True
            or kwargs.get("atext_completion", False) == True
            or kwargs.get("atranscription", False) == True
        ):
            # [OPTIONAL] CHECK MAX RETRIES / REQUEST
            if litellm.num_retries_per_request is not None:
                # check if previous_models passed in as ['litellm_params']['metadata]['previous_models']
                previous_models = kwargs.get("metadata", {}).get(
                    "previous_models", None
                )
                if previous_models is not None:
                    if litellm.num_retries_per_request <= len(previous_models):
                        raise Exception(f"Max retries per request hit!")

            # MODEL CALL
            result = original_function(*args, **kwargs)
            if "stream" in kwargs and kwargs["stream"] == True:
                if (
                    "complete_response" in kwargs
                    and kwargs["complete_response"] == True
                ):
                    chunks = []
                    for idx, chunk in enumerate(result):
                        chunks.append(chunk)
                    return litellm.stream_chunk_builder(
                        chunks, messages=kwargs.get("messages", None)
                    )
                else:
                    return result
            return result

        # Prints Exactly what was passed to litellm function - don't execute any logic here - it should just print
        print_args_passed_to_litellm(original_function, args, kwargs)
        start_time = datetime.datetime.now()
        result = None
        logging_obj = kwargs.get("litellm_logging_obj", None)

        # only set litellm_call_id if its not in kwargs
        call_type = original_function.__name__
        if "litellm_call_id" not in kwargs:
            kwargs["litellm_call_id"] = str(uuid.uuid4())
        try:
            model = args[0] if len(args) > 0 else kwargs["model"]
        except:
            model = None
            if (
                call_type != CallTypes.image_generation.value
                and call_type != CallTypes.text_completion.value
            ):
                raise ValueError("model param not passed in.")

        try:
            if logging_obj is None:
                logging_obj, kwargs = function_setup(start_time, *args, **kwargs)
            kwargs["litellm_logging_obj"] = logging_obj

            # CHECK FOR 'os.environ/' in kwargs
            for k, v in kwargs.items():
                if v is not None and isinstance(v, str) and v.startswith("os.environ/"):
                    kwargs[k] = litellm.get_secret(v)
            # [OPTIONAL] CHECK BUDGET
            if litellm.max_budget:
                if litellm._current_cost > litellm.max_budget:
                    raise BudgetExceededError(
                        current_cost=litellm._current_cost,
                        max_budget=litellm.max_budget,
                    )

            # [OPTIONAL] CHECK MAX RETRIES / REQUEST
            if litellm.num_retries_per_request is not None:
                # check if previous_models passed in as ['litellm_params']['metadata]['previous_models']
                previous_models = kwargs.get("metadata", {}).get(
                    "previous_models", None
                )
                if previous_models is not None:
                    if litellm.num_retries_per_request <= len(previous_models):
                        raise Exception(f"Max retries per request hit!")

            # [OPTIONAL] CHECK CACHE
            print_verbose(
                f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
            )
            # if caching is false or cache["no-cache"]==True, don't run this
            if (
                (
                    (
                        kwargs.get("caching", None) is None
                        and kwargs.get("cache", None) is None
                        and litellm.cache is not None
                    )
                    or kwargs.get("caching", False) == True
                    or (
                        kwargs.get("cache", None) is not None
                        and kwargs.get("cache", {}).get("no-cache", False) != True
                    )
                )
                and kwargs.get("aembedding", False) != True
                and kwargs.get("acompletion", False) != True
                and kwargs.get("aimg_generation", False) != True
                and kwargs.get("atranscription", False) != True
            ):  # allow users to control returning cached responses from the completion function
                # checking cache
                print_verbose(f"INSIDE CHECKING CACHE")
                if (
                    litellm.cache is not None
                    and str(original_function.__name__)
                    in litellm.cache.supported_call_types
                ):
                    print_verbose(f"Checking Cache")
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = litellm.cache.get_cache(*args, **kwargs)
                    if cached_result != None:
                        if "detail" in cached_result:
                            # implies an error occurred
                            pass
                        else:
                            call_type = original_function.__name__
                            print_verbose(
                                f"Cache Response Object routing: call_type - {call_type}; cached_result instace: {type(cached_result)}"
                            )
                            if call_type == CallTypes.completion.value and isinstance(
                                cached_result, dict
                            ):
                                cached_result = convert_to_model_response_object(
                                    response_object=cached_result,
                                    model_response_object=ModelResponse(),
                                    stream=kwargs.get("stream", False),
                                )
                                if kwargs.get("stream", False) == True:
                                    cached_result = CustomStreamWrapper(
                                        completion_stream=cached_result,
                                        model=model,
                                        custom_llm_provider="cached_response",
                                        logging_obj=logging_obj,
                                    )
                            elif call_type == CallTypes.embedding.value and isinstance(
                                cached_result, dict
                            ):
                                cached_result = convert_to_model_response_object(
                                    response_object=cached_result,
                                    response_type="embedding",
                                )

                            # LOG SUCCESS
                            cache_hit = True
                            end_time = datetime.datetime.now()
                            (
                                model,
                                custom_llm_provider,
                                dynamic_api_key,
                                api_base,
                            ) = litellm.get_llm_provider(
                                model=model,
                                custom_llm_provider=kwargs.get(
                                    "custom_llm_provider", None
                                ),
                                api_base=kwargs.get("api_base", None),
                                api_key=kwargs.get("api_key", None),
                            )
                            print_verbose(
                                f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
                            )
                            logging_obj.update_environment_variables(
                                model=model,
                                user=kwargs.get("user", None),
                                optional_params={},
                                litellm_params={
                                    "logger_fn": kwargs.get("logger_fn", None),
                                    "acompletion": False,
                                    "metadata": kwargs.get("metadata", {}),
                                    "model_info": kwargs.get("model_info", {}),
                                    "proxy_server_request": kwargs.get(
                                        "proxy_server_request", None
                                    ),
                                    "preset_cache_key": kwargs.get(
                                        "preset_cache_key", None
                                    ),
                                    "stream_response": kwargs.get(
                                        "stream_response", {}
                                    ),
                                },
                                input=kwargs.get("messages", ""),
                                api_key=kwargs.get("api_key", None),
                                original_response=str(cached_result),
                                additional_args=None,
                                stream=kwargs.get("stream", False),
                            )
                            threading.Thread(
                                target=logging_obj.success_handler,
                                args=(cached_result, start_time, end_time, cache_hit),
                            ).start()
                            return cached_result

            # CHECK MAX TOKENS
            if (
                kwargs.get("max_tokens", None) is not None
                and model is not None
                and litellm.modify_params
                == True  # user is okay with params being modified
                and (
                    call_type == CallTypes.acompletion.value
                    or call_type == CallTypes.completion.value
                )
            ):
                try:
                    base_model = model
                    if kwargs.get("hf_model_name", None) is not None:
                        base_model = f"huggingface/{kwargs.get('hf_model_name')}"
                    max_output_tokens = (
                        get_max_tokens(model=base_model) or 4096
                    )  # assume min context window is 4k tokens
                    user_max_tokens = kwargs.get("max_tokens")
                    ## Scenario 1: User limit + prompt > model limit
                    messages = None
                    if len(args) > 1:
                        messages = args[1]
                    elif kwargs.get("messages", None):
                        messages = kwargs["messages"]
                    input_tokens = token_counter(model=base_model, messages=messages)
                    input_tokens += max(
                        0.1 * input_tokens, 10
                    )  # give at least a 10 token buffer. token counting can be imprecise.
                    if input_tokens > max_output_tokens:
                        pass  # allow call to fail normally
                    elif user_max_tokens + input_tokens > max_output_tokens:
                        user_max_tokens = max_output_tokens - input_tokens
                    print_verbose(f"user_max_tokens: {user_max_tokens}")
                    kwargs["max_tokens"] = int(
                        round(user_max_tokens)
                    )  # make sure max tokens is always an int
                except Exception as e:
                    print_verbose(f"Error while checking max token limit: {str(e)}")
            # MODEL CALL
>           result = original_function(*args, **kwargs)

../utils.py:2846:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'meta/llama-2-7b-chat'
messages = [{'content': 'what is yc write 1 paragraph', 'role': 'user'}]
timeout = 600.0, temperature = None, top_p = None, n = None, stream = None
stop = None, max_tokens = None, presence_penalty = None
frequency_penalty = None, logit_bias = None, user = None, response_format = None
seed = None, tools = None, tool_choice = None, logprobs = None
top_logprobs = None, deployment_id = None, extra_headers = None
functions = None, function_call = None, base_url = None, api_version = None
api_key = None, model_list = None
kwargs = {'litellm_call_id': '85a47e72-fb66-4654-85d4-6b34fbf52a0e', 'litellm_logging_obj': <litellm.utils.Logging object at 0x1043e1550>, 'num_retries': 3, 'repetition_penalty': 0.1}
args = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...}
api_base = 'https://api.replicate.com/v1', mock_response = None
force_timeout = 600, logger_fn = None, verbose = False
custom_llm_provider = 'replicate'

    @client
    def completion(
        model: str,
        # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
        messages: List = [],
        timeout: Optional[Union[float, int]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        n: Optional[int] = None,
        stream: Optional[bool] = None,
        stop=None,
        max_tokens: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[dict] = None,
        user: Optional[str] = None,
        # openai v1.0+ new params
        response_format: Optional[dict] = None,
        seed: Optional[int] = None,
        tools: Optional[List] = None,
        tool_choice: Optional[str] = None,
        logprobs: Optional[bool] = None,
        top_logprobs: Optional[int] = None,
        deployment_id=None,
        extra_headers: Optional[dict] = None,
        # soon to be deprecated params by OpenAI
        functions: Optional[List] = None,
        function_call: Optional[str] = None,
        # set api_base, api_version, api_key
        base_url: Optional[str] = None,
        api_version: Optional[str] = None,
        api_key: Optional[str] = None,
        model_list: Optional[list] = None,  # pass in a list of api_base,keys, etc.
        # Optional liteLLM function params
        **kwargs,
    ) -> Union[ModelResponse, CustomStreamWrapper]:
        """
        Perform a completion() using any of litellm supported llms (example gpt-4, gpt-3.5-turbo, claude-2, command-nightly)
        Parameters:
            model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/
            messages (List): A list of message objects representing the conversation context (default is an empty list).

            OPTIONAL PARAMS
            functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list).
            function_call (str, optional): The name of the function to call within the conversation (default is an empty string).
            temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0).
            top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
            n (int, optional): The number of completions to generate (default is 1).
            stream (bool, optional): If True, return a streaming response (default is False).
            stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
            max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
            presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
            frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
            logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
            user (str, optional):  A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse.
            logprobs (bool, optional): Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message
            top_logprobs (int, optional): An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.
            metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc.
            api_base (str, optional): Base URL for the API (default is None).
            api_version (str, optional): API version (default is None).
            api_key (str, optional): API key (default is None).
            model_list (list, optional): List of api base, version, keys
            extra_headers (dict, optional): Additional headers to include in the request.

            LITELLM Specific Params
            mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None).
            custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock"
            max_retries (int, optional): The number of retries to attempt (default is 0).
        Returns:
            ModelResponse: A response object containing the generated completion and associated metadata.

        Note:
            - This function is used to perform completions() using the specified language model.
            - It supports various optional parameters for customizing the completion behavior.
            - If 'mock_response' is provided, a mock completion response is returned for testing or debugging.
        """
        ######### unpacking kwargs #####################
        args = locals()
        api_base = kwargs.get("api_base", None)
        mock_response = kwargs.get("mock_response", None)
        force_timeout = kwargs.get("force_timeout", 600)  ## deprecated
        logger_fn = kwargs.get("logger_fn", None)
        verbose = kwargs.get("verbose", False)
        custom_llm_provider = kwargs.get("custom_llm_provider", None)
        litellm_logging_obj = kwargs.get("litellm_logging_obj", None)
        id = kwargs.get("id", None)
        metadata = kwargs.get("metadata", None)
        model_info = kwargs.get("model_info", None)
        proxy_server_request = kwargs.get("proxy_server_request", None)
        fallbacks = kwargs.get("fallbacks", None)
        headers = kwargs.get("headers", None)
        num_retries = kwargs.get("num_retries", None)  ## deprecated
        max_retries = kwargs.get("max_retries", None)
        context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
        organization = kwargs.get("organization", None)
        ### CUSTOM MODEL COST ###
        input_cost_per_token = kwargs.get("input_cost_per_token", None)
        output_cost_per_token = kwargs.get("output_cost_per_token", None)
        input_cost_per_second = kwargs.get("input_cost_per_second", None)
        output_cost_per_second = kwargs.get("output_cost_per_second", None)
        ### CUSTOM PROMPT TEMPLATE ###
        initial_prompt_value = kwargs.get("initial_prompt_value", None)
        roles = kwargs.get("roles", None)
        final_prompt_value = kwargs.get("final_prompt_value", None)
        bos_token = kwargs.get("bos_token", None)
        eos_token = kwargs.get("eos_token", None)
        preset_cache_key = kwargs.get("preset_cache_key", None)
        hf_model_name = kwargs.get("hf_model_name", None)
        ### TEXT COMPLETION CALLS ###
        text_completion = kwargs.get("text_completion", False)
        atext_completion = kwargs.get("atext_completion", False)
        ### ASYNC CALLS ###
        acompletion = kwargs.get("acompletion", False)
        client = kwargs.get("client", None)
        ### Admin Controls ###
        no_log = kwargs.get("no-log", False)
        ######## end of unpacking kwargs ###########
        openai_params = [
            "functions",
            "function_call",
            "temperature",
            "temperature",
            "top_p",
            "n",
            "stream",
            "stop",
            "max_tokens",
            "presence_penalty",
            "frequency_penalty",
            "logit_bias",
            "user",
            "request_timeout",
            "api_base",
            "api_version",
            "api_key",
            "deployment_id",
            "organization",
            "base_url",
            "default_headers",
            "timeout",
            "response_format",
            "seed",
            "tools",
            "tool_choice",
            "max_retries",
            "logprobs",
            "top_logprobs",
            "extra_headers",
        ]
        litellm_params = [
            "metadata",
            "acompletion",
            "atext_completion",
            "text_completion",
            "caching",
            "mock_response",
            "api_key",
            "api_version",
            "api_base",
            "force_timeout",
            "logger_fn",
            "verbose",
            "custom_llm_provider",
            "litellm_logging_obj",
            "litellm_call_id",
            "use_client",
            "id",
            "fallbacks",
            "azure",
            "headers",
            "model_list",
            "num_retries",
            "context_window_fallback_dict",
            "roles",
            "final_prompt_value",
            "bos_token",
            "eos_token",
            "request_timeout",
            "complete_response",
            "self",
            "client",
            "rpm",
            "tpm",
            "input_cost_per_token",
            "output_cost_per_token",
            "input_cost_per_second",
            "output_cost_per_second",
            "hf_model_name",
            "model_info",
            "proxy_server_request",
            "preset_cache_key",
            "caching_groups",
            "ttl",
            "cache",
            "no-log",
            "base_model",
            "stream_timeout",
        ]
        default_params = openai_params + litellm_params
        non_default_params = {
            k: v for k, v in kwargs.items() if k not in default_params
        }  # model-specific params - pass them straight to the model/provider
        if timeout is None:
            timeout = (
                kwargs.get("request_timeout", None) or 600
            )  # set timeout for 10 minutes by default
        timeout = float(timeout)
        try:
            if base_url is not None:
                api_base = base_url
            if max_retries is not None:  # openai allows openai.OpenAI(max_retries=3)
                num_retries = max_retries
            logging = litellm_logging_obj
            fallbacks = fallbacks or litellm.model_fallbacks
            if fallbacks is not None:
                return completion_with_fallbacks(**args)
            if model_list is not None:
                deployments = [
                    m["litellm_params"] for m in model_list if m["model_name"] == model
                ]
                return batch_completion_models(deployments=deployments, **args)
            if litellm.model_alias_map and model in litellm.model_alias_map:
                model = litellm.model_alias_map[
                    model
                ]  # update the model to the actual value if an alias has been passed in
            model_response = ModelResponse()
            if (
                kwargs.get("azure", False) == True
            ):  # don't remove flag check, to remain backwards compatible for repos like Codium
                custom_llm_provider = "azure"
            if deployment_id != None:  # azure llms
                model = deployment_id
                custom_llm_provider = "azure"
            model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(
                model=model,
                custom_llm_provider=custom_llm_provider,
                api_base=api_base,
                api_key=api_key,
            )
            if model_response is not None and hasattr(model_response, "_hidden_params"):
                model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
                model_response._hidden_params["region_name"] = kwargs.get(
                    "aws_region_name", None
                )  # support region-based pricing for bedrock

            ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
            if input_cost_per_token is not None and output_cost_per_token is not None:
                print_verbose(f"Registering model={model} in model cost map")
                litellm.register_model(
                    {
                        f"{custom_llm_provider}/{model}": {
                            "input_cost_per_token": input_cost_per_token,
                            "output_cost_per_token": output_cost_per_token,
                            "litellm_provider": custom_llm_provider,
                        },
                        model: {
                            "input_cost_per_token": input_cost_per_token,
                            "output_cost_per_token": output_cost_per_token,
                            "litellm_provider": custom_llm_provider,
                        },
                    }
                )
            elif (
                input_cost_per_second is not None
            ):  # time based pricing just needs cost in place
                output_cost_per_second = output_cost_per_second
                litellm.register_model(
                    {
                        f"{custom_llm_provider}/{model}": {
                            "input_cost_per_second": input_cost_per_second,
                            "output_cost_per_second": output_cost_per_second,
                            "litellm_provider": custom_llm_provider,
                        },
                        model: {
                            "input_cost_per_second": input_cost_per_second,
                            "output_cost_per_second": output_cost_per_second,
                            "litellm_provider": custom_llm_provider,
                        },
                    }
                )
            ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
            custom_prompt_dict = {}  # type: ignore
            if (
                initial_prompt_value
                or roles
                or final_prompt_value
                or bos_token
                or eos_token
            ):
                custom_prompt_dict = {model: {}}
                if initial_prompt_value:
                    custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value
                if roles:
                    custom_prompt_dict[model]["roles"] = roles
                if final_prompt_value:
                    custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value
                if bos_token:
                    custom_prompt_dict[model]["bos_token"] = bos_token
                if eos_token:
                    custom_prompt_dict[model]["eos_token"] = eos_token
            model_api_key = get_api_key(
                llm_provider=custom_llm_provider, dynamic_api_key=api_key
            )  # get the api key from the environment if required for the model

            if dynamic_api_key is not None:
                api_key = dynamic_api_key
            # check if user passed in any of the OpenAI optional params
            optional_params = get_optional_params(
                functions=functions,
                function_call=function_call,
                temperature=temperature,
                top_p=top_p,
                n=n,
                stream=stream,
                stop=stop,
                max_tokens=max_tokens,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                logit_bias=logit_bias,
                user=user,
                # params to identify the model
                model=model,
                custom_llm_provider=custom_llm_provider,
                response_format=response_format,
                seed=seed,
                tools=tools,
                tool_choice=tool_choice,
                max_retries=max_retries,
                logprobs=logprobs,
                top_logprobs=top_logprobs,
                extra_headers=extra_headers,
                **non_default_params,
            )

            if litellm.add_function_to_prompt and optional_params.get(
                "functions_unsupported_model", None
            ):  # if user opts to add it to prompt, when API doesn't support function calling
                functions_unsupported_model = optional_params.pop(
                    "functions_unsupported_model"
                )
                messages = function_call_prompt(
                    messages=messages, functions=functions_unsupported_model
                )

            # For logging - save the values of the litellm-specific params passed in
            litellm_params = get_litellm_params(
                acompletion=acompletion,
                api_key=api_key,
                force_timeout=force_timeout,
                logger_fn=logger_fn,
                verbose=verbose,
                custom_llm_provider=custom_llm_provider,
                api_base=api_base,
                litellm_call_id=kwargs.get("litellm_call_id", None),
                model_alias_map=litellm.model_alias_map,
                completion_call_id=id,
                metadata=metadata,
                model_info=model_info,
                proxy_server_request=proxy_server_request,
                preset_cache_key=preset_cache_key,
                no_log=no_log,
            )
            logging.update_environment_variables(
                model=model,
                user=user,
                optional_params=optional_params,
                litellm_params=litellm_params,
            )
            if mock_response:
                return mock_completion(
                    model,
                    messages,
                    stream=stream,
                    mock_response=mock_response,
                    logging=logging,
                    acompletion=acompletion,
                )
            if custom_llm_provider == "azure":
                # azure configs
                api_type = get_secret("AZURE_API_TYPE") or "azure"

                api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")

                api_version = (
                    api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
                )

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.azure_key
                    or get_secret("AZURE_OPENAI_API_KEY")
                    or get_secret("AZURE_API_KEY")
                )

                azure_ad_token = optional_params.get("extra_body", {}).pop(
                    "azure_ad_token", None
                ) or get_secret("AZURE_AD_TOKEN")

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.AzureOpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                response = azure_chat_completions.completion(
                    model=model,
                    messages=messages,
                    headers=headers,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    api_type=api_type,
                    azure_ad_token=azure_ad_token,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    logging_obj=logging,
                    acompletion=acompletion,
                    timeout=timeout,
                    client=client,  # pass AsyncAzureOpenAI, AzureOpenAI client
                )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={
                            "headers": headers,
                            "api_version": api_version,
                            "api_base": api_base,
                        },
                    )
            elif custom_llm_provider == "azure_text":
                # azure configs
                api_type = get_secret("AZURE_API_TYPE") or "azure"

                api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")

                api_version = (
                    api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
                )

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.azure_key
                    or get_secret("AZURE_OPENAI_API_KEY")
                    or get_secret("AZURE_API_KEY")
                )

                azure_ad_token = optional_params.get("extra_body", {}).pop(
                    "azure_ad_token", None
                ) or get_secret("AZURE_AD_TOKEN")

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.AzureOpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > azure_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                response = azure_text_completions.completion(
                    model=model,
                    messages=messages,
                    headers=headers,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    api_type=api_type,
                    azure_ad_token=azure_ad_token,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    logging_obj=logging,
                    acompletion=acompletion,
                    timeout=timeout,
                    client=client,  # pass AsyncAzureOpenAI, AzureOpenAI client
                )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={
                            "headers": headers,
                            "api_version": api_version,
                            "api_base": api_base,
                        },
                    )
            elif (
                model in litellm.open_ai_chat_completion_models
                or custom_llm_provider == "custom_openai"
                or custom_llm_provider == "deepinfra"
                or custom_llm_provider == "perplexity"
                or custom_llm_provider == "groq"
                or custom_llm_provider == "anyscale"
                or custom_llm_provider == "mistral"
                or custom_llm_provider == "openai"
                or custom_llm_provider == "together_ai"
                or custom_llm_provider in litellm.openai_compatible_providers
                or "ft:gpt-3.5-turbo" in model  # finetune gpt-3.5-turbo
            ):  # allow user to make an openai call with a custom base
                # note: if a user sets a custom base - we should ensure this works
                # allow for the setting of dynamic and stateful api-bases
                api_base = (
                    api_base  # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
                    or litellm.api_base
                    or get_secret("OPENAI_API_BASE")
                    or "https://api.openai.com/v1"
                )
                openai.organization = (
                    organization
                    or litellm.organization
                    or get_secret("OPENAI_ORGANIZATION")
                    or None  # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
                )
                # set API KEY
                api_key = (
                    api_key
                    or litellm.api_key  # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
                    or litellm.openai_key
                    or get_secret("OPENAI_API_KEY")
                )

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.OpenAIConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v

                ## COMPLETION CALL
                try:
                    response = openai_chat_completions.completion(
                        model=model,
                        messages=messages,
                        headers=headers,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        api_key=api_key,
                        api_base=api_base,
                        acompletion=acompletion,
                        logging_obj=logging,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        timeout=timeout,
                        custom_prompt_dict=custom_prompt_dict,
                        client=client,  # pass AsyncOpenAI, OpenAI client
                        organization=organization,
                        custom_llm_provider=custom_llm_provider,
                    )
                except Exception as e:
                    ## LOGGING - log the original exception returned
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=str(e),
                        additional_args={"headers": headers},
                    )
                    raise e

                if optional_params.get("stream", False):
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                        additional_args={"headers": headers},
                    )
            elif (
                custom_llm_provider == "text-completion-openai"
                or "ft:babbage-002" in model
                or "ft:davinci-002" in model  # support for finetuned completion models
            ):
                openai.api_type = "openai"

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("OPENAI_API_BASE")
                    or "https://api.openai.com/v1"
                )

                openai.api_version = None
                # set API KEY

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.openai_key
                    or get_secret("OPENAI_API_KEY")
                )

                headers = headers or litellm.headers

                ## LOAD CONFIG - if set
                config = litellm.OpenAITextCompletionConfig.get_config()
                for k, v in config.items():
                    if (
                        k not in optional_params
                    ):  # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in
                        optional_params[k] = v
                if litellm.organization:
                    openai.organization = litellm.organization

                if (
                    len(messages) > 0
                    and "content" in messages[0]
                    and type(messages[0]["content"]) == list
                ):
                    # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
                    # https://platform.openai.com/docs/api-reference/completions/create
                    prompt = messages[0]["content"]
                else:
                    prompt = " ".join([message["content"] for message in messages])  # type: ignore

                ## COMPLETION CALL
                _response = openai_text_completions.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    api_key=api_key,
                    api_base=api_base,
                    acompletion=acompletion,
                    client=client,  # pass AsyncOpenAI, OpenAI client
                    logging_obj=logging,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    timeout=timeout,
                )

                if (
                    optional_params.get("stream", False) == False
                    and acompletion == False
                    and text_completion == False
                ):
                    # convert to chat completion response
                    _response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
                        response_object=_response, model_response_object=model_response
                    )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=_response,
                        additional_args={"headers": headers},
                    )
                response = _response
            elif (
                "replicate" in model
                or custom_llm_provider == "replicate"
                or model in litellm.replicate_models
            ):
                # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
                replicate_key = None
                replicate_key = (
                    api_key
                    or litellm.replicate_key
                    or litellm.api_key
                    or get_secret("REPLICATE_API_KEY")
                    or get_secret("REPLICATE_API_TOKEN")
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("REPLICATE_API_BASE")
                    or "https://api.replicate.com/v1"
                )

                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict

                model_response = replicate.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,  # for calculating input/output tokens
                    api_key=replicate_key,
                    logging_obj=logging,
                    custom_prompt_dict=custom_prompt_dict,
                )
                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    model_response = CustomStreamWrapper(model_response, model, logging_obj=logging, custom_llm_provider="replicate")  # type: ignore

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=replicate_key,
                        original_response=model_response,
                    )

                response = model_response

            elif custom_llm_provider == "anthropic":
                api_key = (
                    api_key
                    or litellm.anthropic_key
                    or litellm.api_key
                    or os.environ.get("ANTHROPIC_API_KEY")
                )
                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict

                if (model == "claude-2") or (model == "claude-instant-1"):
                    # call anthropic /completion, only use this route for claude-2, claude-instant-1
                    api_base = (
                        api_base
                        or litellm.api_base
                        or get_secret("ANTHROPIC_API_BASE")
                        or "https://api.anthropic.com/v1/complete"
                    )
                    response = anthropic_text.completion(
                        model=model,
                        messages=messages,
                        api_base=api_base,
                        custom_prompt_dict=litellm.custom_prompt_dict,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        encoding=encoding,  # for calculating input/output tokens
                        api_key=api_key,
                        logging_obj=logging,
                        headers=headers,
                    )
                else:
                    # call /messages
                    # default route for all anthropic models
                    api_base = (
                        api_base
                        or litellm.api_base
                        or get_secret("ANTHROPIC_API_BASE")
                        or "https://api.anthropic.com/v1/messages"
                    )
                    response = anthropic_chat_completions.completion(
                        model=model,
                        messages=messages,
                        api_base=api_base,
                        acompletion=acompletion,
                        custom_prompt_dict=litellm.custom_prompt_dict,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        encoding=encoding,  # for calculating input/output tokens
                        api_key=api_key,
                        logging_obj=logging,
                        headers=headers,
                    )
                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                    )
                response = response
            elif custom_llm_provider == "nlp_cloud":
                nlp_cloud_key = (
                    api_key
                    or litellm.nlp_cloud_key
                    or get_secret("NLP_CLOUD_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("NLP_CLOUD_API_BASE")
                    or "https://api.nlpcloud.io/v1/gpu/"
                )

                response = nlp_cloud.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=nlp_cloud_key,
                    logging_obj=logging,
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        response,
                        model,
                        custom_llm_provider="nlp_cloud",
                        logging_obj=logging,
                    )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                    )

                response = response
            elif custom_llm_provider == "aleph_alpha":
                aleph_alpha_key = (
                    api_key
                    or litellm.aleph_alpha_key
                    or get_secret("ALEPH_ALPHA_API_KEY")
                    or get_secret("ALEPHALPHA_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("ALEPH_ALPHA_API_BASE")
                    or "https://api.aleph-alpha.com/complete"
                )

                model_response = aleph_alpha.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    default_max_tokens_to_sample=litellm.max_tokens,
                    api_key=aleph_alpha_key,
                    logging_obj=logging,  # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="aleph_alpha",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "cohere":
                cohere_key = (
                    api_key
                    or litellm.cohere_key
                    or get_secret("COHERE_API_KEY")
                    or get_secret("CO_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("COHERE_API_BASE")
                    or "https://api.cohere.ai/v1/generate"
                )

                model_response = cohere.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=cohere_key,
                    logging_obj=logging,  # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="cohere",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "cohere_chat":
                cohere_key = (
                    api_key
                    or litellm.cohere_key
                    or get_secret("COHERE_API_KEY")
                    or get_secret("CO_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("COHERE_API_BASE")
                    or "https://api.cohere.ai/v1/chat"
                )

                model_response = cohere_chat.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=cohere_key,
                    logging_obj=logging,  # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="cohere_chat",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "maritalk":
                maritalk_key = (
                    api_key
                    or litellm.maritalk_key
                    or get_secret("MARITALK_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("MARITALK_API_BASE")
                    or "https://chat.maritaca.ai/api/chat/inference"
                )

                model_response = maritalk.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=maritalk_key,
                    logging_obj=logging,
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="maritalk",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "huggingface":
                custom_llm_provider = "huggingface"
                huggingface_key = (
                    api_key
                    or litellm.huggingface_key
                    or os.environ.get("HF_TOKEN")
                    or os.environ.get("HUGGINGFACE_API_KEY")
                    or litellm.api_key
                )
                hf_headers = headers or litellm.headers

                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
                model_response = huggingface.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,  # type: ignore
                    headers=hf_headers,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=huggingface_key,
                    acompletion=acompletion,
                    logging_obj=logging,
                    custom_prompt_dict=custom_prompt_dict,
                    timeout=timeout,
                )
                if (
                    "stream" in optional_params
                    and optional_params["stream"] == True
                    and acompletion is False
                ):
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="huggingface",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "oobabooga":
                custom_llm_provider = "oobabooga"
                model_response = oobabooga.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    api_base=api_base,  # type: ignore
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    api_key=None,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    logging_obj=logging,
                )
                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="oobabooga",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "openrouter":
                api_base = api_base or litellm.api_base or "https://openrouter.ai/api/v1"

                api_key = (
                    api_key
                    or litellm.api_key
                    or litellm.openrouter_key
                    or get_secret("OPENROUTER_API_KEY")
                    or get_secret("OR_API_KEY")
                )

                openrouter_site_url = get_secret("OR_SITE_URL") or "https://litellm.ai"

                openrouter_app_name = get_secret("OR_APP_NAME") or "liteLLM"

                headers = (
                    headers
                    or litellm.headers
                    or {
                        "HTTP-Referer": openrouter_site_url,
                        "X-Title": openrouter_app_name,
                    }
                )

                ## Load Config
                config = openrouter.OpenrouterConfig.get_config()
                for k, v in config.items():
                    if k == "extra_body":
                        # we use openai 'extra_body' to pass openrouter specific params - transforms, route, models
                        if "extra_body" in optional_params:
                            optional_params[k].update(v)
                        else:
                            optional_params[k] = v
                    elif k not in optional_params:
                        optional_params[k] = v

                data = {"model": model, "messages": messages, **optional_params}

                ## COMPLETION CALL
                response = openai_chat_completions.completion(
                    model=model,
                    messages=messages,
                    headers=headers,
                    api_key=api_key,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    logging_obj=logging,
                    acompletion=acompletion,
                    timeout=timeout,
                )
                ## LOGGING
                logging.post_call(
                    input=messages, api_key=openai.api_key, original_response=response
                )
            elif (
                custom_llm_provider == "together_ai"
                or ("togethercomputer" in model)
                or (model in litellm.together_ai_models)
            ):
                """
                Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
                """
                custom_llm_provider = "together_ai"
                together_ai_key = (
                    api_key
                    or litellm.togetherai_api_key
                    or get_secret("TOGETHER_AI_TOKEN")
                    or get_secret("TOGETHERAI_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("TOGETHERAI_API_BASE")
                    or "https://api.together.xyz/inference"
                )

                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict

                model_response = together_ai.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=together_ai_key,
                    logging_obj=logging,
                    custom_prompt_dict=custom_prompt_dict,
                )
                if (
                    "stream_tokens" in optional_params
                    and optional_params["stream_tokens"] == True
                ):
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="together_ai",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "palm":
                palm_api_key = api_key or get_secret("PALM_API_KEY") or litellm.api_key

                # palm does not support streaming as yet :(
                model_response = palm.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=palm_api_key,
                    logging_obj=logging,
                )
                # fake palm streaming
                if "stream" in optional_params and optional_params["stream"] == True:
                    # fake streaming for palm
                    resp_string = model_response["choices"][0]["message"]["content"]
                    response = CustomStreamWrapper(
                        resp_string, model, custom_llm_provider="palm", logging_obj=logging
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "gemini":
                gemini_api_key = (
                    api_key
                    or get_secret("GEMINI_API_KEY")
                    or get_secret("PALM_API_KEY")  # older palm api key should also work
                    or litellm.api_key
                )

                # palm does not support streaming as yet :(
                model_response = gemini.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=gemini_api_key,
                    logging_obj=logging,
                    acompletion=acompletion,
                    custom_prompt_dict=custom_prompt_dict,
                )
                if (
                    "stream" in optional_params
                    and optional_params["stream"] == True
                    and acompletion == False
                ):
                    response = CustomStreamWrapper(
                        iter(model_response),
                        model,
                        custom_llm_provider="gemini",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "vertex_ai":
                vertex_ai_project = (
                    optional_params.pop("vertex_project", None)
                    or optional_params.pop("vertex_ai_project", None)
                    or litellm.vertex_project
                    or get_secret("VERTEXAI_PROJECT")
                )
                vertex_ai_location = (
                    optional_params.pop("vertex_location", None)
                    or optional_params.pop("vertex_ai_location", None)
                    or litellm.vertex_location
                    or get_secret("VERTEXAI_LOCATION")
                )

                if "claude-3" in model:
                    model_response = vertex_ai_anthropic.completion(
                        model=model,
                        messages=messages,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        encoding=encoding,
                        vertex_location=vertex_ai_location,
                        vertex_project=vertex_ai_project,
                        logging_obj=logging,
                        acompletion=acompletion,
                    )
                else:
                    model_response = vertex_ai.completion(
                        model=model,
                        messages=messages,
                        model_response=model_response,
                        print_verbose=print_verbose,
                        optional_params=optional_params,
                        litellm_params=litellm_params,
                        logger_fn=logger_fn,
                        encoding=encoding,
                        vertex_location=vertex_ai_location,
                        vertex_project=vertex_ai_project,
                        logging_obj=logging,
                        acompletion=acompletion,
                    )

                if (
                    "stream" in optional_params
                    and optional_params["stream"] == True
                    and acompletion == False
                ):
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="vertex_ai",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "ai21":
                custom_llm_provider = "ai21"
                ai21_key = (
                    api_key
                    or litellm.ai21_key
                    or os.environ.get("AI21_API_KEY")
                    or litellm.api_key
                )

                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("AI21_API_BASE")
                    or "https://api.ai21.com/studio/v1/"
                )

                model_response = ai21.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=ai21_key,
                    logging_obj=logging,
                )

                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="ai21",
                        logging_obj=logging,
                    )
                    return response

                ## RESPONSE OBJECT
                response = model_response
            elif custom_llm_provider == "sagemaker":
                # boto3 reads keys from .env
                model_response = sagemaker.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    custom_prompt_dict=custom_prompt_dict,
                    hf_model_name=hf_model_name,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    logging_obj=logging,
                    acompletion=acompletion,
                )
                if (
                    "stream" in optional_params and optional_params["stream"] == True
                ):  ## [BETA]
                    print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
                    from .llms.sagemaker import TokenIterator

                    tokenIterator = TokenIterator(model_response, acompletion=acompletion)
                    response = CustomStreamWrapper(
                        completion_stream=tokenIterator,
                        model=model,
                        custom_llm_provider="sagemaker",
                        logging_obj=logging,
                    )
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=None,
                        original_response=response,
                    )
                    return response

                ## RESPONSE OBJECT
                response = model_response
            elif custom_llm_provider == "bedrock":
                # boto3 reads keys from .env
                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
                response = bedrock.completion(
                    model=model,
                    messages=messages,
                    custom_prompt_dict=litellm.custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    logging_obj=logging,
                    timeout=timeout,
                )

                if (
                    "stream" in optional_params
                    and optional_params["stream"] == True
                    and not isinstance(response, CustomStreamWrapper)
                ):
                    # don't try to access stream object,
                    if "ai21" in model:
                        response = CustomStreamWrapper(
                            response,
                            model,
                            custom_llm_provider="bedrock",
                            logging_obj=logging,
                        )
                    else:
                        response = CustomStreamWrapper(
                            iter(response),
                            model,
                            custom_llm_provider="bedrock",
                            logging_obj=logging,
                        )

                if optional_params.get("stream", False):
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=None,
                        original_response=response,
                    )

                ## RESPONSE OBJECT
                response = response
            elif custom_llm_provider == "vllm":
                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
                model_response = vllm.completion(
                    model=model,
                    messages=messages,
                    custom_prompt_dict=custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    logging_obj=logging,
                )

                if (
                    "stream" in optional_params and optional_params["stream"] == True
                ):  ## [BETA]
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="vllm",
                        logging_obj=logging,
                    )
                    return response

                ## RESPONSE OBJECT
                response = model_response
            elif custom_llm_provider == "ollama":
                api_base = (
                    litellm.api_base
                    or api_base
                    or get_secret("OLLAMA_API_BASE")
                    or "http://localhost:11434"
                )
                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
                if model in custom_prompt_dict:
                    # check if the model has a registered custom prompt
                    model_prompt_details = custom_prompt_dict[model]
                    prompt = custom_prompt(
                        role_dict=model_prompt_details["roles"],
                        initial_prompt_value=model_prompt_details["initial_prompt_value"],
                        final_prompt_value=model_prompt_details["final_prompt_value"],
                        messages=messages,
                    )
                else:
                    prompt = prompt_factory(
                        model=model,
                        messages=messages,
                        custom_llm_provider=custom_llm_provider,
                    )
                    if isinstance(prompt, dict):
                        # for multimode models - ollama/llava prompt_factory returns a dict {
                        #     "prompt": prompt,
                        #     "images": images
                        # }
                        prompt, images = prompt["prompt"], prompt["images"]
                        optional_params["images"] = images

                ## LOGGING
                generator = ollama.get_ollama_response(
                    api_base,
                    model,
                    prompt,
                    optional_params,
                    logging_obj=logging,
                    acompletion=acompletion,
                    model_response=model_response,
                    encoding=encoding,
                )
                if acompletion is True or optional_params.get("stream", False) == True:
                    return generator

                response = generator
            elif custom_llm_provider == "ollama_chat":
                api_base = (
                    litellm.api_base
                    or api_base
                    or get_secret("OLLAMA_API_BASE")
                    or "http://localhost:11434"
                )

                ## LOGGING
                generator = ollama_chat.get_ollama_response(
                    api_base,
                    model,
                    messages,
                    optional_params,
                    logging_obj=logging,
                    acompletion=acompletion,
                    model_response=model_response,
                    encoding=encoding,
                )
                if acompletion is True or optional_params.get("stream", False) == True:
                    return generator

                response = generator
            elif custom_llm_provider == "cloudflare":
                api_key = (
                    api_key
                    or litellm.cloudflare_api_key
                    or litellm.api_key
                    or get_secret("CLOUDFLARE_API_KEY")
                )
                account_id = get_secret("CLOUDFLARE_ACCOUNT_ID")
                api_base = (
                    api_base
                    or litellm.api_base
                    or get_secret("CLOUDFLARE_API_BASE")
                    or f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/"
                )

                custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
                response = cloudflare.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    custom_prompt_dict=litellm.custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,  # for calculating input/output tokens
                    api_key=api_key,
                    logging_obj=logging,
                )
                if "stream" in optional_params and optional_params["stream"] == True:
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        response,
                        model,
                        custom_llm_provider="cloudflare",
                        logging_obj=logging,
                    )

                if optional_params.get("stream", False) or acompletion == True:
                    ## LOGGING
                    logging.post_call(
                        input=messages,
                        api_key=api_key,
                        original_response=response,
                    )
                response = response
            elif (
                custom_llm_provider == "baseten"
                or litellm.api_base == "https://app.baseten.co"
            ):
                custom_llm_provider = "baseten"
                baseten_key = (
                    api_key
                    or litellm.baseten_key
                    or os.environ.get("BASETEN_API_KEY")
                    or litellm.api_key
                )

                model_response = baseten.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    api_key=baseten_key,
                    logging_obj=logging,
                )
                if inspect.isgenerator(model_response) or (
                    "stream" in optional_params and optional_params["stream"] == True
                ):
                    # don't try to access stream object,
                    response = CustomStreamWrapper(
                        model_response,
                        model,
                        custom_llm_provider="baseten",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "petals" or model in litellm.petals_models:
                api_base = api_base or litellm.api_base

                custom_llm_provider = "petals"
                stream = optional_params.pop("stream", False)
                model_response = petals.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    logging_obj=logging,
                )
                if stream == True:  ## [BETA]
                    # Fake streaming for petals
                    resp_string = model_response["choices"][0]["message"]["content"]
                    response = CustomStreamWrapper(
                        resp_string,
                        model,
                        custom_llm_provider="petals",
                        logging_obj=logging,
                    )
                    return response
                response = model_response
            elif custom_llm_provider == "custom":
                import requests

                url = litellm.api_base or api_base or ""
                if url == None or url == "":
                    raise ValueError(
                        "api_base not set. Set api_base or litellm.api_base for custom endpoints"
                    )

                """
                assume input to custom LLM api bases follow this format:
                resp = requests.post(
                    api_base,
                    json={
                        'model': 'meta-llama/Llama-2-13b-hf', # model name
                        'params': {
                            'prompt': ["The capital of France is P"],
                            'max_tokens': 32,
                            'temperature': 0.7,
                            'top_p': 1.0,
                            'top_k': 40,
                        }
                    }
                )

                """
                prompt = " ".join([message["content"] for message in messages])  # type: ignore
                resp = requests.post(
                    url,
                    json={
                        "model": model,
                        "params": {
                            "prompt": [prompt],
                            "max_tokens": max_tokens,
                            "temperature": temperature,
                            "top_p": top_p,
                            "top_k": kwargs.get("top_k", 40),
                        },
                    },
                )
                response_json = resp.json()
                """
                assume all responses from custom api_bases of this format:
                {
                    'data': [
                        {
                            'prompt': 'The capital of France is P',
                            'output': ['The capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France'],
                            'params': {'temperature': 0.7, 'top_k': 40, 'top_p': 1}}],
                            'message': 'ok'
                        }
                    ]
                }
                """
                string_response = response_json["data"][0]["output"][0]
                ## RESPONSE OBJECT
                model_response["choices"][0]["message"]["content"] = string_response
                model_response["created"] = int(time.time())
                model_response["model"] = model
                response = model_response
            else:
                raise ValueError(
                    f"Unable to map your input to a model. Check your input - {args}"
                )
            return response
        except Exception as e:
            ## Map to OpenAI Exception
>           raise exception_type(
                model=model,
                custom_llm_provider=custom_llm_provider,
                original_exception=e,
                completion_kwargs=args,
            )

../main.py:2126:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'meta/llama-2-7b-chat'
original_exception = ReplicateError('Error: Traceback (most recent call last):\n  2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n')
custom_llm_provider = 'replicate'
completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...}

    def exception_type(
        model,
        original_exception,
        custom_llm_provider,
        completion_kwargs={},
    ):
        global user_logger_fn, liteDebuggerClient
        exception_mapping_worked = False
        if litellm.suppress_debug_info is False:
            print()  # noqa
            print(  # noqa
                "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m"  # noqa
            )  # noqa
            print(  # noqa
                "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
            )  # noqa
            print()  # noqa
        try:
            if model:
                error_str = str(original_exception)
                if isinstance(original_exception, BaseException):
                    exception_type = type(original_exception).__name__
                else:
                    exception_type = ""

                if "Request Timeout Error" in error_str or "Request timed out" in error_str:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"APITimeoutError - Request timed out",
                        model=model,
                        llm_provider=custom_llm_provider,
                    )

                if (
                    custom_llm_provider == "openai"
                    or custom_llm_provider == "text-completion-openai"
                    or custom_llm_provider == "custom_openai"
                    or custom_llm_provider in litellm.openai_compatible_providers
                ):
                    # custom_llm_provider is openai, make it OpenAI
                    if hasattr(original_exception, "message"):
                        message = original_exception.message
                    else:
                        message = str(original_exception)
                    if message is not None and isinstance(message, str):
                        message = message.replace("OPENAI", custom_llm_provider.upper())
                        message = message.replace("openai", custom_llm_provider)
                        message = message.replace("OpenAI", custom_llm_provider)
                    if custom_llm_provider == "openai":
                        exception_provider = "OpenAI" + "Exception"
                    else:
                        exception_provider = (
                            custom_llm_provider[0].upper()
                            + custom_llm_provider[1:]
                            + "Exception"
                        )

                    if (
                        "This model's maximum context length is" in error_str
                        or "Request too large" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "model_not_found" in error_str
                    ):
                        exception_mapping_worked = True
                        raise NotFoundError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "content_policy_violation" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContentPolicyViolationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "Incorrect API key provided" not in error_str
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Mistral API raised a streaming error" in error_str:
                        exception_mapping_worked = True
                        _request = httpx.Request(
                            method="POST", url="https://api.openai.com/v1"
                        )
                        raise APIError(
                            status_code=500,
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=_request,
                        )
                    elif hasattr(original_exception, "status_code"):
                        exception_mapping_worked = True
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 404:
                            exception_mapping_worked = True
                            raise NotFoundError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        elif original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 503:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 504:  # gateway timeout error
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                request=original_exception.request,
                            )
                    else:
                        # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors
                        raise APIConnectionError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=httpx.Request(
                                method="POST", url="https://api.openai.com/v1/"
                            ),
                        )
                elif custom_llm_provider == "anthropic":  # one of the anthropics
                    if hasattr(original_exception, "message"):
                        if (
                            "prompt is too long" in original_exception.message
                            or "prompt: length" in original_exception.message
                        ):
                            exception_mapping_worked = True
                            raise ContextWindowExceededError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        if "Invalid API Key" in original_exception.message:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                    if hasattr(original_exception, "status_code"):
                        print_verbose(f"status_code: {original_exception.status_code}")
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=(
                                    original_exception.response
                                    if hasattr(original_exception, "response")
                                    else httpx.Response(
                                        status_code=500,
                                        request=httpx.Request(
                                            method="POST",
                                            url="https://docs.anthropic.com/claude/reference/messages_post",
                                        ),
                                    )
                                ),
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.",
                                llm_provider="anthropic",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "replicate":
                    if "Incorrect authentication token" in error_str:
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "input is too long" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif exception_type == "ModelError":
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif "Request was throttled" in error_str:
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"ReplicateException - {original_exception.message}",
                                llm_provider="replicate",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 422
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"ReplicateException - {original_exception.message}",
                                model=model,
                                llm_provider="replicate",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"ReplicateException - {original_exception.message}",
                                model=model,
                                llm_provider="replicate",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"ReplicateException - {original_exception.message}",
                                llm_provider="replicate",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"ReplicateException - {original_exception.message}",
                                llm_provider="replicate",
                                model=model,
                                response=original_exception.response,
                            )
                    exception_mapping_worked = True
                    raise APIError(
                        status_code=500,
                        message=f"ReplicateException - {str(original_exception)}",
                        llm_provider="replicate",
                        model=model,
                        request=original_exception.request,
                    )
                elif custom_llm_provider == "bedrock":
                    if (
                        "too many tokens" in error_str
                        or "expected maxLength:" in error_str
                        or "Input is too long" in error_str
                        or "prompt: length: 1.." in error_str
                        or "Too many input tokens" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"BedrockException: Context Window Error - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                            response=original_exception.response,
                        )
                    if "Malformed input request" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"BedrockException - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                            response=original_exception.response,
                        )
                    if (
                        "Unable to locate credentials" in error_str
                        or "The security token included in the request is invalid"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"BedrockException Invalid Authentication - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                            response=original_exception.response,
                        )
                    if "AccessDeniedException" in error_str:
                        exception_mapping_worked = True
                        raise PermissionDeniedError(
                            message=f"BedrockException PermissionDeniedError - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                            response=original_exception.response,
                        )
                    if (
                        "throttlingException" in error_str
                        or "ThrottlingException" in error_str
                    ):
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"BedrockException: Rate Limit Error - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                            response=original_exception.response,
                        )
                    if "Connect timeout on endpoint URL" in error_str:
                        exception_mapping_worked = True
                        raise Timeout(
                            message=f"BedrockException: Timeout Error - {error_str}",
                            model=model,
                            llm_provider="bedrock",
                        )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"BedrockException - {original_exception.message}",
                                llm_provider="bedrock",
                                model=model,
                                response=httpx.Response(
                                    status_code=500,
                                    request=httpx.Request(
                                        method="POST", url="https://api.openai.com/v1/"
                                    ),
                                ),
                            )
                        elif original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"BedrockException - {original_exception.message}",
                                llm_provider="bedrock",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 400:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"BedrockException - {original_exception.message}",
                                llm_provider="bedrock",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 404:
                            exception_mapping_worked = True
                            raise NotFoundError(
                                message=f"BedrockException - {original_exception.message}",
                                llm_provider="bedrock",
                                model=model,
                                response=original_exception.response,
                            )
                elif custom_llm_provider == "sagemaker":
                    if "Unable to locate credentials" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"SagemakerException - {error_str}",
                            model=model,
                            llm_provider="sagemaker",
                            response=original_exception.response,
                        )
                    elif (
                        "Input validation error: `best_of` must be > 0 and <= 2"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints",
                            model=model,
                            llm_provider="sagemaker",
                            response=original_exception.response,
                        )
                    elif (
                        "`inputs` tokens + `max_new_tokens` must be <=" in error_str
                        or "instance type with more CPU capacity or memory" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"SagemakerException - {error_str}",
                            model=model,
                            llm_provider="sagemaker",
                            response=original_exception.response,
                        )
                elif custom_llm_provider == "vertex_ai":
                    if (
                        "Vertex AI API has not been used in project" in error_str
                        or "Unable to find your project" in error_str
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"VertexAIException - {error_str}",
                            model=model,
                            llm_provider="vertex_ai",
                            response=original_exception.response,
                        )
                    elif "403" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"VertexAIException - {error_str}",
                            model=model,
                            llm_provider="vertex_ai",
                            response=original_exception.response,
                        )
                    elif "The response was blocked." in error_str:
                        exception_mapping_worked = True
                        raise UnprocessableEntityError(
                            message=f"VertexAIException - {error_str}",
                            model=model,
                            llm_provider="vertex_ai",
                            response=httpx.Response(
                                status_code=429,
                                request=httpx.Request(
                                    method="POST",
                                    url=" https://cloud.google.com/vertex-ai/",
                                ),
                            ),
                        )
                    elif (
                        "429 Quota exceeded" in error_str
                        or "IndexError: list index out of range"
                    ):
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"VertexAIException - {error_str}",
                            model=model,
                            llm_provider="vertex_ai",
                            response=httpx.Response(
                                status_code=429,
                                request=httpx.Request(
                                    method="POST",
                                    url=" https://cloud.google.com/vertex-ai/",
                                ),
                            ),
                        )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 400:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"VertexAIException - {error_str}",
                                model=model,
                                llm_provider="vertex_ai",
                                response=original_exception.response,
                            )
                        if original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise APIError(
                                message=f"VertexAIException - {error_str}",
                                status_code=500,
                                model=model,
                                llm_provider="vertex_ai",
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
                    if "503 Getting metadata" in error_str:
                        # auth errors look like this
                        # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate.
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"PalmException - Invalid api key",
                            model=model,
                            llm_provider="palm",
                            response=original_exception.response,
                        )
                    if (
                        "504 Deadline expired before operation could complete." in error_str
                        or "504 Deadline Exceeded" in error_str
                    ):
                        exception_mapping_worked = True
                        raise Timeout(
                            message=f"PalmException - {original_exception.message}",
                            model=model,
                            llm_provider="palm",
                        )
                    if "400 Request payload size exceeds" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"PalmException - {error_str}",
                            model=model,
                            llm_provider="palm",
                            response=original_exception.response,
                        )
                    if "500 An internal error has occurred." in error_str:
                        exception_mapping_worked = True
                        raise APIError(
                            status_code=getattr(original_exception, "status_code", 500),
                            message=f"PalmException - {original_exception.message}",
                            llm_provider="palm",
                            model=model,
                            request=original_exception.request,
                        )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 400:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"PalmException - {error_str}",
                                model=model,
                                llm_provider="palm",
                                response=original_exception.response,
                            )
                    # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes
                elif custom_llm_provider == "cloudflare":
                    if "Authentication error" in error_str:
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"Cloudflare Exception - {original_exception.message}",
                            llm_provider="cloudflare",
                            model=model,
                            response=original_exception.response,
                        )
                    if "must have required property" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"Cloudflare Exception - {original_exception.message}",
                            llm_provider="cloudflare",
                            model=model,
                            response=original_exception.response,
                        )
                elif (
                    custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat"
                ):  # Cohere
                    if (
                        "invalid api token" in error_str
                        or "No API key provided." in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"CohereException - {original_exception.message}",
                            llm_provider="cohere",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "too many tokens" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"CohereException - {original_exception.message}",
                            model=model,
                            llm_provider="cohere",
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        if (
                            original_exception.status_code == 400
                            or original_exception.status_code == 498
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"CohereException - {original_exception.message}",
                                llm_provider="cohere",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"CohereException - {original_exception.message}",
                                llm_provider="cohere",
                                model=model,
                                response=original_exception.response,
                            )
                    elif (
                        "CohereConnectionError" in exception_type
                    ):  # cohere seems to fire these errors when we load test it (1k+ messages / min)
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"CohereException - {original_exception.message}",
                            llm_provider="cohere",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "invalid type:" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"CohereException - {original_exception.message}",
                            llm_provider="cohere",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Unexpected server error" in error_str:
                        exception_mapping_worked = True
                        raise ServiceUnavailableError(
                            message=f"CohereException - {original_exception.message}",
                            llm_provider="cohere",
                            model=model,
                            response=original_exception.response,
                        )
                    else:
                        if hasattr(original_exception, "status_code"):
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"CohereException - {original_exception.message}",
                                llm_provider="cohere",
                                model=model,
                                request=original_exception.request,
                            )
                        raise original_exception
                elif custom_llm_provider == "huggingface":
                    if "length limit exceeded" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=error_str,
                            model=model,
                            llm_provider="huggingface",
                            response=original_exception.response,
                        )
                    elif "A valid user token is required" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=error_str,
                            llm_provider="huggingface",
                            model=model,
                            response=original_exception.response,
                        )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"HuggingfaceException - {original_exception.message}",
                                llm_provider="huggingface",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 400:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"HuggingfaceException - {original_exception.message}",
                                model=model,
                                llm_provider="huggingface",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"HuggingfaceException - {original_exception.message}",
                                model=model,
                                llm_provider="huggingface",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"HuggingfaceException - {original_exception.message}",
                                llm_provider="huggingface",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 503:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"HuggingfaceException - {original_exception.message}",
                                llm_provider="huggingface",
                                model=model,
                                response=original_exception.response,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"HuggingfaceException - {original_exception.message}",
                                llm_provider="huggingface",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "ai21":
                    if hasattr(original_exception, "message"):
                        if "Prompt has too many tokens" in original_exception.message:
                            exception_mapping_worked = True
                            raise ContextWindowExceededError(
                                message=f"AI21Exception - {original_exception.message}",
                                model=model,
                                llm_provider="ai21",
                                response=original_exception.response,
                            )
                        if "Bad or missing API token." in original_exception.message:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AI21Exception - {original_exception.message}",
                                model=model,
                                llm_provider="ai21",
                                response=original_exception.response,
                            )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AI21Exception - {original_exception.message}",
                                llm_provider="ai21",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AI21Exception - {original_exception.message}",
                                model=model,
                                llm_provider="ai21",
                                request=original_exception.request,
                            )
                        if original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AI21Exception - {original_exception.message}",
                                model=model,
                                llm_provider="ai21",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AI21Exception - {original_exception.message}",
                                llm_provider="ai21",
                                model=model,
                                response=original_exception.response,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"AI21Exception - {original_exception.message}",
                                llm_provider="ai21",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "nlp_cloud":
                    if "detail" in error_str:
                        if "Input text length should not exceed" in error_str:
                            exception_mapping_worked = True
                            raise ContextWindowExceededError(
                                message=f"NLPCloudException - {error_str}",
                                model=model,
                                llm_provider="nlp_cloud",
                                response=original_exception.response,
                            )
                        elif "value is not a valid" in error_str:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"NLPCloudException - {error_str}",
                                model=model,
                                llm_provider="nlp_cloud",
                                response=original_exception.response,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=500,
                                message=f"NLPCloudException - {error_str}",
                                model=model,
                                llm_provider="nlp_cloud",
                                request=original_exception.request,
                            )
                    if hasattr(
                        original_exception, "status_code"
                    ):  # https://docs.nlpcloud.com/?shell#errors
                        if (
                            original_exception.status_code == 400
                            or original_exception.status_code == 406
                            or original_exception.status_code == 413
                            or original_exception.status_code == 422
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"NLPCloudException - {original_exception.message}",
                                llm_provider="nlp_cloud",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 401
                            or original_exception.status_code == 403
                        ):
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"NLPCloudException - {original_exception.message}",
                                llm_provider="nlp_cloud",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 522
                            or original_exception.status_code == 524
                        ):
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"NLPCloudException - {original_exception.message}",
                                model=model,
                                llm_provider="nlp_cloud",
                                request=original_exception.request,
                            )
                        elif (
                            original_exception.status_code == 429
                            or original_exception.status_code == 402
                        ):
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"NLPCloudException - {original_exception.message}",
                                llm_provider="nlp_cloud",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 500
                            or original_exception.status_code == 503
                        ):
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"NLPCloudException - {original_exception.message}",
                                llm_provider="nlp_cloud",
                                model=model,
                                request=original_exception.request,
                            )
                        elif (
                            original_exception.status_code == 504
                            or original_exception.status_code == 520
                        ):
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"NLPCloudException - {original_exception.message}",
                                model=model,
                                llm_provider="nlp_cloud",
                                response=original_exception.response,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"NLPCloudException - {original_exception.message}",
                                llm_provider="nlp_cloud",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "together_ai":
                    import json

                    try:
                        error_response = json.loads(error_str)
                    except:
                        error_response = {"error": error_str}
                    if (
                        "error" in error_response
                        and "`inputs` tokens + `max_new_tokens` must be <="
                        in error_response["error"]
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"TogetherAIException - {error_response['error']}",
                            model=model,
                            llm_provider="together_ai",
                            response=original_exception.response,
                        )
                    elif (
                        "error" in error_response
                        and "invalid private key" in error_response["error"]
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"TogetherAIException - {error_response['error']}",
                            llm_provider="together_ai",
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "error" in error_response
                        and "INVALID_ARGUMENT" in error_response["error"]
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"TogetherAIException - {error_response['error']}",
                            model=model,
                            llm_provider="together_ai",
                            response=original_exception.response,
                        )

                    elif (
                        "error" in error_response
                        and "API key doesn't match expected format."
                        in error_response["error"]
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"TogetherAIException - {error_response['error']}",
                            model=model,
                            llm_provider="together_ai",
                            response=original_exception.response,
                        )
                    elif (
                        "error_type" in error_response
                        and error_response["error_type"] == "validation"
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"TogetherAIException - {error_response['error']}",
                            model=model,
                            llm_provider="together_ai",
                            response=original_exception.response,
                        )
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"TogetherAIException - {original_exception.message}",
                                model=model,
                                llm_provider="together_ai",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"TogetherAIException - {error_response['error']}",
                                model=model,
                                llm_provider="together_ai",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"TogetherAIException - {original_exception.message}",
                                llm_provider="together_ai",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 524:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"TogetherAIException - {original_exception.message}",
                                llm_provider="together_ai",
                                model=model,
                            )
                    else:
                        exception_mapping_worked = True
                        raise APIError(
                            status_code=original_exception.status_code,
                            message=f"TogetherAIException - {original_exception.message}",
                            llm_provider="together_ai",
                            model=model,
                            request=original_exception.request,
                        )
                elif custom_llm_provider == "aleph_alpha":
                    if (
                        "This is longer than the model's maximum context length"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"AlephAlphaException - {original_exception.message}",
                            llm_provider="aleph_alpha",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "InvalidToken" in error_str or "No token provided" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"AlephAlphaException - {original_exception.message}",
                            llm_provider="aleph_alpha",
                            model=model,
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        print_verbose(f"status code: {original_exception.status_code}")
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AlephAlphaException - {original_exception.message}",
                                llm_provider="aleph_alpha",
                                model=model,
                            )
                        elif original_exception.status_code == 400:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AlephAlphaException - {original_exception.message}",
                                llm_provider="aleph_alpha",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AlephAlphaException - {original_exception.message}",
                                llm_provider="aleph_alpha",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"AlephAlphaException - {original_exception.message}",
                                llm_provider="aleph_alpha",
                                model=model,
                                response=original_exception.response,
                            )
                        raise original_exception
                    raise original_exception
                elif (
                    custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat"
                ):
                    if isinstance(original_exception, dict):
                        error_str = original_exception.get("error", "")
                    else:
                        error_str = str(original_exception)
                    if "no such file or directory" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}",
                            model=model,
                            llm_provider="ollama",
                            response=original_exception.response,
                        )
                    elif "Failed to establish a new connection" in error_str:
                        exception_mapping_worked = True
                        raise ServiceUnavailableError(
                            message=f"OllamaException: {original_exception}",
                            llm_provider="ollama",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Invalid response object from API" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"OllamaException: {original_exception}",
                            llm_provider="ollama",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Read timed out" in error_str:
                        exception_mapping_worked = True
                        raise Timeout(
                            message=f"OllamaException: {original_exception}",
                            llm_provider="ollama",
                            model=model,
                        )
                elif custom_llm_provider == "vllm":
                    if hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 0:
                            exception_mapping_worked = True
                            raise APIConnectionError(
                                message=f"VLLMException - {original_exception.message}",
                                llm_provider="vllm",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "azure":
                    if "This model's maximum context length is" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"AzureException - {original_exception.message}",
                            llm_provider="azure",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "DeploymentNotFound" in error_str:
                        exception_mapping_worked = True
                        raise NotFoundError(
                            message=f"AzureException - {original_exception.message}",
                            llm_provider="azure",
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "content_policy_violation" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContentPolicyViolationError(
                            message=f"AzureException - {original_exception.message}",
                            llm_provider="azure",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "invalid_request_error" in error_str:
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"AzureException - {original_exception.message}",
                            llm_provider="azure",
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "The api_key client option must be set either by passing api_key to the client or by setting"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"{exception_provider} - {original_exception.message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        exception_mapping_worked = True
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AzureException - {original_exception.message}",
                                llm_provider="azure",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AzureException - {original_exception.message}",
                                model=model,
                                llm_provider="azure",
                                request=original_exception.request,
                            )
                        if original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AzureException - {original_exception.message}",
                                model=model,
                                llm_provider="azure",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AzureException - {original_exception.message}",
                                model=model,
                                llm_provider="azure",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 503:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"AzureException - {original_exception.message}",
                                model=model,
                                llm_provider="azure",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 504:  # gateway timeout error
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AzureException - {original_exception.message}",
                                model=model,
                                llm_provider="azure",
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"AzureException - {original_exception.message}",
                                llm_provider="azure",
                                model=model,
                                request=httpx.Request(
                                    method="POST", url="https://openai.com/"
                                ),
                            )
                    else:
                        # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors
                        raise APIConnectionError(
                            message=f"{exception_provider} - {message}",
                            llm_provider="azure",
                            model=model,
                            request=httpx.Request(method="POST", url="https://openai.com/"),
                        )
            if (
                "BadRequestError.__init__() missing 1 required positional argument: 'param'"
                in str(original_exception)
            ):  # deal with edge-case invalid request error bug in openai-python sdk
                exception_mapping_worked = True
                raise BadRequestError(
                    message=f"{exception_provider}: This can happen due to missing AZURE_API_VERSION: {str(original_exception)}",
                    model=model,
                    llm_provider=custom_llm_provider,
                    response=original_exception.response,
                )
            else:  # ensure generic errors always return APIConnectionError=
                exception_mapping_worked = True
                if hasattr(original_exception, "request"):
                    raise APIConnectionError(
                        message=f"{str(original_exception)}",
                        llm_provider=custom_llm_provider,
                        model=model,
                        request=original_exception.request,
                    )
                else:
                    raise APIConnectionError(
                        message=f"{str(original_exception)}",
                        llm_provider=custom_llm_provider,
                        model=model,
                        request=httpx.Request(
                            method="POST", url="https://api.openai.com/v1/"
                        ),  # stub the request
                    )
        except Exception as e:
            # LOGGING
            exception_logging(
                logger_fn=user_logger_fn,
                additional_args={
                    "exception_mapping_worked": exception_mapping_worked,
                    "original_exception": original_exception,
                },
                exception=e,
            )
            ## AUTH ERROR
            if isinstance(e, AuthenticationError) and (
                litellm.email or "LITELLM_EMAIL" in os.environ
            ):
                threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
            # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
            if exception_mapping_worked:
>               raise e

../utils.py:8533:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'meta/llama-2-7b-chat'
original_exception = ReplicateError('Error: Traceback (most recent call last):\n  2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n')
custom_llm_provider = 'replicate'
completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...}

    def exception_type(
        model,
        original_exception,
        custom_llm_provider,
        completion_kwargs={},
    ):
        global user_logger_fn, liteDebuggerClient
        exception_mapping_worked = False
        if litellm.suppress_debug_info is False:
            print()  # noqa
            print(  # noqa
                "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m"  # noqa
            )  # noqa
            print(  # noqa
                "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
            )  # noqa
            print()  # noqa
        try:
            if model:
                error_str = str(original_exception)
                if isinstance(original_exception, BaseException):
                    exception_type = type(original_exception).__name__
                else:
                    exception_type = ""

                if "Request Timeout Error" in error_str or "Request timed out" in error_str:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"APITimeoutError - Request timed out",
                        model=model,
                        llm_provider=custom_llm_provider,
                    )

                if (
                    custom_llm_provider == "openai"
                    or custom_llm_provider == "text-completion-openai"
                    or custom_llm_provider == "custom_openai"
                    or custom_llm_provider in litellm.openai_compatible_providers
                ):
                    # custom_llm_provider is openai, make it OpenAI
                    if hasattr(original_exception, "message"):
                        message = original_exception.message
                    else:
                        message = str(original_exception)
                    if message is not None and isinstance(message, str):
                        message = message.replace("OPENAI", custom_llm_provider.upper())
                        message = message.replace("openai", custom_llm_provider)
                        message = message.replace("OpenAI", custom_llm_provider)
                    if custom_llm_provider == "openai":
                        exception_provider = "OpenAI" + "Exception"
                    else:
                        exception_provider = (
                            custom_llm_provider[0].upper()
                            + custom_llm_provider[1:]
                            + "Exception"
                        )

                    if (
                        "This model's maximum context length is" in error_str
                        or "Request too large" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "model_not_found" in error_str
                    ):
                        exception_mapping_worked = True
                        raise NotFoundError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "content_policy_violation" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContentPolicyViolationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "Incorrect API key provided" not in error_str
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Mistral API raised a streaming error" in error_str:
                        exception_mapping_worked = True
                        _request = httpx.Request(
                            method="POST", url="https://api.openai.com/v1"
                        )
                        raise APIError(
                            status_code=500,
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=_request,
                        )
                    elif hasattr(original_exception, "status_code"):
                        exception_mapping_worked = True
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 404:
                            exception_mapping_worked = True
                            raise NotFoundError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        elif original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 503:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 504:  # gateway timeout error
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                request=original_exception.request,
                            )
                    else:
                        # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors
                        raise APIConnectionError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=httpx.Request(
                                method="POST", url="https://api.openai.com/v1/"
                            ),
                        )
                elif custom_llm_provider == "anthropic":  # one of the anthropics
                    if hasattr(original_exception, "message"):
                        if (
                            "prompt is too long" in original_exception.message
                            or "prompt: length" in original_exception.message
                        ):
                            exception_mapping_worked = True
                            raise ContextWindowExceededError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        if "Invalid API Key" in original_exception.message:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                    if hasattr(original_exception, "status_code"):
                        print_verbose(f"status_code: {original_exception.status_code}")
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=(
                                    original_exception.response
                                    if hasattr(original_exception, "response")
                                    else httpx.Response(
                                        status_code=500,
                                        request=httpx.Request(
                                            method="POST",
                                            url="https://docs.anthropic.com/claude/reference/messages_post",
                                        ),
                                    )
                                ),
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.",
                                llm_provider="anthropic",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "replicate":
                    if "Incorrect authentication token" in error_str:
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "input is too long" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif exception_type == "ModelError":
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif "Request was throttled" in error_str:
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"ReplicateException - {original_exception.message}",
                                llm_provider="replicate",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 422
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
>                           raise BadRequestError(
                                message=f"ReplicateException - {original_exception.message}",
                                model=model,
                                llm_provider="replicate",
                                response=original_exception.response,
E                               litellm.exceptions.BadRequestError: ReplicateException - Error: Traceback (most recent call last):
E                                 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                                 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                                 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                                 File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                               TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!,
E                               Replicate logs:MLC is currently not using any LoRAs.
E                               MLC: True
E                               Your formatted prompt is:
E                               [INST] <<SYS>>
E                               You are a helpful, respectful and honest assistant.
E                               <</SYS>>
E                               You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST]
E                               Not using LoRA
E                               Traceback (most recent call last):
E                               File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict
E                               for r in result:
E                               File "/src/predict.py", line 198, in predict
E                               for decoded_token in self.engine(
E                               File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__
E                               for val in gen:
E                               File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__
E                               self.cm.reset_chat(chat_config)
E                               File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat
E                               self._load_json_override_func(user_chat_config_json_str, True)
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
E                               File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
E                               File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
E                               raise py_err
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                               tvm._ffi.base.TVMError: Traceback (most recent call last):
E                               2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                               1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                               0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                               TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!

../utils.py:7547: BadRequestError

During handling of the above exception, another exception occurred:

    def test_replicate_custom_prompt_dict():
        litellm.set_verbose = True
        model_name = "replicate/meta/llama-2-7b-chat"
        litellm.register_prompt_template(
            model="replicate/meta/llama-2-7b-chat",
            initial_prompt_value="You are a good assistant",  # [OPTIONAL]
            roles={
                "system": {
                    "pre_message": "[INST] <<SYS>>\n",  # [OPTIONAL]
                    "post_message": "\n<</SYS>>\n [/INST]\n",  # [OPTIONAL]
                },
                "user": {
                    "pre_message": "[INST] ",  # [OPTIONAL]
                    "post_message": " [/INST]",  # [OPTIONAL]
                },
                "assistant": {
                    "pre_message": "\n",  # [OPTIONAL]
                    "post_message": "\n",  # [OPTIONAL]
                },
            },
            final_prompt_value="Now answer as best you can:",  # [OPTIONAL]
        )
>       response = completion(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": "what is yc write 1 paragraph",
                }
            ],
            repetition_penalty=0.1,
            num_retries=3,
        )

test_completion.py:1655:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../utils.py:2920: in wrapper
    return litellm.completion_with_retries(*args, **kwargs)
../main.py:2158: in completion_with_retries
    return retryer(original_function, *args, **kwargs)
/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:379: in __call__
    do = self.iter(retry_state=retry_state)
/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:325: in iter
    raise retry_exc.reraise()
/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:158: in reraise
    raise self.last_attempt.result()
/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py:449: in result
    return self.__get_result()
/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py:401: in __get_result
    raise self._exception
/opt/homebrew/lib/python3.11/site-packages/tenacity/__init__.py:382: in __call__
    result = fn(*args, **kwargs)
../utils.py:2948: in wrapper
    raise e
../utils.py:2846: in wrapper
    result = original_function(*args, **kwargs)
../main.py:2126: in completion
    raise exception_type(
../utils.py:8533: in exception_type
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'meta/llama-2-7b-chat'
original_exception = ReplicateError('Error: Traceback (most recent call last):\n  2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::Str...87\nTVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!\n')
custom_llm_provider = 'replicate'
completion_kwargs = {'acompletion': False, 'api_base': 'https://api.replicate.com/v1', 'api_key': None, 'api_version': None, ...}

    def exception_type(
        model,
        original_exception,
        custom_llm_provider,
        completion_kwargs={},
    ):
        global user_logger_fn, liteDebuggerClient
        exception_mapping_worked = False
        if litellm.suppress_debug_info is False:
            print()  # noqa
            print(  # noqa
                "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m"  # noqa
            )  # noqa
            print(  # noqa
                "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'."  # noqa
            )  # noqa
            print()  # noqa
        try:
            if model:
                error_str = str(original_exception)
                if isinstance(original_exception, BaseException):
                    exception_type = type(original_exception).__name__
                else:
                    exception_type = ""

                if "Request Timeout Error" in error_str or "Request timed out" in error_str:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"APITimeoutError - Request timed out",
                        model=model,
                        llm_provider=custom_llm_provider,
                    )

                if (
                    custom_llm_provider == "openai"
                    or custom_llm_provider == "text-completion-openai"
                    or custom_llm_provider == "custom_openai"
                    or custom_llm_provider in litellm.openai_compatible_providers
                ):
                    # custom_llm_provider is openai, make it OpenAI
                    if hasattr(original_exception, "message"):
                        message = original_exception.message
                    else:
                        message = str(original_exception)
                    if message is not None and isinstance(message, str):
                        message = message.replace("OPENAI", custom_llm_provider.upper())
                        message = message.replace("openai", custom_llm_provider)
                        message = message.replace("OpenAI", custom_llm_provider)
                    if custom_llm_provider == "openai":
                        exception_provider = "OpenAI" + "Exception"
                    else:
                        exception_provider = (
                            custom_llm_provider[0].upper()
                            + custom_llm_provider[1:]
                            + "Exception"
                        )

                    if (
                        "This model's maximum context length is" in error_str
                        or "Request too large" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "model_not_found" in error_str
                    ):
                        exception_mapping_worked = True
                        raise NotFoundError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "content_policy_violation" in error_str
                    ):
                        exception_mapping_worked = True
                        raise ContentPolicyViolationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "invalid_request_error" in error_str
                        and "Incorrect API key provided" not in error_str
                    ):
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif (
                        "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable"
                        in error_str
                    ):
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            response=original_exception.response,
                        )
                    elif "Mistral API raised a streaming error" in error_str:
                        exception_mapping_worked = True
                        _request = httpx.Request(
                            method="POST", url="https://api.openai.com/v1"
                        )
                        raise APIError(
                            status_code=500,
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=_request,
                        )
                    elif hasattr(original_exception, "status_code"):
                        exception_mapping_worked = True
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 404:
                            exception_mapping_worked = True
                            raise NotFoundError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        elif original_exception.status_code == 422:
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 503:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 504:  # gateway timeout error
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"{exception_provider} - {message}",
                                model=model,
                                llm_provider=custom_llm_provider,
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"{exception_provider} - {message}",
                                llm_provider=custom_llm_provider,
                                model=model,
                                request=original_exception.request,
                            )
                    else:
                        # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors
                        raise APIConnectionError(
                            message=f"{exception_provider} - {message}",
                            llm_provider=custom_llm_provider,
                            model=model,
                            request=httpx.Request(
                                method="POST", url="https://api.openai.com/v1/"
                            ),
                        )
                elif custom_llm_provider == "anthropic":  # one of the anthropics
                    if hasattr(original_exception, "message"):
                        if (
                            "prompt is too long" in original_exception.message
                            or "prompt: length" in original_exception.message
                        ):
                            exception_mapping_worked = True
                            raise ContextWindowExceededError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        if "Invalid API Key" in original_exception.message:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=original_exception.message,
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                    if hasattr(original_exception, "status_code"):
                        print_verbose(f"status_code: {original_exception.status_code}")
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
                            raise BadRequestError(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 408:
                            exception_mapping_worked = True
                            raise Timeout(
                                message=f"AnthropicException - {original_exception.message}",
                                model=model,
                                llm_provider="anthropic",
                                request=original_exception.request,
                            )
                        elif original_exception.status_code == 429:
                            exception_mapping_worked = True
                            raise RateLimitError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=original_exception.response,
                            )
                        elif original_exception.status_code == 500:
                            exception_mapping_worked = True
                            raise ServiceUnavailableError(
                                message=f"AnthropicException - {original_exception.message}",
                                llm_provider="anthropic",
                                model=model,
                                response=(
                                    original_exception.response
                                    if hasattr(original_exception, "response")
                                    else httpx.Response(
                                        status_code=500,
                                        request=httpx.Request(
                                            method="POST",
                                            url="https://docs.anthropic.com/claude/reference/messages_post",
                                        ),
                                    )
                                ),
                            )
                        else:
                            exception_mapping_worked = True
                            raise APIError(
                                status_code=original_exception.status_code,
                                message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.",
                                llm_provider="anthropic",
                                model=model,
                                request=original_exception.request,
                            )
                elif custom_llm_provider == "replicate":
                    if "Incorrect authentication token" in error_str:
                        exception_mapping_worked = True
                        raise AuthenticationError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif "input is too long" in error_str:
                        exception_mapping_worked = True
                        raise ContextWindowExceededError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif exception_type == "ModelError":
                        exception_mapping_worked = True
                        raise BadRequestError(
                            message=f"ReplicateException - {error_str}",
                            model=model,
                            llm_provider="replicate",
                            response=original_exception.response,
                        )
                    elif "Request was throttled" in error_str:
                        exception_mapping_worked = True
                        raise RateLimitError(
                            message=f"ReplicateException - {error_str}",
                            llm_provider="replicate",
                            model=model,
                            response=original_exception.response,
                        )
                    elif hasattr(original_exception, "status_code"):
                        if original_exception.status_code == 401:
                            exception_mapping_worked = True
                            raise AuthenticationError(
                                message=f"ReplicateException - {original_exception.message}",
                                llm_provider="replicate",
                                model=model,
                                response=original_exception.response,
                            )
                        elif (
                            original_exception.status_code == 400
                            or original_exception.status_code == 422
                            or original_exception.status_code == 413
                        ):
                            exception_mapping_worked = True
>                           raise BadRequestError(
                                message=f"ReplicateException - {original_exception.message}",
                                model=model,
                                llm_provider="replicate",
                                response=original_exception.response,
E                               litellm.exceptions.BadRequestError: ReplicateException - Error: Traceback (most recent call last):
E                                 2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                                 1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                                 0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                                       at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                                 File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                               TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!,
E                               Replicate logs:MLC is currently not using any LoRAs.
E                               MLC: True
E                               Your formatted prompt is:
E                               [INST] <<SYS>>
E                               You are a helpful, respectful and honest assistant.
E                               <</SYS>>
E                               You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can: [/INST]
E                               Not using LoRA
E                               Traceback (most recent call last):
E                               File "/usr/local/lib/python3.11/site-packages/cog/server/worker.py", line 222, in _predict
E                               for r in result:
E                               File "/src/predict.py", line 198, in predict
E                               for decoded_token in self.engine(
E                               File "/src/src/inference_engines/mlc_vllm_engine.py", line 86, in __call__
E                               for val in gen:
E                               File "/src/src/inference_engines/mlc_engine.py", line 151, in __call__
E                               self.cm.reset_chat(chat_config)
E                               File "/usr/local/lib/python3.11/site-packages/mlc_chat/chat_module.py", line 820, in reset_chat
E                               self._load_json_override_func(user_chat_config_json_str, True)
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
E                               File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
E                               File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
E                               File "/usr/local/lib/python3.11/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
E                               raise py_err
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 1545, in mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 483, in mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387, in mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                               tvm._ffi.base.TVMError: Traceback (most recent call last):
E                               2: mlc::llm::LLMChatModule::GetFunction(tvm::runtime::String const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#10}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:1545
E                               1: mlc::llm::LLMChat::LoadJSONOverride(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:483
E                               0: mlc::llm::LLMChat::LoadJSONOverride(picojson::value const&, bool)
E                               at /workspace/mlc-llm/cpp/llm_chat.cc:387
E                               File "/workspace/mlc-llm/cpp/llm_chat.cc", line 387
E                               TVMError: Check failed: (this->repetition_penalty_ > 0) is false: Repetition penalty must be a positive number!

../utils.py:7547: BadRequestError
---------------------------- Captured stdout setup -----------------------------
<module 'litellm' from '/Users/krrishdholakia/Documents/litellm/litellm/__init__.py'>

pytest fixture - resetting callbacks
----------------------------- Captured stdout call -----------------------------


[92mRequest to litellm:[0m
[92mlitellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, num_retries=3)[0m


self.optional_params: {}
kwargs[caching]: False; litellm.cache: None
Final returned optional params: {'repetition_penalty': 0.1}
self.optional_params: {'repetition_penalty': 0.1}
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.replicate.com/v1/models/meta/llama-2-7b-chat \
-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \
-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}'
[0m

https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg
replicate: polling endpoint: https://api.replicate.com/v1/predictions/h5hsyznscnrgm0cers4v4g46qg
Non-streamed output:

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Logging Details: logger_fn - None | callable(logger_fn) - False


[92mRequest to litellm:[0m
[92mlitellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=<litellm.utils.Logging object at 0x1043e1550>)[0m


kwargs[caching]: False; litellm.cache: None
Final returned optional params: {'repetition_penalty': 0.1}
self.optional_params: {'repetition_penalty': 0.1}
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.replicate.com/v1/models/meta/llama-2-7b-chat \
-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \
-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}'
[0m

https://api.replicate.com/v1/predictions/5a7rh5dx6xrgm0cers4t4gad2m
replicate: polling endpoint: https://api.replicate.com/v1/predictions/5a7rh5dx6xrgm0cers4t4gad2m
Non-streamed output:

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Logging Details: logger_fn - None | callable(logger_fn) - False
Logging Details LiteLLM-Failure Call
self.failure_callback: []


[92mRequest to litellm:[0m
[92mlitellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=<litellm.utils.Logging object at 0x1043e1550>)[0m


kwargs[caching]: False; litellm.cache: None
Final returned optional params: {'repetition_penalty': 0.1}
self.optional_params: {'repetition_penalty': 0.1}
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.replicate.com/v1/models/meta/llama-2-7b-chat \
-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \
-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}'
[0m

https://api.replicate.com/v1/predictions/fdx5mgp0tnrgj0cers4r6taf9c
replicate: polling endpoint: https://api.replicate.com/v1/predictions/fdx5mgp0tnrgj0cers4r6taf9c
Non-streamed output:

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Logging Details: logger_fn - None | callable(logger_fn) - False
Logging Details LiteLLM-Failure Call
self.failure_callback: []


[92mRequest to litellm:[0m
[92mlitellm.completion(model='replicate/meta/llama-2-7b-chat', messages=[{'role': 'user', 'content': 'what is yc write 1 paragraph'}], repetition_penalty=0.1, litellm_call_id='85a47e72-fb66-4654-85d4-6b34fbf52a0e', litellm_logging_obj=<litellm.utils.Logging object at 0x1043e1550>)[0m


kwargs[caching]: False; litellm.cache: None
Final returned optional params: {'repetition_penalty': 0.1}
self.optional_params: {'repetition_penalty': 0.1}
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.replicate.com/v1/models/meta/llama-2-7b-chat \
-H 'Authorization: Token r8_KkH9pMk1MOj0GTBij********************' -H 'Content-Type: application/json' \
-d '{'version': 'meta/llama-2-7b-chat', 'input': {'prompt': 'You are a good assistant[INST] what is yc write 1 paragraph [/INST]Now answer as best you can:', 'repetition_penalty': 0.1}}'
[0m

https://api.replicate.com/v1/predictions/1772b6y4qxrgp0cers4s0adhpr
replicate: polling endpoint: https://api.replicate.com/v1/predictions/1772b6y4qxrgp0cers4s0adhpr
Non-streamed output:

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Logging Details: logger_fn - None | callable(logger_fn) - False
Logging Details LiteLLM-Failure Call
self.failure_callback: []
=============================== warnings summary ===============================
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: 18 warnings
  /opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)

../proxy/_types.py:167
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:167: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:254
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:254: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    extra = Extra.allow  # Allow extra fields

../proxy/_types.py:257
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:257: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:286
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:286: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:333
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:333: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:399
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:399: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:411
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:411: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:451
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:451: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:477
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:477: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:740
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:740: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:763
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:763: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../proxy/_types.py:782
  /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:782: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
    @root_validator(pre=True)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API
    warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: 10 warnings
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2349: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(parent)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.logging')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.iam')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('mpl_toolkits')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('sphinxcontrib')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870
  /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2870: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('zope')`.
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_namespace(pkg)

../llms/prompt_templates/factory.py:6
  /Users/krrishdholakia/Documents/litellm/litellm/llms/prompt_templates/factory.py:6: DeprecationWarning: 'imghdr' is deprecated and slated for removal in Python 3.13
    import imghdr, base64

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
=========================== short test summary info ============================
FAILED test_completion.py::test_replicate_custom_prompt_dict - litellm.except...
======================== 1 failed, 56 warnings in 4.33s ========================