Merge branch 'main' into litellm_model_id_fix

Krish Dholakia 2024-05-17 22:36:17 -07:00 committed by GitHub
commit 5e5179e476
31 changed files with 710 additions and 7234 deletions


@@ -93,6 +93,7 @@ class LangFuseLogger:
)
litellm_params = kwargs.get("litellm_params", {})
litellm_call_id = kwargs.get("litellm_call_id", None)
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
@@ -161,6 +162,7 @@ class LangFuseLogger:
response_obj,
level,
print_verbose,
litellm_call_id,
)
elif response_obj is not None:
self._log_langfuse_v1(
@@ -255,6 +257,7 @@ class LangFuseLogger:
response_obj,
level,
print_verbose,
litellm_call_id,
) -> tuple:
import langfuse
@@ -318,7 +321,7 @@ class LangFuseLogger:
session_id = clean_metadata.pop("session_id", None)
trace_name = clean_metadata.pop("trace_name", None)
trace_id = clean_metadata.pop("trace_id", None)
trace_id = clean_metadata.pop("trace_id", litellm_call_id)
existing_trace_id = clean_metadata.pop("existing_trace_id", None)
update_trace_keys = clean_metadata.pop("update_trace_keys", [])
debug = clean_metadata.pop("debug_langfuse", None)
@@ -351,9 +354,13 @@ class LangFuseLogger:
# Special keys that are found in the function arguments and not the metadata
if "input" in update_trace_keys:
trace_params["input"] = input if not mask_input else "redacted-by-litellm"
trace_params["input"] = (
input if not mask_input else "redacted-by-litellm"
)
if "output" in update_trace_keys:
trace_params["output"] = output if not mask_output else "redacted-by-litellm"
trace_params["output"] = (
output if not mask_output else "redacted-by-litellm"
)
else: # don't overwrite an existing trace
trace_params = {
"id": trace_id,
@@ -375,7 +382,9 @@ class LangFuseLogger:
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output if not mask_output else "redacted-by-litellm"
trace_params["output"] = (
output if not mask_output else "redacted-by-litellm"
)
if debug == True or (isinstance(debug, str) and debug.lower() == "true"):
if "metadata" in trace_params:


@@ -164,13 +164,28 @@ class SlackAlerting(CustomLogger):
) -> Optional[str]:
"""
Returns the Langfuse trace URL.
Checked in order:
-> existing_trace_id
-> trace_id
-> litellm_call_id
"""
# do nothing for now
if (
request_data is not None
and request_data.get("metadata", {}).get("trace_id", None) is not None
):
trace_id = request_data["metadata"]["trace_id"]
if request_data is not None:
trace_id = None
if (
request_data.get("metadata", {}).get("existing_trace_id", None)
is not None
):
trace_id = request_data["metadata"]["existing_trace_id"]
elif request_data.get("metadata", {}).get("trace_id", None) is not None:
trace_id = request_data["metadata"]["trace_id"]
elif request_data.get("litellm_logging_obj", None) is not None and hasattr(
request_data["litellm_logging_obj"], "model_call_details"
):
trace_id = request_data["litellm_logging_obj"].model_call_details[
"litellm_call_id"
]
if litellm.utils.langFuseLogger is not None:
base_url = litellm.utils.langFuseLogger.Langfuse.base_url
return f"{base_url}/trace/{trace_id}"
@@ -671,11 +686,19 @@ class SlackAlerting(CustomLogger):
)
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return
return
async def model_added_alert(self, model_name: str, litellm_model_name: str):
model_info = litellm.model_cost.get(litellm_model_name, {})
async def model_added_alert(
self, model_name: str, litellm_model_name: str, passed_model_info: Any
):
base_model_from_user = getattr(passed_model_info, "base_model", None)
model_info = {}
base_model = ""
if base_model_from_user is not None:
model_info = litellm.model_cost.get(base_model_from_user, {})
base_model = f"Base Model: `{base_model_from_user}`\n"
else:
model_info = litellm.model_cost.get(litellm_model_name, {})
model_info_str = ""
for k, v in model_info.items():
if k == "input_cost_per_token" or k == "output_cost_per_token":
@@ -687,6 +710,7 @@ class SlackAlerting(CustomLogger):
message = f"""
*🚅 New Model Added*
Model Name: `{model_name}`
{base_model}
Usage OpenAI Python SDK:
```
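`model_added_alert` now receives the model's info object and prefers a user-supplied `base_model` when looking up cost data, which matters when the proxy model name is an arbitrary deployment alias. A toy sketch of that branch, with a stand-in cost map:

```python
model_cost = {"azure/gpt-35-turbo": {"input_cost_per_token": 5e-07}}  # stand-in map

def lookup_cost(litellm_model_name: str, base_model_from_user=None):
    base_model = ""
    if base_model_from_user is not None:
        model_info = model_cost.get(base_model_from_user, {})
        base_model = f"Base Model: `{base_model_from_user}`\n"
    else:
        model_info = model_cost.get(litellm_model_name, {})
    return model_info, base_model

info, line = lookup_cost("my-azure-deployment", "azure/gpt-35-turbo")
assert info and line.startswith("Base Model:")
```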


@@ -21,7 +21,7 @@ class BaseLLM:
messages: list,
print_verbose,
encoding,
) -> litellm.utils.ModelResponse:
) -> Union[litellm.utils.ModelResponse, litellm.utils.CustomStreamWrapper]:
"""
Helper function to process the response across sync + async completion calls
"""


@@ -1,6 +1,6 @@
# What is this?
## Initial implementation of calling bedrock via httpx client (allows for async calls).
## V0 - just covers cohere command-r support
## V1 - covers cohere + anthropic claude-3 support
import os, types
import json
@@ -29,12 +29,20 @@ from litellm.utils import (
get_secret,
Logging,
)
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt, cohere_message_pt
import litellm, uuid
from .prompt_templates.factory import (
prompt_factory,
custom_prompt,
cohere_message_pt,
construct_tool_use_system_prompt,
extract_between_tags,
parse_xml_params,
contains_tag,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .base import BaseLLM
import httpx # type: ignore
from .bedrock import BedrockError, convert_messages_to_prompt
from .bedrock import BedrockError, convert_messages_to_prompt, ModelResponseIterator
from litellm.types.llms.bedrock import *
@@ -280,7 +288,8 @@ class BedrockLLM(BaseLLM):
messages: List,
print_verbose,
encoding,
) -> ModelResponse:
) -> Union[ModelResponse, CustomStreamWrapper]:
provider = model.split(".")[0]
## LOGGING
logging_obj.post_call(
input=messages,
@@ -297,26 +306,210 @@ class BedrockLLM(BaseLLM):
raise BedrockError(message=response.text, status_code=422)
try:
model_response.choices[0].message.content = completion_response["text"] # type: ignore
if provider == "cohere":
if "text" in completion_response:
outputText = completion_response["text"] # type: ignore
elif "generations" in completion_response:
outputText = completion_response["generations"][0]["text"]
model_response["finish_reason"] = map_finish_reason(
completion_response["generations"][0]["finish_reason"]
)
elif provider == "anthropic":
if model.startswith("anthropic.claude-3"):
json_schemas: dict = {}
_is_function_call = False
## Handle Tool Calling
if "tools" in optional_params:
_is_function_call = True
for tool in optional_params["tools"]:
json_schemas[tool["function"]["name"]] = tool[
"function"
].get("parameters", None)
outputText = completion_response.get("content")[0].get("text", None)
if outputText is not None and contains_tag(
"invoke", outputText
): # OUTPUT PARSE FUNCTION CALL
function_name = extract_between_tags("tool_name", outputText)[0]
function_arguments_str = extract_between_tags(
"invoke", outputText
)[0].strip()
function_arguments_str = (
f"<invoke>{function_arguments_str}</invoke>"
)
function_arguments = parse_xml_params(
function_arguments_str,
json_schema=json_schemas.get(
function_name, None
), # check if we have a json schema for this function name)
)
_message = litellm.Message(
tool_calls=[
{
"id": f"call_{uuid.uuid4()}",
"type": "function",
"function": {
"name": function_name,
"arguments": json.dumps(function_arguments),
},
}
],
content=None,
)
model_response.choices[0].message = _message # type: ignore
model_response._hidden_params["original_response"] = (
outputText # allow user to access raw anthropic tool calling response
)
if (
_is_function_call == True
and stream is not None
and stream == True
):
print_verbose(
f"INSIDE BEDROCK STREAMING TOOL CALLING CONDITION BLOCK"
)
# return an iterator
streaming_model_response = ModelResponse(stream=True)
streaming_model_response.choices[0].finish_reason = getattr(
model_response.choices[0], "finish_reason", "stop"
)
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
streaming_choice = litellm.utils.StreamingChoices()
streaming_choice.index = model_response.choices[0].index
_tool_calls = []
print_verbose(
f"type of model_response.choices[0]: {type(model_response.choices[0])}"
)
print_verbose(
f"type of streaming_choice: {type(streaming_choice)}"
)
if isinstance(model_response.choices[0], litellm.Choices):
if getattr(
model_response.choices[0].message, "tool_calls", None
) is not None and isinstance(
model_response.choices[0].message.tool_calls, list
):
for tool_call in model_response.choices[
0
].message.tool_calls:
_tool_call = {**tool_call.dict(), "index": 0}
_tool_calls.append(_tool_call)
delta_obj = litellm.utils.Delta(
content=getattr(
model_response.choices[0].message, "content", None
),
role=model_response.choices[0].message.role,
tool_calls=_tool_calls,
)
streaming_choice.delta = delta_obj
streaming_model_response.choices = [streaming_choice]
completion_stream = ModelResponseIterator(
model_response=streaming_model_response
)
print_verbose(
f"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
)
return litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="cached_response",
logging_obj=logging_obj,
)
model_response["finish_reason"] = map_finish_reason(
completion_response.get("stop_reason", "")
)
_usage = litellm.Usage(
prompt_tokens=completion_response["usage"]["input_tokens"],
completion_tokens=completion_response["usage"]["output_tokens"],
total_tokens=completion_response["usage"]["input_tokens"]
+ completion_response["usage"]["output_tokens"],
)
setattr(model_response, "usage", _usage)
else:
outputText = completion_response["completion"]
model_response["finish_reason"] = completion_response["stop_reason"]
elif provider == "ai21":
outputText = (
completion_response.get("completions")[0].get("data").get("text")
)
elif provider == "meta":
outputText = completion_response["generation"]
elif provider == "mistral":
outputText = completion_response["outputs"][0]["text"]
model_response["finish_reason"] = completion_response["outputs"][0][
"stop_reason"
]
else: # amazon titan
outputText = completion_response.get("results")[0].get("outputText")
except Exception as e:
raise BedrockError(message=response.text, status_code=422)
raise BedrockError(
message="Error processing={}, Received error={}".format(
response.text, str(e)
),
status_code=422,
)
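The claude-3 branch above recovers tool calls by parsing XML out of the model's text; `extract_between_tags` and `parse_xml_params` come from `prompt_templates.factory`. A rough sketch of what that extraction does, assuming the helpers behave like simple tag matchers:

```python
import json
import re
import uuid

def extract_between_tags(tag: str, text: str) -> list:
    # Assumed helper behavior: return the body of each <tag>...</tag>.
    return re.findall(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)

output_text = (
    "<invoke><tool_name>get_weather</tool_name>"
    "<parameters><city>Paris</city></parameters></invoke>"
)
function_name = extract_between_tags("tool_name", output_text)[0]
arguments = {"city": extract_between_tags("city", output_text)[0]}
tool_call = {
    "id": f"call_{uuid.uuid4()}",
    "type": "function",
    "function": {"name": function_name, "arguments": json.dumps(arguments)},
}
assert tool_call["function"]["name"] == "get_weather"
```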
try:
if (
len(outputText) > 0
and hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
is None
):
model_response["choices"][0]["message"]["content"] = outputText
elif (
hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
is not None
):
pass
else:
raise Exception()
except:
raise BedrockError(
message=json.dumps(outputText), status_code=response.status_code
)
if stream and provider == "ai21":
streaming_model_response = ModelResponse(stream=True)
streaming_model_response.choices[0].finish_reason = model_response.choices[ # type: ignore
0
].finish_reason
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
streaming_choice = litellm.utils.StreamingChoices()
streaming_choice.index = model_response.choices[0].index
delta_obj = litellm.utils.Delta(
content=getattr(model_response.choices[0].message, "content", None),
role=model_response.choices[0].message.role,
)
streaming_choice.delta = delta_obj
streaming_model_response.choices = [streaming_choice]
mri = ModelResponseIterator(model_response=streaming_model_response)
return CustomStreamWrapper(
completion_stream=mri,
model=model,
custom_llm_provider="cached_response",
logging_obj=logging_obj,
)
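Both the claude-3 tool-call path and this ai21 path reuse the same "fake streaming" trick: wrap one already-complete response in an iterator and hand it to `CustomStreamWrapper` with `custom_llm_provider="cached_response"`. A minimal sketch of the iterator's role (the real class is `ModelResponseIterator`, imported from `bedrock.py`):

```python
class OneShotIterator:
    # Yields the finished response exactly once, then stops -- enough to
    # satisfy a streaming consumer.
    def __init__(self, model_response):
        self._response = model_response
        self._consumed = False

    def __iter__(self):
        return self

    def __next__(self):
        if self._consumed:
            raise StopIteration
        self._consumed = True
        return self._response

chunks = list(OneShotIterator({"text": "full answer", "is_finished": True}))
assert len(chunks) == 1
```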
## CALCULATING USAGE - bedrock returns usage in the headers
prompt_tokens = int(
response.headers.get(
"x-amzn-bedrock-input-token-count",
len(encoding.encode("".join(m.get("content", "") for m in messages))),
)
bedrock_input_tokens = response.headers.get(
"x-amzn-bedrock-input-token-count", None
)
bedrock_output_tokens = response.headers.get(
"x-amzn-bedrock-output-token-count", None
)
prompt_tokens = int(
bedrock_input_tokens or litellm.token_counter(messages=messages)
)
completion_tokens = int(
response.headers.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response.choices[0].message.content, # type: ignore
disallowed_special=(),
)
),
bedrock_output_tokens
or litellm.token_counter(
text=model_response.choices[0].message.content, # type: ignore
count_response_tokens=True,
)
)
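Usage accounting above now tolerates missing Bedrock headers: counts come from the `x-amzn-bedrock-*-token-count` headers when present and fall back to litellm's own tokenizer otherwise. A sketch with a stand-in counter:

```python
def count_tokens_locally(text: str) -> int:
    return len(text.split())  # stand-in for litellm.token_counter

headers = {"x-amzn-bedrock-input-token-count": "42"}  # hypothetical response headers

bedrock_input_tokens = headers.get("x-amzn-bedrock-input-token-count", None)
prompt_tokens = int(bedrock_input_tokens or count_tokens_locally("hi there"))
assert prompt_tokens == 42  # header wins

bedrock_output_tokens = headers.get("x-amzn-bedrock-output-token-count", None)
completion_tokens = int(bedrock_output_tokens or count_tokens_locally("fine thanks"))
assert completion_tokens == 2  # header absent -> local count
```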
@@ -359,6 +552,7 @@ class BedrockLLM(BaseLLM):
## SETUP ##
stream = optional_params.pop("stream", None)
provider = model.split(".")[0]
## CREDENTIALS ##
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
@@ -414,19 +608,18 @@ class BedrockLLM(BaseLLM):
else:
endpoint_url = f"https://bedrock-runtime.{aws_region_name}.amazonaws.com"
if stream is not None and stream == True:
if (stream is not None and stream == True) and provider != "ai21":
endpoint_url = f"{endpoint_url}/model/{model}/invoke-with-response-stream"
else:
endpoint_url = f"{endpoint_url}/model/{model}/invoke"
sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)
provider = model.split(".")[0]
prompt, chat_history = self.convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
)
inference_params = copy.deepcopy(optional_params)
json_schemas: dict = {}
if provider == "cohere":
if model.startswith("cohere.command-r"):
## LOAD CONFIG
@@ -453,8 +646,114 @@ class BedrockLLM(BaseLLM):
True # cohere requires stream = True in inference params
)
data = json.dumps({"prompt": prompt, **inference_params})
elif provider == "anthropic":
if model.startswith("anthropic.claude-3"):
# Separate system prompt from rest of message
system_prompt_idx: list[int] = []
system_messages: list[str] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
system_messages.append(message["content"])
system_prompt_idx.append(idx)
if len(system_prompt_idx) > 0:
inference_params["system"] = "\n".join(system_messages)
messages = [
i for j, i in enumerate(messages) if j not in system_prompt_idx
]
# Format rest of message according to anthropic guidelines
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic_xml"
) # type: ignore
## LOAD CONFIG
config = litellm.AmazonAnthropicClaude3Config.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
## Handle Tool Calling
if "tools" in inference_params:
_is_function_call = True
for tool in inference_params["tools"]:
json_schemas[tool["function"]["name"]] = tool["function"].get(
"parameters", None
)
tool_calling_system_prompt = construct_tool_use_system_prompt(
tools=inference_params["tools"]
)
inference_params["system"] = (
inference_params.get("system", "\n")
+ tool_calling_system_prompt
) # add the anthropic tool calling prompt to the system prompt
inference_params.pop("tools")
data = json.dumps({"messages": messages, **inference_params})
else:
## LOAD CONFIG
config = litellm.AmazonAnthropicConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({"prompt": prompt, **inference_params})
elif provider == "ai21":
## LOAD CONFIG
config = litellm.AmazonAI21Config.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({"prompt": prompt, **inference_params})
elif provider == "mistral":
## LOAD CONFIG
config = litellm.AmazonMistralConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({"prompt": prompt, **inference_params})
elif provider == "amazon": # amazon titan
## LOAD CONFIG
config = litellm.AmazonTitanConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps(
{
"inputText": prompt,
"textGenerationConfig": inference_params,
}
)
elif provider == "meta":
## LOAD CONFIG
config = litellm.AmazonLlamaConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({"prompt": prompt, **inference_params})
else:
raise Exception("UNSUPPORTED PROVIDER")
## LOGGING
logging_obj.pre_call(
input=messages,
api_key="",
additional_args={
"complete_input_dict": inference_params,
},
)
raise Exception(
"Bedrock HTTPX: Unsupported provider={}, model={}".format(
provider, model
)
)
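Every provider branch above merges its `*Config.get_config()` defaults the same way: a config key is applied only when the caller did not pass it, so per-call parameters always win. A sketch of that merge rule:

```python
config = {"max_tokens_to_sample": 256, "top_k": 3}  # stand-in provider defaults
inference_params = {"top_k": 7}                     # caller-supplied params

for k, v in config.items():
    if k not in inference_params:  # completion(top_k=7) beats config(top_k=3)
        inference_params[k] = v

assert inference_params == {"top_k": 7, "max_tokens_to_sample": 256}
```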
## COMPLETION CALL
@@ -482,7 +781,7 @@ class BedrockLLM(BaseLLM):
if acompletion:
if isinstance(client, HTTPHandler):
client = None
if stream:
if stream == True and provider != "ai21":
return self.async_streaming(
model=model,
messages=messages,
@@ -511,7 +810,7 @@ class BedrockLLM(BaseLLM):
encoding=encoding,
logging_obj=logging_obj,
optional_params=optional_params,
stream=False,
stream=stream, # type: ignore
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=prepped.headers,
@@ -528,7 +827,7 @@ class BedrockLLM(BaseLLM):
self.client = HTTPHandler(**_params) # type: ignore
else:
self.client = client
if stream is not None and stream == True:
if (stream is not None and stream == True) and provider != "ai21":
response = self.client.post(
url=prepped.url,
headers=prepped.headers, # type: ignore
@@ -541,7 +840,7 @@ class BedrockLLM(BaseLLM):
status_code=response.status_code, message=response.text
)
decoder = AWSEventStreamDecoder()
decoder = AWSEventStreamDecoder(model=model)
completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
streaming_response = CustomStreamWrapper(
@@ -550,15 +849,24 @@ class BedrockLLM(BaseLLM):
custom_llm_provider="bedrock",
logging_obj=logging_obj,
)
## LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response=streaming_response,
additional_args={"complete_input_dict": data},
)
return streaming_response
response = self.client.post(url=prepped.url, headers=prepped.headers, data=data) # type: ignore
try:
response = self.client.post(url=prepped.url, headers=prepped.headers, data=data) # type: ignore
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=response.text)
except httpx.TimeoutException as e:
raise BedrockError(status_code=408, message="Timeout error occurred.")
return self.process_response(
model=model,
@@ -591,7 +899,7 @@ class BedrockLLM(BaseLLM):
logger_fn=None,
headers={},
client: Optional[AsyncHTTPHandler] = None,
) -> ModelResponse:
) -> Union[ModelResponse, CustomStreamWrapper]:
if client is None:
_params = {}
if timeout is not None:
@@ -602,12 +910,20 @@ class BedrockLLM(BaseLLM):
else:
self.client = client # type: ignore
response = await self.client.post(api_base, headers=headers, data=data) # type: ignore
try:
response = await self.client.post(api_base, headers=headers, data=data) # type: ignore
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=response.text)
except httpx.TimeoutException as e:
raise BedrockError(status_code=408, message="Timeout error occurred.")
return self.process_response(
model=model,
response=response,
model_response=model_response,
stream=stream,
stream=stream if isinstance(stream, bool) else False,
logging_obj=logging_obj,
api_key="",
data=data,
@@ -650,7 +966,7 @@ class BedrockLLM(BaseLLM):
if response.status_code != 200:
raise BedrockError(status_code=response.status_code, message=response.text)
decoder = AWSEventStreamDecoder()
decoder = AWSEventStreamDecoder(model=model)
completion_stream = decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024))
streaming_response = CustomStreamWrapper(
@@ -659,6 +975,15 @@ class BedrockLLM(BaseLLM):
custom_llm_provider="bedrock",
logging_obj=logging_obj,
)
## LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response=streaming_response,
additional_args={"complete_input_dict": data},
)
return streaming_response
def embedding(self, *args, **kwargs):
@@ -676,11 +1001,70 @@ def get_response_stream_shape():
class AWSEventStreamDecoder:
def __init__(self) -> None:
def __init__(self, model: str) -> None:
from botocore.parsers import EventStreamJSONParser
self.model = model
self.parser = EventStreamJSONParser()
def _chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
text = ""
is_finished = False
finish_reason = ""
if "outputText" in chunk_data:
text = chunk_data["outputText"]
# ai21 mapping
if "ai21" in self.model: # fake ai21 streaming
text = chunk_data.get("completions")[0].get("data").get("text") # type: ignore
is_finished = True
finish_reason = "stop"
######## bedrock.anthropic mappings ###############
elif "completion" in chunk_data: # not claude-3
text = chunk_data["completion"] # bedrock.anthropic
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
elif "delta" in chunk_data:
if chunk_data["delta"].get("text", None) is not None:
text = chunk_data["delta"]["text"]
stop_reason = chunk_data["delta"].get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
######## bedrock.mistral mappings ###############
elif "outputs" in chunk_data:
if (
len(chunk_data["outputs"]) == 1
and chunk_data["outputs"][0].get("text", None) is not None
):
text = chunk_data["outputs"][0]["text"]
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
######## bedrock.cohere mappings ###############
# meta mapping
elif "generation" in chunk_data:
text = chunk_data["generation"] # bedrock.meta
# cohere mapping
elif "text" in chunk_data:
text = chunk_data["text"] # bedrock.cohere
# cohere mapping for finish reason
elif "finish_reason" in chunk_data:
finish_reason = chunk_data["finish_reason"]
is_finished = True
elif chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
return GenericStreamingChunk(
**{
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
}
)
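`_chunk_parser` distinguishes providers purely by which key appears in the decoded JSON. A trimmed sketch of that dispatch, with plain dicts in place of `GenericStreamingChunk` and the finish-reason handling reduced to the cohere case:

```python
def parse_chunk(chunk_data: dict) -> dict:
    text, is_finished, finish_reason = "", False, ""
    if "outputText" in chunk_data:        # amazon titan
        text = chunk_data["outputText"]
    elif "completion" in chunk_data:      # anthropic, pre claude-3
        text = chunk_data["completion"]
    elif "delta" in chunk_data:           # anthropic claude-3
        text = chunk_data["delta"].get("text") or ""
    elif "generation" in chunk_data:      # meta llama
        text = chunk_data["generation"]
    elif "text" in chunk_data:            # cohere
        text = chunk_data["text"]
    elif "finish_reason" in chunk_data:   # cohere finish marker
        is_finished, finish_reason = True, chunk_data["finish_reason"]
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

assert parse_chunk({"generation": "hi"}) == {
    "text": "hi", "is_finished": False, "finish_reason": ""
}
```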
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:
"""Given an iterator that yields lines, iterate over it & yield every event encountered"""
from botocore.eventstream import EventStreamBuffer
@@ -693,12 +1077,7 @@ class AWSEventStreamDecoder:
if message:
# sse_event = ServerSentEvent(data=message, event="completion")
_data = json.loads(message)
streaming_chunk: GenericStreamingChunk = GenericStreamingChunk(
text=_data.get("text", ""),
is_finished=_data.get("is_finished", False),
finish_reason=_data.get("finish_reason", ""),
)
yield streaming_chunk
yield self._chunk_parser(chunk_data=_data)
async def aiter_bytes(
self, iterator: AsyncIterator[bytes]
@@ -713,12 +1092,7 @@ class AWSEventStreamDecoder:
message = self._parse_message_from_event(event)
if message:
_data = json.loads(message)
streaming_chunk: GenericStreamingChunk = GenericStreamingChunk(
text=_data.get("text", ""),
is_finished=_data.get("is_finished", False),
finish_reason=_data.get("finish_reason", ""),
)
yield streaming_chunk
yield self._chunk_parser(chunk_data=_data)
def _parse_message_from_event(self, event) -> Optional[str]:
response_dict = event.to_response_dict()


@@ -326,7 +326,7 @@ async def acompletion(
or custom_llm_provider == "sagemaker"
or custom_llm_provider == "anthropic"
or custom_llm_provider == "predibase"
or (custom_llm_provider == "bedrock" and "cohere" in model)
or custom_llm_provider == "bedrock"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
init_response = await loop.run_in_executor(None, func_with_context)
@@ -368,6 +368,8 @@ async def acompletion(
async def _async_streaming(response, model, custom_llm_provider, args):
try:
print_verbose(f"received response in _async_streaming: {response}")
if asyncio.iscoroutine(response):
response = await response
async for line in response:
print_verbose(f"line in async streaming: {line}")
yield line
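The new `asyncio.iscoroutine` guard covers providers whose async entrypoint returns a coroutine rather than an async iterator; it is awaited once before iteration. A self-contained sketch:

```python
import asyncio

async def fake_stream():                 # stand-in async iterator of chunks
    for line in ("chunk-1", "chunk-2"):
        yield line

async def provider_call():               # some providers return a coroutine
    return fake_stream()

async def consume(response):
    if asyncio.iscoroutine(response):
        response = await response        # unwrap to the real iterator first
    return [line async for line in response]

print(asyncio.run(consume(provider_call())))  # ['chunk-1', 'chunk-2']
```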
@@ -1979,23 +1981,9 @@ def completion(
# boto3 reads keys from .env
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
if "cohere" in model:
response = bedrock_chat_completion.completion(
model=model,
messages=messages,
custom_prompt_dict=litellm.custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
extra_headers=extra_headers,
timeout=timeout,
acompletion=acompletion,
)
else:
if (
"aws_bedrock_client" in optional_params
): # use old bedrock flow for aws_bedrock_client users.
response = bedrock.completion(
model=model,
messages=messages,
@@ -2031,7 +2019,22 @@ def completion(
custom_llm_provider="bedrock",
logging_obj=logging,
)
else:
response = bedrock_chat_completion.completion(
model=model,
messages=messages,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
extra_headers=extra_headers,
timeout=timeout,
acompletion=acompletion,
)
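The routing rule in `completion()` is now inverted: only callers that pass their own boto3 client via `aws_bedrock_client` stay on the legacy bedrock path; everyone else goes through the new httpx-based `bedrock_chat_completion`. A sketch of the dispatch:

```python
def pick_bedrock_flow(optional_params: dict) -> str:
    # Only explicit boto3-client users keep the old flow.
    if "aws_bedrock_client" in optional_params:
        return "legacy-boto3"
    return "bedrock-httpx"

assert pick_bedrock_flow({}) == "bedrock-httpx"
assert pick_bedrock_flow({"aws_bedrock_client": object()}) == "legacy-boto3"
```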
if optional_params.get("stream", False):
## LOGGING
logging.post_call(

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f04e46b02318b660.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[4858,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"884\",\"static/chunks/884-7576ee407a2ecbe6.js\",\"931\",\"static/chunks/app/page-495003b4fc3648e1.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f04e46b02318b660.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"jE-EC3LDs6Y8P0wmind3t\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f04e46b02318b660.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[4858,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"884\",\"static/chunks/884-7576ee407a2ecbe6.js\",\"931\",\"static/chunks/app/page-f20fdea77aed85ba.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f04e46b02318b660.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"l-0LDfSCdaUCAbcLIx_QC\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>


@@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[4858,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","884","static/chunks/884-7576ee407a2ecbe6.js","931","static/chunks/app/page-495003b4fc3648e1.js"],""]
3:I[4858,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","884","static/chunks/884-7576ee407a2ecbe6.js","931","static/chunks/app/page-f20fdea77aed85ba.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["jE-EC3LDs6Y8P0wmind3t",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f04e46b02318b660.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["l-0LDfSCdaUCAbcLIx_QC",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f04e46b02318b660.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null


@@ -1,4 +1,9 @@
model_list:
- model_name: gpt-3.5-turbo-fake-model
litellm_params:
model: openai/my-fake-model
api_base: http://0.0.0.0:8080
api_key: ""
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/gpt-35-turbo
@@ -13,6 +18,3 @@ model_list:
router_settings:
enable_pre_call_checks: true
# general_settings:
# master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys


@@ -671,15 +671,21 @@ async def user_api_key_auth(
_end_user_object = None
end_user_params = {}
if "user" in request_data:
_end_user_object = await get_end_user_object(
end_user_id=request_data["user"],
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
)
if _end_user_object is not None:
end_user_params["allowed_model_region"] = (
_end_user_object.allowed_model_region
try:
_end_user_object = await get_end_user_object(
end_user_id=request_data["user"],
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
)
if _end_user_object is not None:
end_user_params["allowed_model_region"] = (
_end_user_object.allowed_model_region
)
except Exception as e:
verbose_proxy_logger.debug(
"Unable to find user in db. Error - {}".format(str(e))
)
pass
try:
is_master_key_valid = secrets.compare_digest(api_key, master_key) # type: ignore
@@ -4920,7 +4926,7 @@ async def token_counter(request: TokenCountRequest):
litellm_model_name or request.model
) # use litellm model name; if it's not available, fall back to request.model
_tokenizer_used = litellm.utils._select_tokenizer(model=model_to_use)
tokenizer_used = _tokenizer_used["type"]
tokenizer_used = str(_tokenizer_used["type"])
total_tokens = token_counter(
model=model_to_use,
text=prompt,
@@ -8134,6 +8140,7 @@ async def add_new_model(
await proxy_logging_obj.slack_alerting_instance.model_added_alert(
model_name=model_params.model_name,
litellm_model_name=_orignal_litellm_model_name,
passed_model_info=model_params.model_info,
)
except:
pass

File diff suppressed because it is too large


@@ -242,12 +242,24 @@ async def test_langfuse_masked_input_output(langfuse_client):
response = await create_async_task(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "This is a test"}],
metadata={"trace_id": _unique_trace_name, "mask_input": mask_value, "mask_output": mask_value},
mock_response="This is a test response"
metadata={
"trace_id": _unique_trace_name,
"mask_input": mask_value,
"mask_output": mask_value,
},
mock_response="This is a test response",
)
print(response)
expected_input = "redacted-by-litellm" if mask_value else {'messages': [{'content': 'This is a test', 'role': 'user'}]}
expected_output = "redacted-by-litellm" if mask_value else {'content': 'This is a test response', 'role': 'assistant'}
expected_input = (
"redacted-by-litellm"
if mask_value
else {"messages": [{"content": "This is a test", "role": "user"}]}
)
expected_output = (
"redacted-by-litellm"
if mask_value
else {"content": "This is a test response", "role": "assistant"}
)
langfuse_client.flush()
await asyncio.sleep(2)
@@ -262,6 +274,7 @@ async def test_langfuse_masked_input_output(langfuse_client):
assert generations[0].input == expected_input
assert generations[0].output == expected_output
@pytest.mark.asyncio
async def test_langfuse_logging_metadata(langfuse_client):
"""
@@ -523,7 +536,7 @@ def test_langfuse_logging_function_calling():
# test_langfuse_logging_function_calling()
def test_langfuse_existing_trace_id():
def test_aaalangfuse_existing_trace_id():
"""
When existing trace id is passed, don't set trace params -> prevents overwriting the trace
@@ -577,7 +590,7 @@ def test_langfuse_existing_trace_id():
"verbose": False,
"custom_llm_provider": "openai",
"api_base": "https://api.openai.com/v1/",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"litellm_call_id": None,
"model_alias_map": {},
"completion_call_id": None,
"metadata": None,
@@ -593,7 +606,7 @@ def test_langfuse_existing_trace_id():
"stream": False,
"user": None,
"call_type": "completion",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"litellm_call_id": None,
"completion_start_time": "2024-05-01 07:31:29.903685",
"temperature": 0.1,
"extra_body": {},
@@ -633,6 +646,8 @@ def test_langfuse_existing_trace_id():
trace_id = langfuse_response_object["trace_id"]
assert trace_id is not None
langfuse_client.flush()
time.sleep(2)


@@ -7,7 +7,7 @@ import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the, system path
) # Adds the parent directory to the system path
import pytest
import litellm
from litellm import embedding, completion, completion_cost, Timeout
@@ -2301,6 +2301,8 @@ def test_completion_azure_deployment_id():
# test_completion_azure_deployment_id()
import asyncio
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
@@ -2663,14 +2665,29 @@ def response_format_tests(response: litellm.ModelResponse):
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize(
"model",
[
"bedrock/cohere.command-r-plus-v1:0",
"anthropic.claude-3-sonnet-20240229-v1:0",
"anthropic.claude-instant-v1",
"bedrock/ai21.j2-mid",
"mistral.mistral-7b-instruct-v0:2",
"bedrock/amazon.titan-tg1-large",
"meta.llama3-8b-instruct-v1:0",
"cohere.command-text-v14",
],
)
@pytest.mark.asyncio
async def test_completion_bedrock_command_r(sync_mode):
async def test_completion_bedrock_httpx_models(sync_mode, model):
litellm.set_verbose = True
if sync_mode:
response = completion(
model="bedrock/cohere.command-r-plus-v1:0",
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
)
assert isinstance(response, litellm.ModelResponse)
@@ -2678,8 +2695,10 @@ async def test_completion_bedrock_command_r(sync_mode):
response_format_tests(response=response)
else:
response = await litellm.acompletion(
model="bedrock/cohere.command-r-plus-v1:0",
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
)
assert isinstance(response, litellm.ModelResponse)
@@ -2715,69 +2734,12 @@ def test_completion_bedrock_titan_null_response():
pytest.fail(f"An error occurred - {str(e)}")
def test_completion_bedrock_titan():
try:
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=messages,
temperature=0.2,
max_tokens=200,
top_p=0.8,
logger_fn=logger_fn,
)
# Add any assertions here to check the response
print(response)
except RateLimitError:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_titan()
def test_completion_bedrock_claude():
print("calling claude")
try:
response = completion(
model="anthropic.claude-instant-v1",
messages=messages,
max_tokens=10,
temperature=0.1,
logger_fn=logger_fn,
)
# Add any assertions here to check the response
print(response)
except RateLimitError:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_claude()
def test_completion_bedrock_cohere():
print("calling bedrock cohere")
litellm.set_verbose = True
try:
response = completion(
model="bedrock/cohere.command-text-v14",
messages=[{"role": "user", "content": "hi"}],
temperature=0.1,
max_tokens=10,
stream=True,
)
# Add any assertions here to check the response
print(response)
for chunk in response:
print(chunk)
except RateLimitError:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_cohere()
@@ -2800,23 +2762,6 @@ def test_completion_bedrock_cohere():
# pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_claude_stream()
# def test_completion_bedrock_ai21():
# try:
# litellm.set_verbose = False
# response = completion(
# model="bedrock/ai21.j2-mid",
# messages=messages,
# temperature=0.2,
# top_p=0.2,
# max_tokens=20
# )
# # Add any assertions here to check the response
# print(response)
# except RateLimitError:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
######## Test VLLM ########
# def test_completion_vllm():


@@ -558,7 +558,7 @@ async def test_async_chat_bedrock_stream():
continue
except:
pass
time.sleep(1)
await asyncio.sleep(1)
print(f"customHandler.errors: {customHandler.errors}")
assert len(customHandler.errors) == 0
litellm.callbacks = []


@@ -1041,14 +1041,27 @@ async def test_completion_replicate_llama3_streaming(sync_mode):
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize(
"model",
[
# "bedrock/cohere.command-r-plus-v1:0",
# "anthropic.claude-3-sonnet-20240229-v1:0",
# "anthropic.claude-instant-v1",
# "bedrock/ai21.j2-mid",
# "mistral.mistral-7b-instruct-v0:2",
# "bedrock/amazon.titan-tg1-large",
# "meta.llama3-8b-instruct-v1:0",
"cohere.command-text-v14"
],
)
@pytest.mark.asyncio
async def test_bedrock_cohere_command_r_streaming(sync_mode):
async def test_bedrock_httpx_streaming(sync_mode, model):
try:
litellm.set_verbose = True
if sync_mode:
final_chunk: Optional[litellm.ModelResponse] = None
response: litellm.CustomStreamWrapper = completion( # type: ignore
model="bedrock/cohere.command-r-plus-v1:0",
model=model,
messages=messages,
max_tokens=10, # type: ignore
stream=True,
@@ -1069,7 +1082,7 @@ async def test_bedrock_cohere_command_r_streaming(sync_mode):
raise Exception("Empty response received")
else:
response: litellm.CustomStreamWrapper = await litellm.acompletion( # type: ignore
model="bedrock/cohere.command-r-plus-v1:0",
model=model,
messages=messages,
max_tokens=100, # type: ignore
stream=True,


@@ -76,6 +76,9 @@ class ModelInfo(BaseModel):
db_model: bool = (
False # used for proxy - to separate models which are stored in the db vs. config.
)
base_model: Optional[str] = (
None # specify if the base model is azure/gpt-3.5-turbo etc for accurate cost tracking
)
def __init__(self, id: Optional[Union[str, int]] = None, **params):
if id is None:
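The new optional `base_model` field lets proxy admins pin cost tracking to the underlying model behind an arbitrary deployment name. A toy pydantic sketch of the shape (only the two fields shown here; the real `ModelInfo` has more fields and a custom `__init__`):

```python
from typing import Optional
from pydantic import BaseModel

class ModelInfoSketch(BaseModel):
    db_model: bool = False            # db-backed vs. config-backed model
    base_model: Optional[str] = None  # e.g. "azure/gpt-3.5-turbo" for cost tracking

info = ModelInfoSketch(base_model="azure/gpt-3.5-turbo")
assert info.base_model == "azure/gpt-3.5-turbo"
```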


@@ -3853,7 +3853,7 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = completion_response["ended"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
@@ -8676,7 +8676,7 @@ def exception_type(
llm_provider="bedrock",
response=original_exception.response,
)
if "Malformed input request" in error_str:
elif "Malformed input request" in error_str:
exception_mapping_worked = True
raise BadRequestError(
message=f"BedrockException - {error_str}",
@@ -8684,7 +8684,7 @@ def exception_type(
llm_provider="bedrock",
response=original_exception.response,
)
if (
elif (
"Unable to locate credentials" in error_str
or "The security token included in the request is invalid"
in error_str
@@ -8696,7 +8696,7 @@ def exception_type(
llm_provider="bedrock",
response=original_exception.response,
)
if "AccessDeniedException" in error_str:
elif "AccessDeniedException" in error_str:
exception_mapping_worked = True
raise PermissionDeniedError(
message=f"BedrockException PermissionDeniedError - {error_str}",
@@ -8704,7 +8704,7 @@ def exception_type(
llm_provider="bedrock",
response=original_exception.response,
)
if (
elif (
"throttlingException" in error_str
or "ThrottlingException" in error_str
):
@@ -8715,14 +8715,17 @@ def exception_type(
llm_provider="bedrock",
response=original_exception.response,
)
if "Connect timeout on endpoint URL" in error_str:
elif (
"Connect timeout on endpoint URL" in error_str
or "timed out" in error_str
):
exception_mapping_worked = True
raise Timeout(
message=f"BedrockException: Timeout Error - {error_str}",
model=model,
llm_provider="bedrock",
)
if hasattr(original_exception, "status_code"):
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
raise ServiceUnavailableError(
@@ -8760,6 +8763,49 @@ def exception_type(
model=model,
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
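Together the new `elif` arms give Bedrock a full status-code-to-exception mapping. A compact sketch of the table they implement (exception class names stand in for litellm's; the fallthrough default is an assumption):

```python
BEDROCK_STATUS_TO_EXCEPTION = {
    408: "Timeout",
    422: "BadRequestError",
    429: "RateLimitError",
    503: "ServiceUnavailableError",
    504: "Timeout",  # gateway timeout
}

def map_status(status_code: int) -> str:
    # Assumed default: unmapped codes fall through to a generic API error.
    return BEDROCK_STATUS_TO_EXCEPTION.get(status_code, "APIError")

assert map_status(429) == "RateLimitError"
```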
elif custom_llm_provider == "sagemaker":
if "Unable to locate credentials" in error_str:
exception_mapping_worked = True
@@ -10639,75 +10685,11 @@ class CustomStreamWrapper:
raise e
def handle_bedrock_stream(self, chunk):
if "cohere" in self.model:
return {
"text": chunk["text"],
"is_finished": chunk["is_finished"],
"finish_reason": chunk["finish_reason"],
}
if hasattr(chunk, "get"):
chunk = chunk.get("chunk")
chunk_data = json.loads(chunk.get("bytes").decode())
else:
chunk_data = json.loads(chunk.decode())
if chunk_data:
text = ""
is_finished = False
finish_reason = ""
if "outputText" in chunk_data:
text = chunk_data["outputText"]
# ai21 mapping
if "ai21" in self.model: # fake ai21 streaming
text = chunk_data.get("completions")[0].get("data").get("text")
is_finished = True
finish_reason = "stop"
######## bedrock.anthropic mappings ###############
elif "completion" in chunk_data: # not claude-3
text = chunk_data["completion"] # bedrock.anthropic
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
elif "delta" in chunk_data:
if chunk_data["delta"].get("text", None) is not None:
text = chunk_data["delta"]["text"]
stop_reason = chunk_data["delta"].get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
######## bedrock.mistral mappings ###############
elif "outputs" in chunk_data:
if (
len(chunk_data["outputs"]) == 1
and chunk_data["outputs"][0].get("text", None) is not None
):
text = chunk_data["outputs"][0]["text"]
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
######## bedrock.cohere mappings ###############
# meta mapping
elif "generation" in chunk_data:
text = chunk_data["generation"] # bedrock.meta
# cohere mapping
elif "text" in chunk_data:
text = chunk_data["text"] # bedrock.cohere
# cohere mapping for finish reason
elif "finish_reason" in chunk_data:
finish_reason = chunk_data["finish_reason"]
is_finished = True
elif chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
elif chunk.get("error", None):
raise Exception(chunk["error"])
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
}
return ""
return {
"text": chunk["text"],
"is_finished": chunk["is_finished"],
"finish_reason": chunk["finish_reason"],
}
def handle_sagemaker_stream(self, chunk):
if "data: [DONE]" in chunk:
@@ -11510,7 +11492,7 @@ class CustomStreamWrapper:
or self.custom_llm_provider == "replicate"
or self.custom_llm_provider == "cached_response"
or self.custom_llm_provider == "predibase"
or (self.custom_llm_provider == "bedrock" and "cohere" in self.model)
or self.custom_llm_provider == "bedrock"
or self.custom_llm_provider in litellm.openai_compatible_endpoints
):
async for chunk in self.completion_stream:


@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.37.15"
version = "1.37.16"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -79,7 +79,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.37.15"
version = "1.37.16"
version_files = [
"pyproject.toml:^version"
]


@ -129,7 +129,7 @@ async def test_check_num_callbacks():
set(all_litellm_callbacks_1) - set(all_litellm_callbacks_2),
)
assert num_callbacks_1 == num_callbacks_2
assert abs(num_callbacks_1 - num_callbacks_2) <= 4
await asyncio.sleep(30)
@ -142,7 +142,7 @@ async def test_check_num_callbacks():
set(all_litellm_callbacks_3) - set(all_litellm_callbacks_2),
)
assert num_callbacks_1 == num_callbacks_2 == num_callbacks_3
assert abs(num_callbacks_3 - num_callbacks_2) <= 4
@pytest.mark.asyncio
@ -183,7 +183,7 @@ async def test_check_num_callbacks_on_lowest_latency():
set(all_litellm_callbacks_2) - set(all_litellm_callbacks_1),
)
assert num_callbacks_1 == num_callbacks_2
assert abs(num_callbacks_1 - num_callbacks_2) <= 4
await asyncio.sleep(30)
@ -196,7 +196,7 @@ async def test_check_num_callbacks_on_lowest_latency():
set(all_litellm_callbacks_3) - set(all_litellm_callbacks_2),
)
assert num_callbacks_1 == num_callbacks_2 == num_callbacks_3
assert abs(num_callbacks_2 - num_callbacks_3) <= 4
assert num_alerts_1 == num_alerts_2 == num_alerts_3
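
The callback-count assertions above are relaxed from strict equality to a bounded difference, so the tests tolerate a few callbacks being registered between samples while still failing on unbounded growth. A minimal sketch of the pattern, with hypothetical counts:

# Hypothetical samples; the real tests count entries in litellm's callback lists.
num_callbacks_before, num_callbacks_after = 7, 9
assert abs(num_callbacks_before - num_callbacks_after) <= 4  # tolerate drift, catch leaks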

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f04e46b02318b660.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[4858,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"884\",\"static/chunks/884-7576ee407a2ecbe6.js\",\"931\",\"static/chunks/app/page-495003b4fc3648e1.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f04e46b02318b660.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"jE-EC3LDs6Y8P0wmind3t\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-de9c0fadf6a94b3b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f04e46b02318b660.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[4858,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"884\",\"static/chunks/884-7576ee407a2ecbe6.js\",\"931\",\"static/chunks/app/page-f20fdea77aed85ba.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f04e46b02318b660.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"l-0LDfSCdaUCAbcLIx_QC\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[4858,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","884","static/chunks/884-7576ee407a2ecbe6.js","931","static/chunks/app/page-495003b4fc3648e1.js"],""]
3:I[4858,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","884","static/chunks/884-7576ee407a2ecbe6.js","931","static/chunks/app/page-f20fdea77aed85ba.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["jE-EC3LDs6Y8P0wmind3t",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f04e46b02318b660.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["l-0LDfSCdaUCAbcLIx_QC",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f04e46b02318b660.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -121,6 +121,7 @@ const handleSubmit = async (formValues: Record<string, any>, accessToken: string
// Iterate through the key-value pairs in formValues
litellmParamsObj["model"] = litellm_model
let modelName: string = "";
console.log("formValues add deployment:", formValues);
for (const [key, value] of Object.entries(formValues)) {
if (value === '') {
continue;
@ -628,6 +629,7 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
let input_cost = "Undefined";
let output_cost = "Undefined";
let max_tokens = "Undefined";
let max_input_tokens = "Undefined";
let cleanedLitellmParams = {};
const getProviderFromModel = (model: string) => {
@ -664,6 +666,7 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
input_cost = model_info?.input_cost_per_token;
output_cost = model_info?.output_cost_per_token;
max_tokens = model_info?.max_tokens;
max_input_tokens = model_info?.max_input_tokens;
}
if (curr_model?.litellm_params) {
@ -689,6 +692,7 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
}
modelData.data[i].max_tokens = max_tokens;
modelData.data[i].max_input_tokens = max_input_tokens;
modelData.data[i].api_base = curr_model?.litellm_params?.api_base;
modelData.data[i].cleanedLitellmParams = cleanedLitellmParams;
@ -936,7 +940,7 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
<TableHeaderCell style={{ maxWidth: '200px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Extra litellm Params</TableHeaderCell>
<TableHeaderCell style={{ maxWidth: '85px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Input Price <p style={{ fontSize: '10px', color: 'gray' }}>/1M Tokens ($)</p></TableHeaderCell>
<TableHeaderCell style={{ maxWidth: '85px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Output Price <p style={{ fontSize: '10px', color: 'gray' }}>/1M Tokens ($)</p></TableHeaderCell>
<TableHeaderCell style={{ maxWidth: '85px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Max Tokens</TableHeaderCell>
<TableHeaderCell style={{ maxWidth: '120px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Max Tokens</TableHeaderCell>
<TableHeaderCell style={{ maxWidth: '50px', whiteSpace: 'normal', wordBreak: 'break-word' }}>Status</TableHeaderCell>
</TableRow>
</TableHead>
@ -970,7 +974,12 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
</TableCell>
<TableCell style={{ maxWidth: '80px', whiteSpace: 'normal', wordBreak: 'break-word' }}>{model.input_cost || model.litellm_params.input_cost_per_token || null}</TableCell>
<TableCell style={{ maxWidth: '80px', whiteSpace: 'normal', wordBreak: 'break-word' }}>{model.output_cost || model.litellm_params.output_cost_per_token || null}</TableCell>
<TableCell style={{ maxWidth: '100px', whiteSpace: 'normal', wordBreak: 'break-word' }}>{model.max_tokens}</TableCell>
<TableCell style={{ maxWidth: '120px', whiteSpace: 'normal', wordBreak: 'break-word' }}>
<p style={{ fontSize: '10px' }}>
Max Tokens: {model.max_tokens} <br />
Max Input Tokens: {model.max_input_tokens}
</p>
</TableCell>
<TableCell style={{ maxWidth: '100px', whiteSpace: 'normal', wordBreak: 'break-word' }}>
{model.model_info.db_model ? (
<Badge icon={CheckCircleIcon} size="xs" className="text-white">
@ -1114,13 +1123,22 @@ const handleEditSubmit = async (formValues: Record<string, any>) => {
</Form.Item>
}
{
selectedProvider == Providers.Azure && <Form.Item
label="Base Model"
name="base_model"
>
<TextInput placeholder="azure/gpt-3.5-turbo"/>
<Text>The actual model your Azure deployment uses. Used for accurate cost tracking. Select the name from <Link href="https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json" target="_blank">here</Link></Text>
</Form.Item>
selectedProvider == Providers.Azure &&
<div>
<Form.Item
label="Base Model"
name="base_model"
className="mb-0"
>
<TextInput placeholder="azure/gpt-3.5-turbo"/>
</Form.Item>
<Row>
<Col span={10}></Col>
<Col span={10}><Text className="mb-2">The actual model your Azure deployment uses. Used for accurate cost tracking. Select the name from <Link href="https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json" target="_blank">here</Link></Text></Col>
</Row>
</div>
}
{
selectedProvider == Providers.Bedrock && <Form.Item