forked from phoenix/litellm-mirror
LiteLLM Minor Fixes + Improvements (#5474)
* feat(proxy/_types.py): add lago billing to callbacks ui. Closes https://github.com/BerriAI/litellm/issues/5472
* fix(anthropic.py): return anthropic prompt caching information. Fixes https://github.com/BerriAI/litellm/issues/5364
* feat(bedrock/chat.py): support 'json_schema' for bedrock models. Closes https://github.com/BerriAI/litellm/issues/5434
* fix(bedrock/embed/embeddings.py): support async embeddings for amazon titan models
* fix: linting fixes
* fix: handle key errors
* fix(bedrock/chat.py): fix bedrock ai21 streaming object
* feat(bedrock/embed): support bedrock embedding optional params
* fix(databricks.py): fix usage chunk
* fix(internal_user_endpoints.py): apply internal user defaults, if user role updated. Fixes issue where user update wouldn't apply defaults
* feat(slack_alerting.py): provide multiple slack channels for a given alert type. Multiple channels might be interested in receiving an alert for a given type
* docs(alerting.md): add multiple channel alerting to docs
This commit is contained in:
parent
02f288a8a3
commit
f9e6507cd1
22 changed files with 720 additions and 209 deletions
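For context on the Bedrock `json_schema` change in this commit, here is a minimal sketch of how structured output could be requested through litellm once this lands. It assumes AWS credentials are already configured in the environment; the model id and schema below are illustrative assumptions, not taken from the diff.

```python
# Hedged sketch: exercise the new `json_schema` response_format support for
# Bedrock models. The schema name and shape below are hypothetical examples.
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # assumed model id
    messages=[{"role": "user", "content": "List three primary colors as JSON."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "colors",  # hypothetical schema name
            "schema": {
                "type": "object",
                "properties": {
                    "colors": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["colors"],
            },
        },
    },
)
print(response.choices[0].message.content)  # JSON string matching the schema
```

Under the hood, the new `AmazonConverseConfig.map_openai_params` code in this commit translates such a request into a single forced tool call, mirroring the Anthropic JSON-mode approach quoted in the diff below.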
@@ -1,12 +1,12 @@
 repos:
 - repo: local
   hooks:
-    # - id: mypy
-    #   name: mypy
-    #   entry: python3 -m mypy --ignore-missing-imports
-    #   language: system
-    #   types: [python]
-    #   files: ^litellm/
+    - id: mypy
+      name: mypy
+      entry: python3 -m mypy --ignore-missing-imports
+      language: system
+      types: [python]
+      files: ^litellm/
     - id: isort
       name: isort
       entry: isort
@@ -190,6 +190,36 @@ curl -i http://localhost:4000/v1/chat/completions \
 ```
 
 
+## Advanced - provide multiple slack channels for a given alert type
+
+Just add it like this - `alert_type: [<hook_url_channel_1>, <hook_url_channel_2>]`.
+
+1. Setup config.yaml
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  alerting: ["slack"]
+  alert_to_webhook_url: {
+    "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"]
+  }
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
+-H 'Authorization: Bearer sk-1234'
+```
+
+In case of error, check server logs for the error message!
+
 ## Advanced - Using MS Teams Webhooks
 
 MS Teams provides a slack compatible webhook url that you can use for alerting
@@ -900,6 +900,14 @@ from .llms.bedrock.common_utils import (
     AmazonMistralConfig,
     AmazonBedrockGlobalConfig,
 )
+from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
+from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
+    AmazonTitanMultimodalEmbeddingG1Config,
+)
+from .llms.bedrock.embed.amazon_titan_v2_transformation import (
+    AmazonTitanV2Config,
+)
+from .llms.bedrock.embed.cohere_transformation import BedrockCohereEmbeddingConfig
 from .llms.openai import (
     OpenAIConfig,
     OpenAITextCompletionConfig,
@@ -1514,7 +1514,9 @@ Model Info:
             self.alert_to_webhook_url is not None
             and alert_type in self.alert_to_webhook_url
         ):
-            slack_webhook_url = self.alert_to_webhook_url[alert_type]
+            slack_webhook_url: Optional[Union[str, List[str]]] = (
+                self.alert_to_webhook_url[alert_type]
+            )
         elif self.default_webhook_url is not None:
             slack_webhook_url = self.default_webhook_url
         else:

@@ -1525,11 +1527,32 @@ Model Info:
         payload = {"text": formatted_message}
         headers = {"Content-type": "application/json"}
 
-        response = await self.async_http_handler.post(
-            url=slack_webhook_url,
-            headers=headers,
-            data=json.dumps(payload),
-        )
+        async def send_to_webhook(url: str):
+            return await self.async_http_handler.post(
+                url=url,
+                headers=headers,
+                data=json.dumps(payload),
+            )
+
+        if isinstance(slack_webhook_url, list):
+            # Parallelize the calls if it's a list of URLs
+            responses = await asyncio.gather(
+                *[send_to_webhook(url) for url in slack_webhook_url]
+            )
+
+            for response, url in zip(responses, slack_webhook_url):
+                if response.status_code == 200:
+                    pass
+                else:
+                    verbose_proxy_logger.debug(
+                        "Error sending slack alert to url={}. Error={}".format(
+                            url, response.text
+                        )
+                    )
+        else:
+            # Single call if it's a single URL
+            response = await send_to_webhook(slack_webhook_url)
+
         if response.status_code == 200:
             pass
         else:

@@ -1718,7 +1741,9 @@ Model Info:
         try:
             from calendar import monthrange
 
-            from litellm.proxy.proxy_server import _get_spend_report_for_time_range
+            from litellm.proxy.spend_tracking.spend_management_endpoints import (
+                _get_spend_report_for_time_range,
+            )
 
             todays_date = datetime.datetime.now().date()
             first_day_of_month = todays_date.replace(day=1)

@@ -1763,7 +1788,7 @@ Model Info:
                 alerting_metadata={},
             )
         except Exception as e:
-            verbose_proxy_logger.error("Error sending weekly spend report %s", e)
+            verbose_proxy_logger.exception("Error sending weekly spend report %s", e)
 
     async def send_fallback_stats_from_prometheus(self):
         """
@@ -30,6 +30,7 @@ from litellm.llms.custom_httpx.http_handler import (
 )
 from litellm.types.llms.anthropic import (
     AnthopicMessagesAssistantMessageParam,
+    AnthropicChatCompletionUsageBlock,
     AnthropicFinishReason,
     AnthropicMessagesRequest,
     AnthropicMessagesTool,

@@ -1177,6 +1178,30 @@ class ModelResponseIterator:
             return True
         return False
 
+    def _handle_usage(
+        self, anthropic_usage_chunk: dict
+    ) -> AnthropicChatCompletionUsageBlock:
+        special_fields = ["input_tokens", "output_tokens"]
+
+        usage_block = AnthropicChatCompletionUsageBlock(
+            prompt_tokens=anthropic_usage_chunk.get("input_tokens", 0),
+            completion_tokens=anthropic_usage_chunk.get("output_tokens", 0),
+            total_tokens=anthropic_usage_chunk.get("input_tokens", 0)
+            + anthropic_usage_chunk.get("output_tokens", 0),
+        )
+
+        if "cache_creation_input_tokens" in anthropic_usage_chunk:
+            usage_block["cache_creation_input_tokens"] = anthropic_usage_chunk[
+                "cache_creation_input_tokens"
+            ]
+
+        if "cache_read_input_tokens" in anthropic_usage_chunk:
+            usage_block["cache_read_input_tokens"] = anthropic_usage_chunk[
+                "cache_read_input_tokens"
+            ]
+
+        return usage_block
+
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
             type_chunk = chunk.get("type", "") or ""

@@ -1252,12 +1277,7 @@ class ModelResponseIterator:
                     finish_reason=message_delta["delta"].get("stop_reason", "stop")
                     or "stop"
                 )
-                usage = ChatCompletionUsageBlock(
-                    prompt_tokens=message_delta["usage"].get("input_tokens", 0),
-                    completion_tokens=message_delta["usage"].get("output_tokens", 0),
-                    total_tokens=message_delta["usage"].get("input_tokens", 0)
-                    + message_delta["usage"].get("output_tokens", 0),
-                )
+                usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"])
                 is_finished = True
             elif type_chunk == "message_start":
                 """

@@ -1280,19 +1300,8 @@ class ModelResponseIterator:
                 }
                 """
                 message_start_block = MessageStartBlock(**chunk)  # type: ignore
-                usage = ChatCompletionUsageBlock(
-                    prompt_tokens=message_start_block["message"]
-                    .get("usage", {})
-                    .get("input_tokens", 0),
-                    completion_tokens=message_start_block["message"]
-                    .get("usage", {})
-                    .get("output_tokens", 0),
-                    total_tokens=message_start_block["message"]
-                    .get("usage", {})
-                    .get("input_tokens", 0)
-                    + message_start_block["message"]
-                    .get("usage", {})
-                    .get("output_tokens", 0),
+                usage = self._handle_usage(
+                    anthropic_usage_chunk=message_start_block["message"]["usage"]
                 )
             elif type_chunk == "error":
                 """
@@ -43,6 +43,10 @@ from litellm.types.llms.openai import (
     ChatCompletionResponseMessage,
     ChatCompletionToolCallChunk,
     ChatCompletionToolCallFunctionChunk,
+    ChatCompletionToolChoiceFunctionParam,
+    ChatCompletionToolChoiceObjectParam,
+    ChatCompletionToolParam,
+    ChatCompletionToolParamFunctionChunk,
     ChatCompletionUsageBlock,
 )
 from litellm.types.utils import GenericStreamingChunk as GChunk

@@ -1152,6 +1156,7 @@ class AmazonConverseConfig:
             "temperature",
             "top_p",
             "extra_headers",
+            "response_format",
         ]
 
         if (

@@ -1210,6 +1215,48 @@ class AmazonConverseConfig:
         drop_params: bool,
     ) -> dict:
         for param, value in non_default_params.items():
+            if param == "response_format":
+                json_schema: Optional[dict] = None
+                schema_name: str = ""
+                if "response_schema" in value:
+                    json_schema = value["response_schema"]
+                    schema_name = "json_tool_call"
+                elif "json_schema" in value:
+                    json_schema = value["json_schema"]["schema"]
+                    schema_name = value["json_schema"]["name"]
+                """
+                Follow similar approach to anthropic - translate to a single tool call.
+
+                When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode
+                - You usually want to provide a single tool
+                - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
+                - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
+                """
+                if json_schema is not None:
+                    _tool_choice = self.map_tool_choice_values(
+                        model=model, tool_choice="required", drop_params=drop_params  # type: ignore
+                    )
+
+                    _tool = ChatCompletionToolParam(
+                        type="function",
+                        function=ChatCompletionToolParamFunctionChunk(
+                            name=schema_name, parameters=json_schema
+                        ),
+                    )
+
+                    optional_params["tools"] = [_tool]
+                    optional_params["tool_choice"] = _tool_choice
+                    optional_params["json_mode"] = True
+                else:
+                    if litellm.drop_params is True or drop_params is True:
+                        pass
+                    else:
+                        raise litellm.utils.UnsupportedParamsError(
+                            message="Bedrock doesn't support response_format={}. To drop it from the call, set `litellm.drop_params = True.".format(
+                                value
+                            ),
+                            status_code=400,
+                        )
             if param == "max_tokens":
                 optional_params["maxTokens"] = value
             if param == "stream":

@@ -1263,7 +1310,7 @@ class BedrockConverseLLM(BaseAWSLLM):
             additional_args={"complete_input_dict": data},
         )
         print_verbose(f"raw model_response: {response.text}")
-
+        json_mode: Optional[bool] = optional_params.pop("json_mode", None)
         ## RESPONSE OBJECT
         try:
             completion_response = ConverseResponseBlock(**response.json())  # type: ignore

@@ -1332,6 +1379,7 @@ class BedrockConverseLLM(BaseAWSLLM):
                         name=response_tool_name,
                         arguments=json.dumps(content["toolUse"]["input"]),
                     )
+
                     _tool_response_chunk = ChatCompletionToolCallChunk(
                         id=content["toolUse"]["toolUseId"],
                         type="function",

@@ -1340,6 +1388,13 @@ class BedrockConverseLLM(BaseAWSLLM):
                     )
                     tools.append(_tool_response_chunk)
         chat_completion_message["content"] = content_str
+
+        if json_mode is True and tools is not None and len(tools) == 1:
+            # to support 'json_schema' logic on bedrock models
+            json_mode_content_str: Optional[str] = tools[0]["function"].get("arguments")
+            if json_mode_content_str is not None:
+                chat_completion_message["content"] = json_mode_content_str
+        else:
             chat_completion_message["tool_calls"] = tools
 
         ## CALCULATING USAGE - bedrock returns usage in the headers

@@ -1586,6 +1641,9 @@ class BedrockConverseLLM(BaseAWSLLM):
         supported_converse_params = AmazonConverseConfig.__annotations__.keys()
         supported_tool_call_params = ["tools", "tool_choice"]
         supported_guardrail_params = ["guardrailConfig"]
+        json_mode: Optional[bool] = inference_params.pop(
+            "json_mode", None
+        )  # used for handling json_schema
         ## TRANSFORMATION ##
 
         bedrock_messages: List[MessageBlock] = _bedrock_converse_messages_pt(

@@ -2028,8 +2086,14 @@ class MockResponseIterator:  # for returning ai21 streaming responses
                 text=chunk_data.choices[0].message.content or "",  # type: ignore
                 tool_use=None,
                 is_finished=True,
-                finish_reason=chunk_data.choices[0].finish_reason,  # type: ignore
-                usage=chunk_usage,  # type: ignore
+                finish_reason=map_finish_reason(
+                    finish_reason=chunk_data.choices[0].finish_reason or ""
+                ),
+                usage=ChatCompletionUsageBlock(
+                    prompt_tokens=chunk_usage.prompt_tokens,
+                    completion_tokens=chunk_usage.completion_tokens,
+                    total_tokens=chunk_usage.total_tokens,
+                ),
                 index=0,
             )
             return processed_chunk
@@ -15,8 +15,6 @@ from typing import List, Optional
 from litellm.types.llms.bedrock import (
     AmazonTitanG1EmbeddingRequest,
     AmazonTitanG1EmbeddingResponse,
-    AmazonTitanV2EmbeddingRequest,
-    AmazonTitanV2EmbeddingResponse,
 )
 from litellm.types.utils import Embedding, EmbeddingResponse, Usage
 

@@ -52,6 +50,14 @@ class AmazonTitanG1Config:
             and v is not None
         }
 
+    def get_supported_openai_params(self) -> List[str]:
+        return []
+
+    def map_openai_params(
+        self, non_default_params: dict, optional_params: dict
+    ) -> dict:
+        return optional_params
+
     def _transform_request(
         self, input: str, inference_params: dict
     ) -> AmazonTitanG1EmbeddingRequest:

@@ -80,70 +86,3 @@ class AmazonTitanG1Config:
             total_tokens=total_prompt_tokens,
         )
         return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)
-
-
-class AmazonTitanV2Config:
-    """
-    Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html
-
-    normalize: boolean - flag indicating whether or not to normalize the output embeddings. Defaults to true
-    dimensions: int - The number of dimensions the output embeddings should have. The following values are accepted: 1024 (default), 512, 256.
-    """
-
-    normalize: Optional[bool] = None
-    dimensions: Optional[int] = None
-
-    def __init__(
-        self, normalize: Optional[bool] = None, dimensions: Optional[int] = None
-    ) -> None:
-        locals_ = locals()
-        for key, value in locals_.items():
-            if key != "self" and value is not None:
-                setattr(self.__class__, key, value)
-
-    @classmethod
-    def get_config(cls):
-        return {
-            k: v
-            for k, v in cls.__dict__.items()
-            if not k.startswith("__")
-            and not isinstance(
-                v,
-                (
-                    types.FunctionType,
-                    types.BuiltinFunctionType,
-                    classmethod,
-                    staticmethod,
-                ),
-            )
-            and v is not None
-        }
-
-    def _transform_request(
-        self, input: str, inference_params: dict
-    ) -> AmazonTitanV2EmbeddingRequest:
-        return AmazonTitanV2EmbeddingRequest(inputText=input, **inference_params)  # type: ignore
-
-    def _transform_response(
-        self, response_list: List[dict], model: str
-    ) -> EmbeddingResponse:
-        total_prompt_tokens = 0
-
-        transformed_responses: List[Embedding] = []
-        for index, response in enumerate(response_list):
-            _parsed_response = AmazonTitanV2EmbeddingResponse(**response)  # type: ignore
-            transformed_responses.append(
-                Embedding(
-                    embedding=_parsed_response["embedding"],
-                    index=index,
-                    object="embedding",
-                )
-            )
-            total_prompt_tokens += _parsed_response["inputTextTokenCount"]
-
-        usage = Usage(
-            prompt_tokens=total_prompt_tokens,
-            completion_tokens=0,
-            total_tokens=total_prompt_tokens,
-        )
-        return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)
@@ -17,13 +17,36 @@ from litellm.types.utils import Embedding, EmbeddingResponse, Usage
 from litellm.utils import is_base64_encoded
 
 
-def _transform_request(
-    input: str, inference_params: dict
-) -> AmazonTitanMultimodalEmbeddingRequest:
+class AmazonTitanMultimodalEmbeddingG1Config:
+    """
+    Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-mm.html
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def get_supported_openai_params(self) -> List[str]:
+        return ["dimensions"]
+
+    def map_openai_params(
+        self, non_default_params: dict, optional_params: dict
+    ) -> dict:
+        for k, v in non_default_params.items():
+            if k == "dimensions":
+                optional_params["embeddingConfig"] = (
+                    AmazonTitanMultimodalEmbeddingConfig(outputEmbeddingLength=v)
+                )
+        return optional_params
+
+    def _transform_request(
+        self, input: str, inference_params: dict
+    ) -> AmazonTitanMultimodalEmbeddingRequest:
     ## check if b64 encoded str or not ##
     is_encoded = is_base64_encoded(input)
     if is_encoded:  # check if string is b64 encoded image or not
-        transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputImage=input)
+        transformed_request = AmazonTitanMultimodalEmbeddingRequest(
+            inputImage=input
+        )
     else:
         transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputText=input)
 

@@ -32,8 +55,9 @@ def _transform_request(
 
     return transformed_request
 
-
-def _transform_response(response_list: List[dict], model: str) -> EmbeddingResponse:
+    def _transform_response(
+        self, response_list: List[dict], model: str
+    ) -> EmbeddingResponse:
 
     total_prompt_tokens = 0
     transformed_responses: List[Embedding] = []

@@ -41,7 +65,9 @@ def _transform_response(response_list: List[dict], model: str) -> EmbeddingRespo
         _parsed_response = AmazonTitanMultimodalEmbeddingResponse(**response)  # type: ignore
         transformed_responses.append(
             Embedding(
-                embedding=_parsed_response["embedding"], index=index, object="embedding"
+                embedding=_parsed_response["embedding"],
+                index=index,
+                object="embedding",
             )
         )
         total_prompt_tokens += _parsed_response["inputTextTokenCount"]
@@ -56,6 +56,17 @@ class AmazonTitanV2Config:
             and v is not None
         }
 
+    def get_supported_openai_params(self) -> List[str]:
+        return ["dimensions"]
+
+    def map_openai_params(
+        self, non_default_params: dict, optional_params: dict
+    ) -> dict:
+        for k, v in non_default_params.items():
+            if k == "dimensions":
+                optional_params["dimensions"] = v
+        return optional_params
+
     def _transform_request(
         self, input: str, inference_params: dict
     ) -> AmazonTitanV2EmbeddingRequest:
@@ -11,9 +11,24 @@ from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingRe
 from litellm.types.utils import Embedding, EmbeddingResponse
 
 
-def _transform_request(
-    input: List[str], inference_params: dict
-) -> CohereEmbeddingRequest:
+class BedrockCohereEmbeddingConfig:
+    def __init__(self) -> None:
+        pass
+
+    def get_supported_openai_params(self) -> List[str]:
+        return ["encoding_format"]
+
+    def map_openai_params(
+        self, non_default_params: dict, optional_params: dict
+    ) -> dict:
+        for k, v in non_default_params.items():
+            if k == "encoding_format":
+                optional_params["embedding_types"] = v
+        return optional_params
+
+    def _transform_request(
+        self, input: List[str], inference_params: dict
+    ) -> CohereEmbeddingRequest:
     transformed_request = CohereEmbeddingRequest(
         texts=input,
         input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,  # type: ignore
@@ -16,6 +16,7 @@ from litellm.llms.cohere.embed import embedding as cohere_embedding
 from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     HTTPHandler,
+    _get_async_httpx_client,
     _get_httpx_client,
 )
 from litellm.types.llms.bedrock import AmazonEmbeddingRequest, CohereEmbeddingRequest

@@ -25,13 +26,10 @@ from ...base_aws_llm import BaseAWSLLM
 from ..common_utils import BedrockError, get_runtime_endpoint
 from .amazon_titan_g1_transformation import AmazonTitanG1Config
 from .amazon_titan_multimodal_transformation import (
-    _transform_request as amazon_multimodal_transform_request,
-)
-from .amazon_titan_multimodal_transformation import (
-    _transform_response as amazon_multimodal_transform_response,
+    AmazonTitanMultimodalEmbeddingG1Config,
 )
 from .amazon_titan_v2_transformation import AmazonTitanV2Config
-from .cohere_transformation import _transform_request as cohere_transform_request
+from .cohere_transformation import BedrockCohereEmbeddingConfig
 
 
 class BedrockEmbedding(BaseAWSLLM):

@@ -118,6 +116,35 @@ class BedrockEmbedding(BaseAWSLLM):
 
         return response.json()
 
+    async def _make_async_call(
+        self,
+        client: Optional[AsyncHTTPHandler],
+        timeout: Optional[Union[float, httpx.Timeout]],
+        api_base: str,
+        headers: dict,
+        data: dict,
+    ) -> dict:
+        if client is None or not isinstance(client, AsyncHTTPHandler):
+            _params = {}
+            if timeout is not None:
+                if isinstance(timeout, float) or isinstance(timeout, int):
+                    timeout = httpx.Timeout(timeout)
+                _params["timeout"] = timeout
+            client = _get_async_httpx_client(_params)  # type: ignore
+        else:
+            client = client
+
+        try:
+            response = await client.post(url=api_base, headers=headers, data=json.dumps(data))  # type: ignore
+            response.raise_for_status()
+        except httpx.HTTPStatusError as err:
+            error_code = err.response.status_code
+            raise BedrockError(status_code=error_code, message=response.text)
+        except httpx.TimeoutException:
+            raise BedrockError(status_code=408, message="Timeout error occurred.")
+
+        return response.json()
+
     def _single_func_embeddings(
         self,
         client: Optional[HTTPHandler],

@@ -186,9 +213,102 @@ class BedrockEmbedding(BaseAWSLLM):
 
         ## TRANSFORM RESPONSE ##
         if model == "amazon.titan-embed-image-v1":
-            returned_response = amazon_multimodal_transform_response(
+            returned_response = (
+                AmazonTitanMultimodalEmbeddingG1Config()._transform_response(
                 response_list=responses, model=model
             )
+            )
+        elif model == "amazon.titan-embed-text-v1":
+            returned_response = AmazonTitanG1Config()._transform_response(
+                response_list=responses, model=model
+            )
+        elif model == "amazon.titan-embed-text-v2:0":
+            returned_response = AmazonTitanV2Config()._transform_response(
+                response_list=responses, model=model
+            )
+
+        if returned_response is None:
+            raise Exception(
+                "Unable to map model response to known provider format. model={}".format(
+                    model
+                )
+            )
+
+        return returned_response
+
+    async def _async_single_func_embeddings(
+        self,
+        client: Optional[AsyncHTTPHandler],
+        timeout: Optional[Union[float, httpx.Timeout]],
+        batch_data: List[dict],
+        credentials: Any,
+        extra_headers: Optional[dict],
+        endpoint_url: str,
+        aws_region_name: str,
+        model: str,
+        logging_obj: Any,
+    ):
+        try:
+            import boto3
+            from botocore.auth import SigV4Auth
+            from botocore.awsrequest import AWSRequest
+            from botocore.credentials import Credentials
+        except ImportError:
+            raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
+
+        responses: List[dict] = []
+        for data in batch_data:
+            sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)
+            headers = {"Content-Type": "application/json"}
+            if extra_headers is not None:
+                headers = {"Content-Type": "application/json", **extra_headers}
+            request = AWSRequest(
+                method="POST", url=endpoint_url, data=json.dumps(data), headers=headers
+            )
+            sigv4.add_auth(request)
+            if (
+                extra_headers is not None and "Authorization" in extra_headers
+            ):  # prevent sigv4 from overwriting the auth header
+                request.headers["Authorization"] = extra_headers["Authorization"]
+            prepped = request.prepare()
+
+            ## LOGGING
+            logging_obj.pre_call(
+                input=data,
+                api_key="",
+                additional_args={
+                    "complete_input_dict": data,
+                    "api_base": prepped.url,
+                    "headers": prepped.headers,
+                },
+            )
+            response = await self._make_async_call(
+                client=client,
+                timeout=timeout,
+                api_base=prepped.url,
+                headers=prepped.headers,
+                data=data,
+            )
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=data,
+                api_key="",
+                original_response=response,
+                additional_args={"complete_input_dict": data},
+            )
+
+            responses.append(response)
+
+        returned_response: Optional[EmbeddingResponse] = None
+
+        ## TRANSFORM RESPONSE ##
+        if model == "amazon.titan-embed-image-v1":
+            returned_response = (
+                AmazonTitanMultimodalEmbeddingG1Config()._transform_response(
+                    response_list=responses, model=model
+                )
+            )
         elif model == "amazon.titan-embed-text-v1":
             returned_response = AmazonTitanG1Config()._transform_response(
                 response_list=responses, model=model

@@ -246,7 +366,7 @@ class BedrockEmbedding(BaseAWSLLM):
         data: Optional[CohereEmbeddingRequest] = None
         batch_data: Optional[List] = None
         if provider == "cohere":
-            data = cohere_transform_request(
+            data = BedrockCohereEmbeddingConfig()._transform_request(
                 input=input, inference_params=inference_params
             )
         elif provider == "amazon" and model in [

@@ -257,11 +377,11 @@ class BedrockEmbedding(BaseAWSLLM):
             batch_data = []
             for i in input:
                 if model == "amazon.titan-embed-image-v1":
-                    transformed_request: AmazonEmbeddingRequest = (
-                        amazon_multimodal_transform_request(
+                    transformed_request: (
+                        AmazonEmbeddingRequest
+                    ) = AmazonTitanMultimodalEmbeddingG1Config()._transform_request(
                         input=i, inference_params=inference_params
                     )
-                    )
                 elif model == "amazon.titan-embed-text-v1":
                     transformed_request = AmazonTitanG1Config()._transform_request(
                         input=i, inference_params=inference_params

@@ -283,6 +403,22 @@ class BedrockEmbedding(BaseAWSLLM):
         endpoint_url = f"{endpoint_url}/model/{modelId}/invoke"
 
         if batch_data is not None:
+            if aembedding:
+                return self._async_single_func_embeddings(  # type: ignore
+                    client=(
+                        client
+                        if client is not None and isinstance(client, AsyncHTTPHandler)
+                        else None
+                    ),
+                    timeout=timeout,
+                    batch_data=batch_data,
+                    credentials=credentials,
+                    extra_headers=extra_headers,
+                    endpoint_url=endpoint_url,
+                    aws_region_name=aws_region_name,
+                    model=model,
+                    logging_obj=logging_obj,
+                )
             return self._single_func_embeddings(
                 client=(
                     client
@@ -703,8 +703,16 @@ class ModelResponseIterator:
             is_finished = True
             finish_reason = processed_chunk.choices[0].finish_reason
 
-            if hasattr(processed_chunk, "usage"):
-                usage = processed_chunk.usage  # type: ignore
+            if hasattr(processed_chunk, "usage") and isinstance(
+                processed_chunk.usage, litellm.Usage
+            ):
+                usage_chunk: litellm.Usage = processed_chunk.usage
+
+                usage = ChatCompletionUsageBlock(
+                    prompt_tokens=usage_chunk.prompt_tokens,
+                    completion_tokens=usage_chunk.completion_tokens,
+                    total_tokens=usage_chunk.total_tokens,
+                )
 
             return GenericStreamingChunk(
                 text=text,
@@ -5431,6 +5431,9 @@ def stream_chunk_builder(
     # # Update usage information if needed
     prompt_tokens = 0
     completion_tokens = 0
+    ## anthropic prompt caching information ##
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
     for chunk in chunks:
         usage_chunk: Optional[Usage] = None
         if "usage" in chunk:

@@ -5442,6 +5445,13 @@ def stream_chunk_builder(
                 prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0
             if "completion_tokens" in usage_chunk:
                 completion_tokens = usage_chunk.get("completion_tokens", 0) or 0
+            if "cache_creation_input_tokens" in usage_chunk:
+                cache_creation_input_tokens = usage_chunk.get(
+                    "cache_creation_input_tokens"
+                )
+            if "cache_read_input_tokens" in usage_chunk:
+                cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens")
+
     try:
         response["usage"]["prompt_tokens"] = prompt_tokens or token_counter(
             model=model, messages=messages

@@ -5460,6 +5470,13 @@ def stream_chunk_builder(
         response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
     )
 
+    if cache_creation_input_tokens is not None:
+        response["usage"][
+            "cache_creation_input_tokens"
+        ] = cache_creation_input_tokens
+    if cache_read_input_tokens is not None:
+        response["usage"]["cache_read_input_tokens"] = cache_read_input_tokens
+
     return convert_to_model_response_object(
         response_object=response,
         model_response_object=model_response,
@@ -2,3 +2,16 @@ model_list:
   - model_name: "gpt-3.5-turbo"
     litellm_params:
       model: "gpt-3.5-turbo"
+
+litellm_settings:
+  max_internal_user_budget: 0.02 # amount in USD
+  internal_user_budget_duration: "1s" # reset every second
+
+general_settings:
+  master_key: sk-1234
+  alerting: ["slack"]
+  alerting_threshold: 0.0001 # (Seconds) set an artifically low threshold for testing alerting
+  alert_to_webhook_url: {
+    "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"]
+  }
+
@@ -1632,6 +1632,16 @@ class AllCallbacks(LiteLLMBase):
         ui_callback_name="Langsmith",
     )
 
+    lago: CallbackOnUI = CallbackOnUI(
+        litellm_callback_name="lago",
+        litellm_callback_params=[
+            "LAGO_API_BASE",
+            "LAGO_API_KEY",
+            "LAGO_API_EVENT_CODE",
+        ],
+        ui_callback_name="Lago Billing",
+    )
+
 
 class SpendLogsMetadata(TypedDict):
     """
@@ -505,6 +505,10 @@ async def user_update(
             ):  # models default to [], spend defaults to 0, we should not reset these values
                 non_default_values[k] = v
 
+        is_internal_user = False
+        if data.user_role == LitellmUserRoles.INTERNAL_USER:
+            is_internal_user = True
+
         if "budget_duration" in non_default_values:
             duration_s = _duration_in_seconds(
                 duration=non_default_values["budget_duration"]

@@ -512,6 +516,20 @@ async def user_update(
             user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
             non_default_values["budget_reset_at"] = user_reset_at
 
+        if "max_budget" not in non_default_values:
+            if (
+                is_internal_user and litellm.max_internal_user_budget is not None
+            ):  # applies internal user limits, if user role updated
+                non_default_values["max_budget"] = litellm.max_internal_user_budget
+
+        if (
+            "budget_duration" not in non_default_values
+        ):  # applies internal user limits, if user role updated
+            if is_internal_user and litellm.internal_user_budget_duration is not None:
+                non_default_values["budget_duration"] = (
+                    litellm.internal_user_budget_duration
+                )
+
         ## ADD USER, IF NEW ##
         verbose_proxy_logger.debug("/user/update: Received data = %s", data)
         if data.user_id is not None and len(data.user_id) > 0:
@@ -282,6 +282,82 @@ async def test_anthropic_api_prompt_caching_no_headers():
     )
 
 
+@pytest.mark.asyncio()
+async def test_anthropic_api_prompt_caching_streaming():
+    from litellm.tests.test_streaming import streaming_format_tests
+
+    response = await litellm.acompletion(
+        model="anthropic/claude-3-5-sonnet-20240620",
+        messages=[
+            # System Message
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Here is the full text of a complex legal agreement"
+                        * 400,
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ],
+            },
+            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What are the key terms and conditions in this agreement?",
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+            },
+            # The final turn is marked with cache-control, for continuing in followups.
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What are the key terms and conditions in this agreement?",
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ],
+            },
+        ],
+        temperature=0.2,
+        max_tokens=10,
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+
+    idx = 0
+    is_cache_read_input_tokens_in_usage = False
+    is_cache_creation_input_tokens_in_usage = False
+    async for chunk in response:
+        streaming_format_tests(idx=idx, chunk=chunk)
+        # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
+        if hasattr(chunk, "usage"):
+            print("Received final usage - {}".format(chunk.usage))
+        if hasattr(chunk, "usage") and hasattr(chunk.usage, "cache_read_input_tokens"):
+            is_cache_read_input_tokens_in_usage = True
+        if hasattr(chunk, "usage") and hasattr(
+            chunk.usage, "cache_creation_input_tokens"
+        ):
+            is_cache_creation_input_tokens_in_usage = True
+
+        idx += 1
+
+    print("response=", response)
+
+    assert (
+        is_cache_read_input_tokens_in_usage and is_cache_creation_input_tokens_in_usage
+    )
+
+
 @pytest.mark.asyncio
 async def test_litellm_anthropic_prompt_caching_system():
     # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
@@ -2172,7 +2172,14 @@ def test_completion_openai():
         pytest.fail(f"Error occurred: {e}")
 
 
-@pytest.mark.parametrize("model", ["gpt-4o-2024-08-06", "azure/chatgpt-v-2"])
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4o-2024-08-06",
+        "azure/chatgpt-v-2",
+        "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    ],
+)
 def test_completion_openai_pydantic(model):
     try:
         litellm.set_verbose = True

@@ -2201,7 +2208,7 @@ def test_completion_openai_pydantic(model):
             )
             break
         except litellm.JSONSchemaValidationError:
-            print("ERROR OCCURRED! INVALID JSON")
+            pytest.fail("ERROR OCCURRED! INVALID JSON")
 
     print("This is the response object\n", response)
 
@@ -319,9 +319,52 @@ async def test_cohere_embedding3(custom_llm_provider):
         "bedrock/amazon.titan-embed-text-v2:0",
     ],
 )
-@pytest.mark.parametrize("sync_mode", [True])
+@pytest.mark.parametrize("sync_mode", [True, False])  # ,
 @pytest.mark.asyncio
 async def test_bedrock_embedding_titan(model, sync_mode):
+    try:
+        # this tests if we support str input for bedrock embedding
+        litellm.set_verbose = True
+        litellm.enable_cache()
+        import time
+
+        current_time = str(time.time())
+        # DO NOT MAKE THE INPUT A LIST in this test
+        if sync_mode:
+            response = embedding(
+                model=model,
+                input=f"good morning from litellm, attempting to embed data {current_time}",  # input should always be a string in this test
+                aws_region_name="us-west-2",
+            )
+        else:
+            response = await litellm.aembedding(
+                model=model,
+                input=f"good morning from litellm, attempting to embed data {current_time}",  # input should always be a string in this test
+                aws_region_name="us-west-2",
+            )
+        print("response:", response)
+        assert isinstance(
+            response["data"][0]["embedding"], list
+        ), "Expected response to be a list"
+        print("type of first embedding:", type(response["data"][0]["embedding"][0]))
+        assert all(
+            isinstance(x, float) for x in response["data"][0]["embedding"]
+        ), "Expected response to be a list of floats"
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "bedrock/amazon.titan-embed-text-v1",
+        "bedrock/amazon.titan-embed-image-v1",
+        "bedrock/amazon.titan-embed-text-v2:0",
+    ],
+)
+@pytest.mark.parametrize("sync_mode", [True])  # True,
+@pytest.mark.asyncio
+async def test_bedrock_embedding_titan_caching(model, sync_mode):
     try:
         # this tests if we support str input for bedrock embedding
         litellm.set_verbose = True
@@ -70,13 +70,43 @@ def test_anthropic_optional_params(stop_sequence, expected_count):
 def test_bedrock_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(
-        user="John", encoding_format=None, custom_llm_provider="bedrock"
+        model="", user="John", encoding_format=None, custom_llm_provider="bedrock"
     )
     assert len(optional_params) == 0
 
 
+@pytest.mark.parametrize(
+    "model, expected_dimensions, dimensions_kwarg",
+    [
+        ("bedrock/amazon.titan-embed-text-v1", False, None),
+        ("bedrock/amazon.titan-embed-image-v1", True, "embeddingConfig"),
+        ("bedrock/amazon.titan-embed-text-v2:0", True, "dimensions"),
+        ("bedrock/cohere.embed-multilingual-v3", False, None),
+    ],
+)
+def test_bedrock_optional_params_embeddings_dimension(
+    model, expected_dimensions, dimensions_kwarg
+):
+    litellm.drop_params = True
+    optional_params = get_optional_params_embeddings(
+        model=model,
+        user="John",
+        encoding_format=None,
+        dimensions=20,
+        custom_llm_provider="bedrock",
+    )
+    if expected_dimensions:
+        assert len(optional_params) == 1
+    else:
+        assert len(optional_params) == 0
+
+    if dimensions_kwarg is not None:
+        assert dimensions_kwarg in optional_params
+
+
 def test_google_ai_studio_optional_params_embeddings():
     optional_params = get_optional_params_embeddings(
+        model="",
         user="John",
         encoding_format=None,
         custom_llm_provider="gemini",

@@ -88,7 +118,7 @@ def test_google_ai_studio_optional_params_embeddings():
 def test_openai_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(
-        user="John", encoding_format=None, custom_llm_provider="openai"
+        model="", user="John", encoding_format=None, custom_llm_provider="openai"
     )
     assert len(optional_params) == 1
     assert optional_params["user"] == "John"

@@ -97,7 +127,10 @@ def test_openai_optional_params_embeddings():
 def test_azure_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(
-        user="John", encoding_format=None, custom_llm_provider="azure"
+        model="chatgpt-v-2",
+        user="John",
+        encoding_format=None,
+        custom_llm_provider="azure",
     )
     assert len(optional_params) == 1
     assert optional_params["user"] == "John"

@@ -455,6 +488,7 @@ def test_get_optional_params_image_gen():
 
 def test_bedrock_optional_params_embeddings_provider_specific_params():
     optional_params = get_optional_params_embeddings(
+        model="my-custom-model",
        custom_llm_provider="huggingface",
        wait_for_model=True,
    )
@@ -287,3 +287,11 @@ class AnthropicResponse(BaseModel):
 
     usage: AnthropicResponseUsageBlock
     """Billing and rate-limit usage."""
+
+
+class AnthropicChatCompletionUsageBlock(TypedDict, total=False):
+    prompt_tokens: Required[int]
+    completion_tokens: Required[int]
+    total_tokens: Required[int]
+    cache_creation_input_tokens: int
+    cache_read_input_tokens: int
@@ -2550,7 +2550,7 @@ def get_optional_params_image_gen(
 
 def get_optional_params_embeddings(
     # 2 optional params
-    model=None,
+    model: str,
     user=None,
     encoding_format=None,
     dimensions=None,

@@ -2606,7 +2606,7 @@ def get_optional_params_embeddings(
         ):
             raise UnsupportedParamsError(
                 status_code=500,
-                message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
+                message="Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
             )
     if custom_llm_provider == "triton":
         keys = list(non_default_params.keys())

@@ -2641,39 +2641,57 @@ def get_optional_params_embeddings(
         )
         final_params = {**optional_params, **kwargs}
         return final_params
-    if custom_llm_provider == "vertex_ai":
-        if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
-                keys = list(non_default_params.keys())
-                for k in keys:
-                    non_default_params.pop(k, None)
-                final_params = {**non_default_params, **kwargs}
-                return final_params
-            raise UnsupportedParamsError(
-                status_code=500,
-                message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
-            )
     if custom_llm_provider == "bedrock":
         # if dimensions is in non_default_params -> pass it for model=bedrock/amazon.titan-embed-text-v2
-        if (
-            "dimensions" in non_default_params.keys()
-            and "amazon.titan-embed-text-v2" in model
-        ):
-            kwargs["dimensions"] = non_default_params["dimensions"]
-            non_default_params.pop("dimensions", None)
-
-        if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
-                keys = list(non_default_params.keys())
-                for k in keys:
-                    non_default_params.pop(k, None)
-                final_params = {**non_default_params, **kwargs}
-                return final_params
-            raise UnsupportedParamsError(
-                status_code=500,
-                message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
-            )
-        return {**non_default_params, **kwargs}
+        if "amazon.titan-embed-text-v1" in model:
+            object: Any = litellm.AmazonTitanG1Config()
+        elif "amazon.titan-embed-image-v1" in model:
+            object = litellm.AmazonTitanMultimodalEmbeddingG1Config()
+        elif "amazon.titan-embed-text-v2:0" in model:
+            object = litellm.AmazonTitanV2Config()
+        elif "cohere.embed-multilingual-v3" in model:
+            object = litellm.BedrockCohereEmbeddingConfig()
+        else:  # unmapped model
+            supported_params = []
+            _check_valid_arg(supported_params=supported_params)
+            final_params = {**kwargs}
+            return final_params
+
+        supported_params = object.get_supported_openai_params()
+        _check_valid_arg(supported_params=supported_params)
+        optional_params = object.map_openai_params(
+            non_default_params=non_default_params, optional_params={}
+        )
+        final_params = {**optional_params, **kwargs}
+        return final_params
+        # elif model == "amazon.titan-embed-image-v1":
+        #     supported_params = litellm.AmazonTitanG1Config().get_supported_openai_params()
+        #     _check_valid_arg(supported_params=supported_params)
+        #     optional_params = litellm.AmazonTitanG1Config().map_openai_params(
+        #         non_default_params=non_default_params, optional_params={}
+        #     )
+        #     final_params = {**optional_params, **kwargs}
+        #     return final_params
+
+        # if (
+        #     "dimensions" in non_default_params.keys()
+        #     and "amazon.titan-embed-text-v2" in model
+        # ):
+        #     kwargs["dimensions"] = non_default_params["dimensions"]
+        #     non_default_params.pop("dimensions", None)
+
+        # if len(non_default_params.keys()) > 0:
+        #     if litellm.drop_params is True:  # drop the unsupported non-default values
+        #         keys = list(non_default_params.keys())
+        #         for k in keys:
+        #             non_default_params.pop(k, None)
+        #         final_params = {**non_default_params, **kwargs}
+        #         return final_params
+        #     raise UnsupportedParamsError(
+        #         status_code=500,
+        #         message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
+        #     )
+        # return {**non_default_params, **kwargs}
+
     if custom_llm_provider == "mistral":
         supported_params = get_supported_openai_params(
             model=model,

@@ -9888,11 +9906,7 @@ class CustomStreamWrapper:
 
             if anthropic_response_obj["usage"] is not None:
                 model_response.usage = litellm.Usage(
-                    prompt_tokens=anthropic_response_obj["usage"]["prompt_tokens"],
-                    completion_tokens=anthropic_response_obj["usage"][
-                        "completion_tokens"
-                    ],
-                    total_tokens=anthropic_response_obj["usage"]["total_tokens"],
+                    **anthropic_response_obj["usage"]
                 )
 
             if (

@@ -10507,10 +10521,10 @@ class CustomStreamWrapper:
                         original_chunk.system_fingerprint
                     )
                     print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-                    if self.sent_first_chunk == False:
+                    if self.sent_first_chunk is False:
                         model_response.choices[0].delta["role"] = "assistant"
                         self.sent_first_chunk = True
-                    elif self.sent_first_chunk == True and hasattr(
+                    elif self.sent_first_chunk is True and hasattr(
                         model_response.choices[0].delta, "role"
                     ):
                         _initial_delta = model_response.choices[

@@ -10575,7 +10589,7 @@ class CustomStreamWrapper:
                     model_response.choices[0].delta.tool_calls is not None
                     or model_response.choices[0].delta.function_call is not None
                 ):
-                    if self.sent_first_chunk == False:
+                    if self.sent_first_chunk is False:
                         model_response.choices[0].delta["role"] = "assistant"
                         self.sent_first_chunk = True
                     return model_response