mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-27 11:43:54 +00:00
Add bedrock latency optimized inference support (#9623)
* fix(converse_transformation.py): add performanceConfig param support on bedrock. Closes https://github.com/BerriAI/litellm/issues/7606
* fix(converse_transformation.py): refactor to use a more flexible single getter for params which are separate config blocks
* test(test_main.py): add e2e mock test for bedrock performance config
* build(model_prices_and_context_window.json): add versioned multimodal embedding
* refactor(multimodal_embeddings/): migrate to config pattern
* feat(vertex_ai/multimodalembeddings): calculate usage for multimodal embedding calls. Enables cost calculation for multimodal embeddings
* feat(vertex_ai/multimodalembeddings): get usage object for embedding calls. Ensures accurate cost tracking for vertexai multimodal embedding calls
* fix(embedding_handler.py): remove unused imports
* fix: fix linting errors
* fix: handle response api usage calculation
* test(test_vertex_ai_multimodal_embedding_transformation.py): update tests
* test: mark flaky test
* feat(vertex_ai/multimodal_embeddings/transformation.py): support text+image+video input
* docs(vertex.md): document sending text + image to vertex multimodal embeddings
* test: remove incorrect file
* fix(multimodal_embeddings/transformation.py): fix linting error
* style: remove unused import
This commit is contained in:
parent
0742e6afd6
commit
5ac61a7572
19 changed files with 806 additions and 245 deletions
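Before the file-by-file diff, a minimal sketch of the headline feature as exercised by the mock e2e test added in this commit (the model and message are just the test's values):

```python
import litellm

# Pass Bedrock's performanceConfig block straight through litellm.completion;
# the converse transformation forwards it as a top-level request field.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    performanceConfig={"latency": "optimized"},
)
```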
@@ -2080,7 +2080,12 @@ print(response)
 ## **Multi-Modal Embeddings**
 
-Usage
+Known Limitations:
+
+- Only supports 1 image / video per request
+- Only supports GCS or base64 encoded images / videos
+
+### Usage
 
 <Tabs>
 <TabItem value="sdk" label="SDK">
@@ -2296,6 +2301,115 @@ print(f"Text Embedding: {embeddings.text_embedding}")
 </Tabs>
 
+### Text + Image + Video Embeddings
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+Text + Image
+
+```python
+response = await litellm.aembedding(
+    model="vertex_ai/multimodalembedding@001",
+    input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"]  # the URI is sent as a GCS image
+)
+```
+
+Text + Video
+
+```python
+response = await litellm.aembedding(
+    model="vertex_ai/multimodalembedding@001",
+    input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"]  # the URI is sent as a GCS video
+)
+```
+
+Image + Video
+
+```python
+response = await litellm.aembedding(
+    model="vertex_ai/multimodalembedding@001",
+    input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"]  # sent as a GCS image and a GCS video
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
+
+1. Add model to config.yaml
+
+```yaml
+model_list:
+  - model_name: multimodalembedding@001
+    litellm_params:
+      model: vertex_ai/multimodalembedding@001
+      vertex_project: "adroit-crow-413218"
+      vertex_location: "us-central1"
+      vertex_credentials: adroit-crow-413218-a956eef1a2a8.json
+
+litellm_settings:
+  drop_params: True
+```
+
+2. Start the proxy
+
+```
+$ litellm --config /path/to/config.yaml
+```
+
+3. Make the request using the OpenAI Python SDK or the Langchain Python SDK
+
+Text + Image
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.embeddings.create(
+    model="multimodalembedding@001",
+    input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"],
+)
+
+print(response)
+```
+
+Text + Video
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.embeddings.create(
+    model="multimodalembedding@001",
+    input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"],
+)
+
+print(response)
+```
+
+Image + Video
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.embeddings.create(
+    model="multimodalembedding@001",
+    input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"],
+)
+
+print(response)
+```
+
+</TabItem>
+</Tabs>
+
 ## **Image Generation Models**
 
 Usage
@@ -2,7 +2,7 @@
 ## File for 'response_cost' calculation in Logging
 import time
 from functools import lru_cache
-from typing import Any, List, Literal, Optional, Tuple, Union
+from typing import Any, List, Literal, Optional, Tuple, Union, cast
 
 from pydantic import BaseModel
 
@@ -462,13 +462,36 @@ def _model_contains_known_llm_provider(model: str) -> bool:
 def _get_usage_object(
     completion_response: Any,
 ) -> Optional[Usage]:
-    usage_obj: Optional[Usage] = None
-    if completion_response is not None and isinstance(
-        completion_response, ModelResponse
-    ):
-        usage_obj = completion_response.get("usage")
-    return usage_obj
+    usage_obj = cast(
+        Union[Usage, ResponseAPIUsage, dict, BaseModel],
+        (
+            completion_response.get("usage")
+            if isinstance(completion_response, dict)
+            else getattr(completion_response, "get", lambda x: None)("usage")
+        ),
+    )
+
+    if usage_obj is None:
+        return None
+    if isinstance(usage_obj, Usage):
+        return usage_obj
+    elif (
+        usage_obj is not None
+        and (isinstance(usage_obj, dict) or isinstance(usage_obj, ResponseAPIUsage))
+        and ResponseAPILoggingUtils._is_response_api_usage(usage_obj)
+    ):
+        return ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
+            usage_obj
+        )
+    elif isinstance(usage_obj, dict):
+        return Usage(**usage_obj)
+    elif isinstance(usage_obj, BaseModel):
+        return Usage(**usage_obj.model_dump())
+    else:
+        verbose_logger.debug(
+            f"Unknown usage object type: {type(usage_obj)}, usage_obj: {usage_obj}"
+        )
+        return None
 
 
 def _is_known_usage_objects(usage_obj):
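For intuition, the rewritten getter normalizes several usage shapes; a small sketch of the behavior, with invented token counts and an assumed import path:

```python
from litellm.cost_calculator import _get_usage_object  # assumed import path

# Responses-API-style payload (invented numbers): dict access finds "usage",
# _is_response_api_usage recognizes input_tokens/output_tokens, and the helper
# returns a chat-style Usage with prompt_tokens=10, completion_tokens=5.
usage = _get_usage_object(
    {"usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}}
)

# A response whose usage is already a chat Usage object is returned unchanged;
# unknown shapes fall through to the debug log and return None.
```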
@@ -662,6 +685,7 @@ def completion_cost(  # noqa: PLR0915
     elif len(prompt) > 0:
         prompt_tokens = token_counter(model=model, text=prompt)
         completion_tokens = token_counter(model=model, text=completion)
 
     if model is None:
         raise ValueError(
             f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
@@ -121,6 +121,31 @@ def _get_completion_token_base_cost(model_info: ModelInfo, usage: Usage) -> float:
     return model_info["output_cost_per_token"]
 
 
+def calculate_cost_component(
+    model_info: ModelInfo, cost_key: str, usage_value: Optional[float]
+) -> float:
+    """
+    Generic cost calculator for any usage component
+
+    Args:
+        model_info: Dictionary containing cost information
+        cost_key: The key for the cost multiplier in model_info (e.g., 'input_cost_per_audio_token')
+        usage_value: The actual usage value (e.g., number of tokens, characters, seconds)
+
+    Returns:
+        float: The calculated cost
+    """
+    cost_per_unit = model_info.get(cost_key)
+    if (
+        cost_per_unit is not None
+        and isinstance(cost_per_unit, float)
+        and usage_value is not None
+        and usage_value > 0
+    ):
+        return float(usage_value) * cost_per_unit
+    return 0.0
+
+
 def generic_cost_per_token(
     model: str, usage: Usage, custom_llm_provider: str
 ) -> Tuple[float, float]:
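A quick standalone sketch of the helper's contract, using a hypothetical pricing dict in place of litellm's ModelInfo:

```python
from typing import Optional

def calculate_cost_component(model_info: dict, cost_key: str, usage_value: Optional[float]) -> float:
    # Same logic as the function above, inlined so the example runs standalone.
    cost_per_unit = model_info.get(cost_key)
    if (
        cost_per_unit is not None
        and isinstance(cost_per_unit, float)
        and usage_value is not None
        and usage_value > 0
    ):
        return float(usage_value) * cost_per_unit
    return 0.0

model_info = {"input_cost_per_character": 2e-07}  # hypothetical pricing
print(calculate_cost_component(model_info, "input_cost_per_character", 100))   # 100 chars -> 2e-05
print(calculate_cost_component(model_info, "input_cost_per_image", 1))         # key missing -> 0.0
print(calculate_cost_component(model_info, "input_cost_per_character", None))  # no usage -> 0.0
```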
@@ -136,6 +161,7 @@ def generic_cost_per_token(
     Returns:
         Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
 
     ## GET MODEL INFO
     model_info = get_model_info(model=model, custom_llm_provider=custom_llm_provider)
 
@@ -146,6 +172,9 @@
     text_tokens = usage.prompt_tokens
     cache_hit_tokens = 0
     audio_tokens = 0
+    character_count = 0
+    image_count = 0
+    video_length_seconds = 0
     if usage.prompt_tokens_details:
         cache_hit_tokens = (
             cast(
@@ -163,6 +192,24 @@
             cast(Optional[int], getattr(usage.prompt_tokens_details, "audio_tokens", 0))
             or 0
         )
+        character_count = (
+            cast(
+                Optional[int],
+                getattr(usage.prompt_tokens_details, "character_count", 0),
+            )
+            or 0
+        )
+        image_count = (
+            cast(Optional[int], getattr(usage.prompt_tokens_details, "image_count", 0))
+            or 0
+        )
+        video_length_seconds = (
+            cast(
+                Optional[int],
+                getattr(usage.prompt_tokens_details, "video_length_seconds", 0),
+            )
+            or 0
+        )
 
     ## EDGE CASE - text tokens not set inside PromptTokensDetails
     if text_tokens == 0:
@@ -172,27 +219,37 @@
 
     prompt_cost = float(text_tokens) * prompt_base_cost
 
-    _cache_read_input_token_cost = model_info.get("cache_read_input_token_cost")
-
     ### CACHE READ COST
-    if (
-        _cache_read_input_token_cost is not None
-        and cache_hit_tokens is not None
-        and cache_hit_tokens > 0
-    ):
-        prompt_cost += float(cache_hit_tokens) * _cache_read_input_token_cost
+    prompt_cost += calculate_cost_component(
+        model_info, "cache_read_input_token_cost", cache_hit_tokens
+    )
 
     ### AUDIO COST
-    audio_token_cost = model_info.get("input_cost_per_audio_token")
-    if audio_token_cost is not None and audio_tokens is not None and audio_tokens > 0:
-        prompt_cost += float(audio_tokens) * audio_token_cost
+    prompt_cost += calculate_cost_component(
+        model_info, "input_cost_per_audio_token", audio_tokens
+    )
 
     ### CACHE WRITING COST
-    _cache_creation_input_token_cost = model_info.get("cache_creation_input_token_cost")
-    if _cache_creation_input_token_cost is not None:
-        prompt_cost += (
-            float(usage._cache_creation_input_tokens) * _cache_creation_input_token_cost
-        )
+    prompt_cost += calculate_cost_component(
+        model_info,
+        "cache_creation_input_token_cost",
+        usage._cache_creation_input_tokens,
+    )
+
+    ### CHARACTER COST
+    prompt_cost += calculate_cost_component(
+        model_info, "input_cost_per_character", character_count
+    )
+
+    ### IMAGE COUNT COST
+    prompt_cost += calculate_cost_component(
+        model_info, "input_cost_per_image", image_count
+    )
+
+    ### VIDEO LENGTH COST
+    prompt_cost += calculate_cost_component(
+        model_info, "input_cost_per_video_per_second", video_length_seconds
+    )
 
     ## CALCULATE OUTPUT COST
@@ -66,6 +66,13 @@ class AmazonConverseConfig(BaseConfig):
     def custom_llm_provider(self) -> Optional[str]:
         return "bedrock_converse"
 
+    @classmethod
+    def get_config_blocks(cls) -> dict:
+        return {
+            "guardrailConfig": GuardrailConfigBlock,
+            "performanceConfig": PerformanceConfigBlock,
+        }
+
     @classmethod
     def get_config(cls):
         return {
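With both blocks registered, a single call can carry any combination of them; a hedged sketch (the guardrail identifiers are placeholders, not real resources):

```python
import litellm

# guardrailIdentifier / guardrailVersion are placeholders; they must point at
# a real Bedrock guardrail in your account for the call to succeed.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
    messages=[{"role": "user", "content": "hi"}],
    guardrailConfig={"guardrailIdentifier": "gr-abc123", "guardrailVersion": "DRAFT"},
    performanceConfig={"latency": "optimized"},
)
```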
@@ -394,7 +401,6 @@ class AmazonConverseConfig(BaseConfig):
         optional_params: dict,
         messages: Optional[List[AllMessageValues]] = None,
     ) -> CommonRequestObject:
 
         ## VALIDATE REQUEST
         """
         Bedrock doesn't support tool calling without `tools=` param specified.
@@ -420,11 +426,11 @@ class AmazonConverseConfig(BaseConfig):
             AmazonConverseConfig.__annotations__.keys()
         ) + ["top_k"]
         supported_tool_call_params = ["tools", "tool_choice"]
-        supported_guardrail_params = ["guardrailConfig"]
+        supported_config_params = list(self.get_config_blocks().keys())
         total_supported_params = (
             supported_converse_params
             + supported_tool_call_params
-            + supported_guardrail_params
+            + supported_config_params
         )
         inference_params.pop("json_mode", None)  # used for handling json_schema
 
@@ -463,12 +469,11 @@ class AmazonConverseConfig(BaseConfig):
             ),
         }
 
-        # Guardrail Config
-        guardrail_config: Optional[GuardrailConfigBlock] = None
-        request_guardrails_config = inference_params.pop("guardrailConfig", None)
-        if request_guardrails_config is not None:
-            guardrail_config = GuardrailConfigBlock(**request_guardrails_config)
-            data["guardrailConfig"] = guardrail_config
+        # Handle all config blocks
+        for config_name, config_class in self.get_config_blocks().items():
+            config_value = inference_params.pop(config_name, None)
+            if config_value is not None:
+                data[config_name] = config_class(**config_value)  # type: ignore
 
         # Tool Config
         if bedrock_tool_config is not None:
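Under this loop, a request passing performanceConfig={"latency": "optimized"} ends up with the block attached top-level on the Converse request body; an illustrative shape, not captured output:

```python
# Illustrative request body only; field names other than performanceConfig
# are assumptions about the surrounding CommonRequestObject.
data = {
    "messages": [...],                # transformed chat messages
    "system": [],
    "inferenceConfig": {},
    "additionalModelRequestFields": {},
    "performanceConfig": {"latency": "optimized"},  # popped from inference_params, re-attached here
}
```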
@@ -239,6 +239,7 @@ def cost_per_token(
     Raises:
         Exception if model requires >128k pricing, but model cost not mapped
     """
 
     ## GET MODEL INFO
     model_info = litellm.get_model_info(
         model=model, custom_llm_provider=custom_llm_provider
@@ -1,5 +1,5 @@
 import json
-from typing import List, Literal, Optional, Union
+from typing import Literal, Optional, Union
 
 import httpx
 
@@ -14,15 +14,11 @@ from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexAIError,
     VertexLLM,
 )
-from litellm.types.llms.vertex_ai import (
-    Instance,
-    InstanceImage,
-    InstanceVideo,
-    MultimodalPredictions,
-    VertexMultimodalEmbeddingRequest,
-)
-from litellm.types.utils import Embedding, EmbeddingResponse
-from litellm.utils import is_base64_encoded
+from litellm.types.utils import EmbeddingResponse
+
+from .transformation import VertexAIMultimodalEmbeddingConfig
+
+vertex_multimodal_embedding_handler = VertexAIMultimodalEmbeddingConfig()
 
 
 class VertexMultimodalEmbedding(VertexLLM):
@@ -41,9 +37,11 @@ class VertexMultimodalEmbedding(VertexLLM):
         model_response: EmbeddingResponse,
         custom_llm_provider: Literal["gemini", "vertex_ai"],
         optional_params: dict,
+        litellm_params: dict,
         logging_obj: LiteLLMLoggingObj,
         api_key: Optional[str] = None,
         api_base: Optional[str] = None,
+        headers: dict = {},
         encoding=None,
         vertex_project=None,
         vertex_location=None,
@@ -86,31 +84,18 @@ class VertexMultimodalEmbedding(VertexLLM):
         else:
             sync_handler = client  # type: ignore
 
-        optional_params = optional_params or {}
-
-        request_data = VertexMultimodalEmbeddingRequest()
-
-        if "instances" in optional_params:
-            request_data["instances"] = optional_params["instances"]
-        elif isinstance(input, list):
-            vertex_instances: List[Instance] = self.process_openai_embedding_input(
-                _input=input
-            )
-            request_data["instances"] = vertex_instances
-
-        else:
-            # construct instances
-            vertex_request_instance = Instance(**optional_params)
-
-            if isinstance(input, str):
-                vertex_request_instance = self._process_input_element(input)
-
-            request_data["instances"] = [vertex_request_instance]
-
-        headers = {
-            "Content-Type": "application/json; charset=utf-8",
-            "Authorization": f"Bearer {auth_header}",
-        }
+        request_data = vertex_multimodal_embedding_handler.transform_embedding_request(
+            model, input, optional_params, headers
+        )
+
+        headers = vertex_multimodal_embedding_handler.validate_environment(
+            headers=headers,
+            model=model,
+            messages=[],
+            optional_params=optional_params,
+            api_key=auth_header,
+            api_base=api_base,
+        )
 
         ## LOGGING
         logging_obj.pre_call(
|
||||||
headers=headers,
|
headers=headers,
|
||||||
client=client,
|
client=client,
|
||||||
model_response=model_response,
|
model_response=model_response,
|
||||||
|
optional_params=optional_params,
|
||||||
|
litellm_params=litellm_params,
|
||||||
|
logging_obj=logging_obj,
|
||||||
|
api_key=api_key,
|
||||||
)
|
)
|
||||||
|
|
||||||
response = sync_handler.post(
|
response = sync_handler.post(
|
||||||
|
@@ -140,34 +129,30 @@ class VertexMultimodalEmbedding(VertexLLM):
             data=json.dumps(request_data),
         )
 
-        if response.status_code != 200:
-            raise Exception(f"Error: {response.status_code} {response.text}")
-
-        _json_response = response.json()
-        if "predictions" not in _json_response:
-            raise litellm.InternalServerError(
-                message=f"embedding response does not contain 'predictions', got {_json_response}",
-                llm_provider="vertex_ai",
-                model=model,
-            )
-        _predictions = _json_response["predictions"]
-        vertex_predictions = MultimodalPredictions(predictions=_predictions)
-        model_response.data = self.transform_embedding_response_to_openai(
-            predictions=vertex_predictions
-        )
-        model_response.model = model
-
-        return model_response
+        return vertex_multimodal_embedding_handler.transform_embedding_response(
+            model=model,
+            raw_response=response,
+            model_response=model_response,
+            logging_obj=logging_obj,
+            api_key=api_key,
+            request_data=request_data,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+        )
 
     async def async_multimodal_embedding(
         self,
         model: str,
         api_base: str,
-        data: VertexMultimodalEmbeddingRequest,
+        optional_params: dict,
+        litellm_params: dict,
+        data: dict,
         model_response: litellm.EmbeddingResponse,
         timeout: Optional[Union[float, httpx.Timeout]],
+        logging_obj: LiteLLMLoggingObj,
         headers={},
         client: Optional[AsyncHTTPHandler] = None,
+        api_key: Optional[str] = None,
     ) -> litellm.EmbeddingResponse:
         if client is None:
             _params = {}
@@ -191,112 +176,13 @@ class VertexMultimodalEmbedding(VertexLLM):
         except httpx.TimeoutException:
             raise VertexAIError(status_code=408, message="Timeout error occurred.")
 
-        _json_response = response.json()
-        if "predictions" not in _json_response:
-            raise litellm.InternalServerError(
-                message=f"embedding response does not contain 'predictions', got {_json_response}",
-                llm_provider="vertex_ai",
-                model=model,
-            )
-        _predictions = _json_response["predictions"]
-
-        vertex_predictions = MultimodalPredictions(predictions=_predictions)
-        model_response.data = self.transform_embedding_response_to_openai(
-            predictions=vertex_predictions
-        )
-        model_response.model = model
-
-        return model_response
-
-    def _process_input_element(self, input_element: str) -> Instance:
-        """
-        Process the input element for multimodal embedding requests. Checks whether the input is a GCS URI, a base64 encoded image, or plain text.
-
-        Args:
-            input_element (str): The input element to process.
-
-        Returns:
-            Instance: A dictionary representing the processed input element.
-        """
-        if len(input_element) == 0:
-            return Instance(text=input_element)
-        elif "gs://" in input_element:
-            if "mp4" in input_element:
-                return Instance(video=InstanceVideo(gcsUri=input_element))
-            else:
-                return Instance(image=InstanceImage(gcsUri=input_element))
-        elif is_base64_encoded(s=input_element):
-            return Instance(
-                image=InstanceImage(
-                    bytesBase64Encoded=(
-                        input_element.split(",")[1]
-                        if "," in input_element
-                        else input_element
-                    )
-                )
-            )
-        else:
-            return Instance(text=input_element)
-
-    def process_openai_embedding_input(
-        self, _input: Union[list, str]
-    ) -> List[Instance]:
-        """
-        Process the input for multimodal embedding requests.
-
-        Args:
-            _input (Union[list, str]): The input data to process.
-
-        Returns:
-            List[Instance]: A list of processed VertexAI Instance objects.
-        """
-        _input_list = None
-        if not isinstance(_input, list):
-            _input_list = [_input]
-        else:
-            _input_list = _input
-
-        processed_instances = []
-        for element in _input_list:
-            if isinstance(element, str):
-                instance = Instance(**self._process_input_element(element))
-            elif isinstance(element, dict):
-                instance = Instance(**element)
-            else:
-                raise ValueError(f"Unsupported input type: {type(element)}")
-            processed_instances.append(instance)
-
-        return processed_instances
-
-    def transform_embedding_response_to_openai(
-        self, predictions: MultimodalPredictions
-    ) -> List[Embedding]:
-        openai_embeddings: List[Embedding] = []
-        if "predictions" in predictions:
-            for idx, _prediction in enumerate(predictions["predictions"]):
-                if _prediction:
-                    if "textEmbedding" in _prediction:
-                        openai_embedding_object = Embedding(
-                            embedding=_prediction["textEmbedding"],
-                            index=idx,
-                            object="embedding",
-                        )
-                        openai_embeddings.append(openai_embedding_object)
-                    elif "imageEmbedding" in _prediction:
-                        openai_embedding_object = Embedding(
-                            embedding=_prediction["imageEmbedding"],
-                            index=idx,
-                            object="embedding",
-                        )
-                        openai_embeddings.append(openai_embedding_object)
-                    elif "videoEmbeddings" in _prediction:
-                        for video_embedding in _prediction["videoEmbeddings"]:
-                            openai_embedding_object = Embedding(
-                                embedding=video_embedding["embedding"],
-                                index=idx,
-                                object="embedding",
-                            )
-                            openai_embeddings.append(openai_embedding_object)
-        return openai_embeddings
+        return vertex_multimodal_embedding_handler.transform_embedding_response(
+            model=model,
+            raw_response=response,
+            model_response=model_response,
+            logging_obj=logging_obj,
+            api_key=api_key,
+            request_data=data,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+        )
litellm/llms/vertex_ai/multimodal_embeddings/transformation.py (new file, 297 lines)
@@ -0,0 +1,297 @@
+from typing import List, Optional, Union, cast
+
+from httpx import Headers, Response
+
+from litellm.exceptions import InternalServerError
+from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.llms.base_llm.embedding.transformation import LiteLLMLoggingObj
+from litellm.types.llms.openai import AllEmbeddingInputValues, AllMessageValues
+from litellm.types.llms.vertex_ai import (
+    Instance,
+    InstanceImage,
+    InstanceVideo,
+    MultimodalPredictions,
+    VertexMultimodalEmbeddingRequest,
+)
+from litellm.types.utils import (
+    Embedding,
+    EmbeddingResponse,
+    PromptTokensDetailsWrapper,
+    Usage,
+)
+from litellm.utils import _count_characters, is_base64_encoded
+
+from ...base_llm.embedding.transformation import BaseEmbeddingConfig
+from ..common_utils import VertexAIError
+
+
+class VertexAIMultimodalEmbeddingConfig(BaseEmbeddingConfig):
+    def get_supported_openai_params(self, model: str) -> list:
+        return ["dimensions"]
+
+    def map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        model: str,
+        drop_params: bool,
+    ) -> dict:
+        for param, value in non_default_params.items():
+            if param == "dimensions":
+                optional_params["outputDimensionality"] = value
+        return optional_params
+
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> dict:
+        default_headers = {
+            "Content-Type": "application/json; charset=utf-8",
+            "Authorization": f"Bearer {api_key}",
+        }
+        headers.update(default_headers)
+        return headers
+
+    def _process_input_element(self, input_element: str) -> Instance:
+        """
+        Process the input element for multimodal embedding requests. Checks whether the input is a GCS URI, a base64 encoded image, or plain text.
+
+        Args:
+            input_element (str): The input element to process.
+
+        Returns:
+            Instance: A dictionary representing the processed input element.
+        """
+        if len(input_element) == 0:
+            return Instance(text=input_element)
+        elif "gs://" in input_element:
+            if "mp4" in input_element:
+                return Instance(video=InstanceVideo(gcsUri=input_element))
+            else:
+                return Instance(image=InstanceImage(gcsUri=input_element))
+        elif is_base64_encoded(s=input_element):
+            return Instance(
+                image=InstanceImage(
+                    bytesBase64Encoded=(
+                        input_element.split(",")[1]
+                        if "," in input_element
+                        else input_element
+                    )
+                )
+            )
+        else:
+            return Instance(text=input_element)
+
+    def process_openai_embedding_input(
+        self, _input: Union[list, str]
+    ) -> List[Instance]:
+        """
+        Process the input for multimodal embedding requests.
+
+        Args:
+            _input (Union[list, str]): The input data to process.
+
+        Returns:
+            List[Instance]: A list of processed Instance objects.
+        """
+        _input_list = [_input] if not isinstance(_input, list) else _input
+        processed_instances = []
+
+        i = 0
+        while i < len(_input_list):
+            current = _input_list[i]
+
+            # Look ahead for potential media elements
+            next_elem = _input_list[i + 1] if i + 1 < len(_input_list) else None
+
+            # If current is a text and next is a GCS URI, or current is a GCS URI
+            if isinstance(current, str):
+                instance_args: Instance = {}
+
+                # Process current element
+                if "gs://" not in current:
+                    instance_args["text"] = current
+                elif "mp4" in current:
+                    instance_args["video"] = InstanceVideo(gcsUri=current)
+                else:
+                    instance_args["image"] = InstanceImage(gcsUri=current)
+
+                # Check next element if it's a GCS URI
+                if next_elem and isinstance(next_elem, str) and "gs://" in next_elem:
+                    if "mp4" in next_elem:
+                        instance_args["video"] = InstanceVideo(gcsUri=next_elem)
+                    else:
+                        instance_args["image"] = InstanceImage(gcsUri=next_elem)
+                    i += 2  # Skip next element since we processed it
+                else:
+                    i += 1  # Move to next element
+
+                processed_instances.append(instance_args)
+                continue
+
+            # Handle dict or other types
+            if isinstance(current, dict):
+                instance = Instance(**current)
+                processed_instances.append(instance)
+            else:
+                raise ValueError(f"Unsupported input type: {type(current)}")
+            i += 1
+
+        return processed_instances
+
+    def transform_embedding_request(
+        self,
+        model: str,
+        input: AllEmbeddingInputValues,
+        optional_params: dict,
+        headers: dict,
+    ) -> dict:
+        optional_params = optional_params or {}
+
+        request_data = VertexMultimodalEmbeddingRequest(instances=[])
+
+        if "instances" in optional_params:
+            request_data["instances"] = optional_params["instances"]
+        elif isinstance(input, list):
+            vertex_instances: List[Instance] = self.process_openai_embedding_input(
+                _input=input
+            )
+            request_data["instances"] = vertex_instances
+        else:
+            # construct instances
+            vertex_request_instance = Instance(**optional_params)
+
+            if isinstance(input, str):
+                vertex_request_instance = self._process_input_element(input)
+
+            request_data["instances"] = [vertex_request_instance]
+
+        return cast(dict, request_data)
+
+    def transform_embedding_response(
+        self,
+        model: str,
+        raw_response: Response,
+        model_response: EmbeddingResponse,
+        logging_obj: LiteLLMLoggingObj,
+        api_key: Optional[str],
+        request_data: dict,
+        optional_params: dict,
+        litellm_params: dict,
+    ) -> EmbeddingResponse:
+        if raw_response.status_code != 200:
+            raise Exception(f"Error: {raw_response.status_code} {raw_response.text}")
+
+        _json_response = raw_response.json()
+        if "predictions" not in _json_response:
+            raise InternalServerError(
+                message=f"embedding response does not contain 'predictions', got {_json_response}",
+                llm_provider="vertex_ai",
+                model=model,
+            )
+        _predictions = _json_response["predictions"]
+        vertex_predictions = MultimodalPredictions(predictions=_predictions)
+        model_response.data = self.transform_embedding_response_to_openai(
+            predictions=vertex_predictions
+        )
+        model_response.model = model
+
+        model_response.usage = self.calculate_usage(
+            request_data=cast(VertexMultimodalEmbeddingRequest, request_data),
+            vertex_predictions=vertex_predictions,
+        )
+
+        return model_response
+
+    def calculate_usage(
+        self,
+        request_data: VertexMultimodalEmbeddingRequest,
+        vertex_predictions: MultimodalPredictions,
+    ) -> Usage:
+        ## Calculate text embeddings usage
+        prompt: Optional[str] = None
+        character_count: Optional[int] = None
+
+        for instance in request_data["instances"]:
+            text = instance.get("text")
+            if text:
+                if prompt is None:
+                    prompt = text
+                else:
+                    prompt += text
+
+        if prompt is not None:
+            character_count = _count_characters(prompt)
+
+        ## Calculate image embeddings usage
+        image_count = 0
+        for instance in request_data["instances"]:
+            if instance.get("image"):
+                image_count += 1
+
+        ## Calculate video embeddings usage
+        video_length_seconds = 0
+        for prediction in vertex_predictions["predictions"]:
+            video_embeddings = prediction.get("videoEmbeddings")
+            if video_embeddings:
+                for embedding in video_embeddings:
+                    duration = embedding["endOffsetSec"] - embedding["startOffsetSec"]
+                    video_length_seconds += duration
+
+        prompt_tokens_details = PromptTokensDetailsWrapper(
+            character_count=character_count,
+            image_count=image_count,
+            video_length_seconds=video_length_seconds,
+        )
+
+        return Usage(
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            prompt_tokens_details=prompt_tokens_details,
+        )
+
+    def transform_embedding_response_to_openai(
+        self, predictions: MultimodalPredictions
+    ) -> List[Embedding]:
+        openai_embeddings: List[Embedding] = []
+        if "predictions" in predictions:
+            for idx, _prediction in enumerate(predictions["predictions"]):
+                if _prediction:
+                    if "textEmbedding" in _prediction:
+                        openai_embedding_object = Embedding(
+                            embedding=_prediction["textEmbedding"],
+                            index=idx,
+                            object="embedding",
+                        )
+                        openai_embeddings.append(openai_embedding_object)
+                    elif "imageEmbedding" in _prediction:
+                        openai_embedding_object = Embedding(
+                            embedding=_prediction["imageEmbedding"],
+                            index=idx,
+                            object="embedding",
+                        )
+                        openai_embeddings.append(openai_embedding_object)
+                    elif "videoEmbeddings" in _prediction:
+                        for video_embedding in _prediction["videoEmbeddings"]:
+                            openai_embedding_object = Embedding(
+                                embedding=video_embedding["embedding"],
+                                index=idx,
+                                object="embedding",
+                            )
+                            openai_embeddings.append(openai_embedding_object)
+        return openai_embeddings
+
+    def get_error_class(
+        self, error_message: str, status_code: int, headers: Union[dict, Headers]
+    ) -> BaseLLMException:
+        return VertexAIError(
+            status_code=status_code, message=error_message, headers=headers
+        )
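The look-ahead pairing in process_openai_embedding_input is the subtle part of the new module: a text element followed by a GCS URI collapses into a single instance. The tests added later in this commit pin this down; as a quick illustration:

```python
from litellm.llms.vertex_ai.multimodal_embeddings.transformation import (
    VertexAIMultimodalEmbeddingConfig,
)

config = VertexAIMultimodalEmbeddingConfig()

# A text element followed by a GCS video URI merges into ONE instance:
# [{"text": "hi", "video": {"gcsUri": "gs://my-bucket/embeddings/supermarket-video.mp4"}}]
print(config.process_openai_embedding_input(
    ["hi", "gs://my-bucket/embeddings/supermarket-video.mp4"]
))

# Two plain strings stay separate instances: [{"text": "hi"}, {"text": "hello"}]
print(config.process_openai_embedding_input(["hi", "hello"]))
```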
@@ -3723,6 +3723,7 @@ def embedding(  # noqa: PLR0915
             encoding=encoding,
             logging_obj=logging,
             optional_params=optional_params,
+            litellm_params=litellm_params_dict,
             model_response=EmbeddingResponse(),
             vertex_project=vertex_ai_project,
             vertex_location=vertex_ai_location,
@@ -5409,6 +5409,23 @@
         "supported_modalities": ["text", "image", "video"],
         "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
     },
+    "multimodalembedding@001": {
+        "max_tokens": 2048,
+        "max_input_tokens": 2048,
+        "output_vector_size": 768,
+        "input_cost_per_character": 0.0000002,
+        "input_cost_per_image": 0.0001,
+        "input_cost_per_video_per_second": 0.0005,
+        "input_cost_per_video_per_second_above_8s_interval": 0.0010,
+        "input_cost_per_video_per_second_above_15s_interval": 0.0020,
+        "input_cost_per_token": 0.0000008,
+        "output_cost_per_token": 0,
+        "litellm_provider": "vertex_ai-embedding-models",
+        "mode": "embedding",
+        "supported_endpoints": ["/v1/embeddings"],
+        "supported_modalities": ["text", "image", "video"],
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
+    },
     "text-embedding-large-exp-03-07": {
         "max_tokens": 8192,
         "max_input_tokens": 8192,
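Together with the generic_cost_per_token changes above, these rates price a mixed request per component; a worked example with invented usage numbers:

```python
# Hypothetical request: 100 text characters, 1 image, and a 7-second video.
character_count = 100
image_count = 1
video_length_seconds = 7

prompt_cost = (
    character_count * 0.0000002        # 0.00002  (input_cost_per_character)
    + image_count * 0.0001             # 0.0001   (input_cost_per_image)
    + video_length_seconds * 0.0005    # 0.0035   (input_cost_per_video_per_second)
)
print(prompt_cost)  # ~0.00362 USD
```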
@@ -1,4 +1,4 @@
-from typing import Any, Dict, cast, get_type_hints
+from typing import Any, Dict, Union, cast, get_type_hints
 
 import litellm
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
@@ -78,16 +78,22 @@ class ResponsesAPIRequestUtils:
 
 class ResponseAPILoggingUtils:
     @staticmethod
-    def _is_response_api_usage(usage: dict) -> bool:
+    def _is_response_api_usage(usage: Union[dict, ResponseAPIUsage]) -> bool:
         """returns True if usage is from OpenAI Response API"""
+        if isinstance(usage, ResponseAPIUsage):
+            return True
         if "input_tokens" in usage and "output_tokens" in usage:
             return True
         return False
 
     @staticmethod
-    def _transform_response_api_usage_to_chat_usage(usage: dict) -> Usage:
+    def _transform_response_api_usage_to_chat_usage(
+        usage: Union[dict, ResponseAPIUsage]
+    ) -> Usage:
         """Transforms the ResponseAPIUsage object to a Usage object"""
-        response_api_usage: ResponseAPIUsage = ResponseAPIUsage(**usage)
+        response_api_usage: ResponseAPIUsage = (
+            ResponseAPIUsage(**usage) if isinstance(usage, dict) else usage
+        )
         prompt_tokens: int = response_api_usage.input_tokens or 0
         completion_tokens: int = response_api_usage.output_tokens or 0
         return Usage(
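Illustratively, both accepted shapes now normalize to the same chat-style Usage (invented numbers, assumed import paths):

```python
from litellm.responses.utils import ResponseAPILoggingUtils  # assumed import path
from litellm.types.llms.openai import ResponseAPIUsage       # assumed import path

# A plain dict and a typed ResponseAPIUsage are treated identically now.
usage = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
    {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}
)
assert usage.prompt_tokens == 12 and usage.completion_tokens == 3

typed = ResponseAPIUsage(input_tokens=12, output_tokens=3, total_tokens=15)
assert ResponseAPILoggingUtils._is_response_api_usage(typed)
```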
@@ -191,6 +191,10 @@ class ContentBlockDeltaEvent(TypedDict, total=False):
     reasoningContent: BedrockConverseReasoningContentBlockDelta
 
 
+class PerformanceConfigBlock(TypedDict):
+    latency: Literal["optimized", "throughput"]
+
+
 class CommonRequestObject(
     TypedDict, total=False
 ):  # common request object across sync + async flows
@@ -200,6 +204,7 @@ class CommonRequestObject(
     system: List[SystemContentBlock]
     toolConfig: ToolConfigBlock
     guardrailConfig: Optional[GuardrailConfigBlock]
+    performanceConfig: Optional[PerformanceConfigBlock]
 
 
 class RequestObject(CommonRequestObject, total=False):
@@ -429,7 +434,9 @@ class AmazonNovaCanvasColorGuidedGenerationParams(TypedDict, total=False):
     negativeText: str
 
 
-class AmazonNovaCanvasColorGuidedRequest(AmazonNovaCanvasRequestBase, TypedDict, total=False):
+class AmazonNovaCanvasColorGuidedRequest(
+    AmazonNovaCanvasRequestBase, TypedDict, total=False
+):
     """
     Request for Amazon Nova Canvas Color Guided Generation API
 
@@ -369,9 +369,15 @@ class ResponseTuningJob(TypedDict):
     updateTime: Optional[str]
 
 
+class VideoSegmentConfig(TypedDict, total=False):
+    startOffsetSec: int
+    endOffsetSec: int
+    intervalSec: int
+
+
 class InstanceVideo(TypedDict, total=False):
     gcsUri: str
-    videoSegmentConfig: Tuple[float, float, float]
+    videoSegmentConfig: VideoSegmentConfig
 
 
 class InstanceImage(TypedDict, total=False):
@@ -386,7 +392,7 @@ class Instance(TypedDict, total=False):
     video: InstanceVideo
 
 
-class VertexMultimodalEmbeddingRequest(TypedDict, total=False):
+class VertexMultimodalEmbeddingRequest(TypedDict):
     instances: List[Instance]
 
 
@@ -402,7 +408,7 @@ class MultimodalPrediction(TypedDict, total=False):
     videoEmbeddings: List[VideoEmbedding]
 
 
-class MultimodalPredictions(TypedDict, total=False):
+class MultimodalPredictions(TypedDict):
     predictions: List[MultimodalPrediction]
 
 
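These typed offsets are what calculate_usage in the new transformation module sums (endOffsetSec minus startOffsetSec per returned video embedding); a tiny sketch with invented offsets:

```python
# Invented offsets: two returned video embeddings covering 0-8s and 8-15s.
video_embeddings = [
    {"startOffsetSec": 0, "endOffsetSec": 8},
    {"startOffsetSec": 8, "endOffsetSec": 15},
]
video_length_seconds = sum(
    e["endOffsetSec"] - e["startOffsetSec"] for e in video_embeddings
)
print(video_length_seconds)  # 15
```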
@@ -779,6 +779,24 @@ class PromptTokensDetailsWrapper(
     image_tokens: Optional[int] = None
     """Image tokens sent to the model."""
 
+    character_count: Optional[int] = None
+    """Character count sent to the model. Used for Vertex AI multimodal embeddings."""
+
+    image_count: Optional[int] = None
+    """Number of images sent to the model. Used for Vertex AI multimodal embeddings."""
+
+    video_length_seconds: Optional[float] = None
+    """Length of videos sent to the model. Used for Vertex AI multimodal embeddings."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.character_count is None:
+            del self.character_count
+        if self.image_count is None:
+            del self.image_count
+        if self.video_length_seconds is None:
+            del self.video_length_seconds
+
 
 class Usage(CompletionUsage):
     _cache_creation_input_tokens: int = PrivateAttr(
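End to end, the multimodal embedding path reports zero tokens and carries the billable quantities in this wrapper; a sketch of the resulting object with invented values:

```python
from litellm.types.utils import PromptTokensDetailsWrapper, Usage

# Invented values for a request with "hi" (2 chars), one image, and a 7s video.
usage = Usage(
    prompt_tokens=0,
    completion_tokens=0,
    total_tokens=0,
    prompt_tokens_details=PromptTokensDetailsWrapper(
        character_count=2,
        image_count=1,
        video_length_seconds=7,
    ),
)
# generic_cost_per_token later reads these fields back via getattr() to price
# the character, image, and video components separately.
```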
@@ -5409,6 +5409,23 @@
         "supported_modalities": ["text", "image", "video"],
         "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
     },
+    "multimodalembedding@001": {
+        "max_tokens": 2048,
+        "max_input_tokens": 2048,
+        "output_vector_size": 768,
+        "input_cost_per_character": 0.0000002,
+        "input_cost_per_image": 0.0001,
+        "input_cost_per_video_per_second": 0.0005,
+        "input_cost_per_video_per_second_above_8s_interval": 0.0010,
+        "input_cost_per_video_per_second_above_15s_interval": 0.0020,
+        "input_cost_per_token": 0.0000008,
+        "output_cost_per_token": 0,
+        "litellm_provider": "vertex_ai-embedding-models",
+        "mode": "embedding",
+        "supported_endpoints": ["/v1/embeddings"],
+        "supported_modalities": ["text", "image", "video"],
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
+    },
     "text-embedding-large-exp-03-07": {
         "max_tokens": 8192,
         "max_input_tokens": 8192,
@@ -1,43 +0,0 @@
-import json
-import os
-import sys
-from unittest.mock import AsyncMock, MagicMock, patch
-
-import httpx
-import pytest
-
-sys.path.insert(
-    0, os.path.abspath("../../../../..")
-)  # Adds the parent directory to the system path
-
-from litellm.llms.vertex_ai.multimodal_embeddings.embedding_handler import (
-    VertexMultimodalEmbedding,
-)
-from litellm.types.llms.vertex_ai import Instance, InstanceImage
-
-
-class TestVertexMultimodalEmbedding:
-    def setup_method(self):
-        self.embedding_handler = VertexMultimodalEmbedding()
-
-    def test_process_openai_embedding_input(self):
-        input_data = [
-            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=",
-            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=",
-        ]
-        expected_output = [
-            Instance(
-                image=InstanceImage(bytesBase64Encoded=input_data[0].split(",")[1])
-            ),
-            Instance(
-                image=InstanceImage(bytesBase64Encoded=input_data[1].split(",")[1])
-            ),
-        ]
-        assert (
-            self.embedding_handler._process_input_element(input_data[0])
-            == expected_output[0]
-        )
-        assert (
-            self.embedding_handler._process_input_element(input_data[1])
-            == expected_output[1]
-        )
@@ -0,0 +1,78 @@
+import json
+import os
+import sys
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import httpx
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../../../../..")
+)  # Adds the parent directory to the system path
+
+from litellm.llms.vertex_ai.multimodal_embeddings.transformation import (
+    VertexAIMultimodalEmbeddingConfig,
+)
+from litellm.types.llms.vertex_ai import Instance, InstanceImage, InstanceVideo
+
+
+class TestVertexMultimodalEmbedding:
+    def setup_method(self):
+        self.config = VertexAIMultimodalEmbeddingConfig()
+
+    def test_process_openai_embedding_input(self):
+        input_data = [
+            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=",
+            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=",
+        ]
+        expected_output = [
+            Instance(
+                image=InstanceImage(bytesBase64Encoded=input_data[0].split(",")[1])
+            ),
+            Instance(
+                image=InstanceImage(bytesBase64Encoded=input_data[1].split(",")[1])
+            ),
+        ]
+        assert self.config._process_input_element(input_data[0]) == expected_output[0]
+        assert self.config._process_input_element(input_data[1]) == expected_output[1]
+
+    def test_process_str_and_video_input(self):
+        input_data = ["hi", "gs://my-bucket/embeddings/supermarket-video.mp4"]
+        expected_output = [
+            Instance(
+                text="hi",
+                video=InstanceVideo(
+                    gcsUri="gs://my-bucket/embeddings/supermarket-video.mp4"
+                ),
+            ),
+        ]
+        assert self.config.process_openai_embedding_input(input_data) == expected_output
+
+    def test_process_list_of_str_and_str_input(self):
+        input_data = ["hi", "hello"]
+        expected_output = [
+            Instance(text="hi"),
+            Instance(text="hello"),
+        ]
+        assert self.config.process_openai_embedding_input(input_data) == expected_output
+
+    def test_process_list_of_str_and_video_input(self):
+        input_data = [
+            "hi",
+            "hello",
+            "gs://my-bucket/embeddings/supermarket-video.mp4",
+            "hey",
+        ]
+        expected_output = [
+            Instance(text="hi"),
+            Instance(
+                text="hello",
+                video=InstanceVideo(
+                    gcsUri="gs://my-bucket/embeddings/supermarket-video.mp4"
+                ),
+            ),
+            Instance(text="hey"),
+        ]
+        assert (
+            self.config.process_openai_embedding_input(input_data) == expected_output
+        ), f"Expected {expected_output}, but got {self.config.process_openai_embedding_input(input_data)}"
@@ -239,3 +239,23 @@ async def test_url_with_format_param_openai(model, sync_mode):
     json_str = json.dumps(mock_client.call_args.kwargs)
 
     assert "format" not in json_str
+
+
+def test_bedrock_latency_optimized_inference():
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+    with patch.object(client, "post") as mock_post:
+        try:
+            response = litellm.completion(
+                model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
+                messages=[{"role": "user", "content": "Hello, how are you?"}],
+                performanceConfig={"latency": "optimized"},
+                client=client,
+            )
+        except Exception as e:
+            print(e)
+
+        mock_post.assert_called_once()
+        json_data = json.loads(mock_post.call_args.kwargs["data"])
+        assert json_data["performanceConfig"]["latency"] == "optimized"
@@ -416,6 +416,7 @@ def validate_web_search_annotations(annotations: ChatCompletionAnnotation):
         validate_response_url_citation(url_citation)
 
 
+@pytest.mark.flaky(reruns=3)
 def test_openai_web_search():
     """Makes a simple web search request and validates the response contains web search annotations and all expected fields are present"""
     litellm._turn_on_debug()
@@ -2197,6 +2197,55 @@ def test_vertexai_embedding_embedding_latest():
         pytest.fail(f"Error occurred: {e}")
 
 
+def test_vertexai_multimodalembedding_embedding_latest():
+    try:
+        import requests, base64
+
+        load_vertex_ai_credentials()
+        litellm._turn_on_debug()
+
+        response = embedding(
+            model="vertex_ai/multimodalembedding@001",
+            input=["hi"],
+            dimensions=1,
+            auto_truncate=True,
+            task_type="RETRIEVAL_QUERY",
+        )
+
+        print(f"response.usage: {response.usage}")
+        assert response.usage is not None
+        assert response.usage.prompt_tokens_details is not None
+
+        assert response._hidden_params["response_cost"] > 0
+        print("response:", response)
+    except litellm.RateLimitError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+def test_vertexai_embedding_embedding_latest():
+    try:
+        load_vertex_ai_credentials()
+        litellm.set_verbose = True
+
+        response = embedding(
+            model="vertex_ai/text-embedding-004",
+            input=["hi"],
+            dimensions=1,
+            auto_truncate=True,
+            task_type="RETRIEVAL_QUERY",
+        )
+
+        assert len(response.data[0]["embedding"]) == 1
+        assert response.usage.prompt_tokens > 0
+        print("response:", response)
+    except litellm.RateLimitError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 @pytest.mark.skip(reason="need to get gecko permissions on vertex ai to run this test")
 @pytest.mark.flaky(retries=3, delay=1)
 def test_vertexai_embedding_embedding_latest_input_type():