forked from phoenix/litellm-mirror
Merge pull request #4266 from BerriAI/litellm_gemini_image_url
Support 'image url' to vertex ai / google ai studio gemini models
Commit 0c2c02ba8d
9 changed files with 140 additions and 143 deletions

@@ -242,7 +242,7 @@ class Logging:
                     extra={"api_base": {api_base}, **masked_headers},
                 )
             else:
-                verbose_logger.debug(f"\033[92m{curl_command}\033[0m\n")
+                print_verbose(f"\033[92m{curl_command}\033[0m\n", log_level="DEBUG")
             # log raw request to provider (like LangFuse) -- if opted in.
             if log_raw_request_response is True:
                 try:
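
The hunk above routes the raw-request curl echo through `print_verbose(..., log_level="DEBUG")` instead of calling `verbose_logger.debug` directly. A minimal sketch of that pattern, assuming a `print_verbose` helper that forwards to a stdlib logger (the helper body is illustrative, not litellm's actual implementation):

```python
import logging

verbose_logger = logging.getLogger("sketch")


def print_verbose(message: str, log_level: str = "INFO") -> None:
    # Forward verbose output through the logger at the requested level,
    # rather than hard-coding a .debug() call at the call site.
    level = logging.getLevelName(log_level)  # "DEBUG" -> 10
    verbose_logger.log(level if isinstance(level, int) else logging.INFO, message)


curl_command = "curl -X POST https://example.invalid/v1/chat/completions"
print_verbose(f"\033[92m{curl_command}\033[0m\n", log_level="DEBUG")
```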

@@ -1,24 +1,30 @@
+import json
+import re
+import traceback
+import uuid
+import xml.etree.ElementTree as ET
 from enum import Enum
-import requests, traceback
-import json, re, xml.etree.ElementTree as ET
-from jinja2 import Template, exceptions, meta, BaseLoader
-from jinja2.sandbox import ImmutableSandboxedEnvironment
 from typing import Any, List, Mapping, MutableMapping, Optional, Sequence, Tuple
+
+import requests
+from jinja2 import BaseLoader, Template, exceptions, meta
+from jinja2.sandbox import ImmutableSandboxedEnvironment
+
 import litellm
 import litellm.types
-from litellm.types.completion import (
-    ChatCompletionUserMessageParam,
-    ChatCompletionSystemMessageParam,
-    ChatCompletionMessageParam,
-    ChatCompletionFunctionMessageParam,
-    ChatCompletionMessageToolCallParam,
-    ChatCompletionToolMessageParam,
-)
 import litellm.types.llms
-from litellm.types.llms.anthropic import *
-import uuid
-from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
 import litellm.types.llms.vertex_ai
+from litellm.types.completion import (
+    ChatCompletionFunctionMessageParam,
+    ChatCompletionMessageParam,
+    ChatCompletionMessageToolCallParam,
+    ChatCompletionSystemMessageParam,
+    ChatCompletionToolMessageParam,
+    ChatCompletionUserMessageParam,
+)
+from litellm.types.llms.anthropic import *
+from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
+from litellm.types.utils import GenericImageParsingChunk


 def default_pt(messages):

@@ -622,9 +628,10 @@ def construct_tool_use_system_prompt(


 def convert_url_to_base64(url):
-    import requests
     import base64

+    import requests
+
     for _ in range(3):
         try:
             response = requests.get(url)
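
This hunk only reorders imports at the top of `convert_url_to_base64`; the function's job (per the error message in the next hunk) is to fetch an image URL and base64-encode it, retrying up to three times. A self-contained sketch of that pattern — the body below is an illustration, not the file's exact code:

```python
import base64

import requests


def convert_url_to_base64_sketch(url: str) -> str:
    """Fetch `url` and return the response body as a base64 string."""
    for _ in range(3):  # up to three attempts, as in the hunk
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return base64.b64encode(response.content).decode("utf-8")
        except requests.RequestException:
            continue
    raise Exception(f"Error: Unable to fetch image from URL. url={url}")
```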

@@ -654,7 +661,7 @@ def convert_url_to_base64(url):
     raise Exception(f"Error: Unable to fetch image from URL. url={url}")


-def convert_to_anthropic_image_obj(openai_image_url: str):
+def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
     """
     Input:
     "image_url": "data:image/jpeg;base64,{base64_image}",

@@ -675,11 +682,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str):
         # Infer image format from the URL
         image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]

-        return {
-            "type": "base64",
-            "media_type": f"image/{image_format}",
-            "data": base64_data,
-        }
+        return GenericImageParsingChunk(
+            type="base64",
+            media_type=f"image/{image_format}",
+            data=base64_data,
+        )
     except Exception as e:
         if "Error: Unable to fetch image from URL" in str(e):
             raise e
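
With the signature change above, callers of `convert_to_anthropic_image_obj` get a `GenericImageParsingChunk` (the TypedDict added in `litellm.types.utils`, see the hunk near the end) rather than an anonymous dict. A sketch of the data-URL path it implements:

```python
import base64

# Build a syntactically valid data URL (the payload content doesn't matter here).
base64_data = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("utf-8")
openai_image_url = f"data:image/png;base64,{base64_data}"

# Mirrors the hunk: infer the format, keep the base64 payload.
image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]
payload = openai_image_url.split(";base64,")[1]

chunk = {
    "type": "base64",
    "media_type": f"image/{image_format}",
    "data": payload,
}
assert chunk["media_type"] == "image/png"
```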

@@ -1606,19 +1613,23 @@ def azure_text_pt(messages: list):

 ###### AMAZON BEDROCK #######

+from litellm.types.llms.bedrock import ContentBlock as BedrockContentBlock
+from litellm.types.llms.bedrock import ImageBlock as BedrockImageBlock
+from litellm.types.llms.bedrock import ImageSourceBlock as BedrockImageSourceBlock
+from litellm.types.llms.bedrock import ToolBlock as BedrockToolBlock
 from litellm.types.llms.bedrock import (
-    ToolResultContentBlock as BedrockToolResultContentBlock,
-    ToolResultBlock as BedrockToolResultBlock,
-    ToolConfigBlock as BedrockToolConfigBlock,
-    ToolUseBlock as BedrockToolUseBlock,
-    ImageSourceBlock as BedrockImageSourceBlock,
-    ImageBlock as BedrockImageBlock,
-    ContentBlock as BedrockContentBlock,
-    ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
-    ToolSpecBlock as BedrockToolSpecBlock,
-    ToolBlock as BedrockToolBlock,
     ToolChoiceValuesBlock as BedrockToolChoiceValuesBlock,
 )
+from litellm.types.llms.bedrock import ToolConfigBlock as BedrockToolConfigBlock
+from litellm.types.llms.bedrock import (
+    ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
+)
+from litellm.types.llms.bedrock import ToolResultBlock as BedrockToolResultBlock
+from litellm.types.llms.bedrock import (
+    ToolResultContentBlock as BedrockToolResultContentBlock,
+)
+from litellm.types.llms.bedrock import ToolSpecBlock as BedrockToolSpecBlock
+from litellm.types.llms.bedrock import ToolUseBlock as BedrockToolUseBlock


 def get_image_details(image_url) -> Tuple[str, str]:

@@ -1655,7 +1666,8 @@ def get_image_details(image_url) -> Tuple[str, str]:
 def _process_bedrock_converse_image_block(image_url: str) -> BedrockImageBlock:
     if "base64" in image_url:
         # Case 1: Images with base64 encoding
-        import base64, re
+        import base64
+        import re

         # base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
         image_metadata, img_without_base_64 = image_url.split(",")
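
`_process_bedrock_converse_image_block` splits the same `data:<mime>;base64,<payload>` shape; a short sketch of the metadata parse (the regex matches the one used in the Gemini image branch removed further down):

```python
import base64
import re

image_url = "data:image/jpeg;base64," + base64.b64encode(b"fake-bytes").decode()

# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
image_metadata, img_without_base_64 = image_url.split(",")

# Extract the MIME type from the metadata half; default to JPEG.
mime_type_match = re.match(r"data:(.*?);base64", image_metadata)
mime_type = mime_type_match.group(1) if mime_type_match else "image/jpeg"

decoded_img = base64.b64decode(img_without_base_64)
assert mime_type == "image/jpeg" and decoded_img == b"fake-bytes"
```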

@@ -1,18 +1,22 @@
-import os, types
+import inspect
 import json
-from enum import Enum
-import requests  # type: ignore
+import os
 import time
-from typing import Callable, Optional, Union, List, Literal, Any
+import types
+import uuid
+from enum import Enum
+from typing import Any, Callable, List, Literal, Optional, Union
+
+import httpx  # type: ignore
+import requests  # type: ignore
 from pydantic import BaseModel
-from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
+
+import litellm
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
-import litellm, uuid
-import httpx, inspect  # type: ignore
-from litellm.types.llms.vertex_ai import *
 from litellm.llms.prompt_templates.factory import (
-    convert_to_gemini_tool_call_result,
+    convert_to_anthropic_image_obj,
     convert_to_gemini_tool_call_invoke,
+    convert_to_gemini_tool_call_result,
 )
 from litellm.types.files import (
     get_file_mime_type_for_file_type,

@@ -20,6 +24,8 @@ from litellm.types.files import (
     is_gemini_1_5_accepted_file_type,
     is_video_file_type,
 )
+from litellm.types.llms.vertex_ai import *
+from litellm.utils import CustomStreamWrapper, ModelResponse, Usage


 class VertexAIError(Exception):

@@ -274,28 +280,6 @@ def _get_image_bytes_from_url(image_url: str) -> bytes:
         raise Exception(f"An exception occurs with this image - {str(e)}")


-def _load_image_from_url(image_url: str):
-    """
-    Loads an image from a URL.
-
-    Args:
-        image_url (str): The URL of the image.
-
-    Returns:
-        Image: The loaded image.
-    """
-    from vertexai.preview.generative_models import (
-        GenerativeModel,
-        Part,
-        GenerationConfig,
-        Image,
-    )
-
-    image_bytes = _get_image_bytes_from_url(image_url)
-
-    return Image.from_bytes(data=image_bytes)
-
-
 def _convert_gemini_role(role: str) -> Literal["user", "model"]:
     if role == "user":
         return "user"

@@ -323,28 +307,9 @@ def _process_gemini_image(image_url: str) -> PartType:
             return PartType(file_data=file_data)

         # Direct links
-        elif "https:/" in image_url:
-            image = _load_image_from_url(image_url)
-            _blob = BlobType(data=image.data, mime_type=image._mime_type)
-            return PartType(inline_data=_blob)
-
-        # Base64 encoding
-        elif "base64" in image_url:
-            import base64, re
-
-            # base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
-            image_metadata, img_without_base_64 = image_url.split(",")
-
-            # read mime_type from img_without_base_64=data:image/jpeg;base64
-            # Extract MIME type using regular expression
-            mime_type_match = re.match(r"data:(.*?);base64", image_metadata)
-
-            if mime_type_match:
-                mime_type = mime_type_match.group(1)
-            else:
-                mime_type = "image/jpeg"
-            decoded_img = base64.b64decode(img_without_base_64)
-            _blob = BlobType(data=decoded_img, mime_type=mime_type)
+        elif "https:/" in image_url or "base64" in image_url:
+            image = convert_to_anthropic_image_obj(image_url)
+            _blob = BlobType(data=image["data"], mime_type=image["media_type"])
             return PartType(inline_data=_blob)
         raise Exception("Invalid image received - {}".format(image_url))
     except Exception as e:
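
The rewrite above collapses the separate https and base64 branches: both now go through `convert_to_anthropic_image_obj` and are repackaged as a Gemini `inline_data` part. A standalone sketch of that flow, with local TypedDicts standing in for litellm's `BlobType`/`PartType`:

```python
from typing import TypedDict


class BlobType(TypedDict):
    mime_type: str
    data: str  # base64 string, per the BlobType hunk further down


class PartType(TypedDict, total=False):
    inline_data: BlobType


def to_gemini_part_sketch(image: dict) -> PartType:
    # `image` is shaped like the GenericImageParsingChunk returned by
    # convert_to_anthropic_image_obj.
    _blob = BlobType(data=image["data"], mime_type=image["media_type"])
    return PartType(inline_data=_blob)


part = to_gemini_part_sketch(
    {"type": "base64", "media_type": "image/png", "data": "aGVsbG8="}
)
assert part["inline_data"]["mime_type"] == "image/png"
```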

@@ -480,23 +445,25 @@ def completion(
             message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
         )
     try:
+        import google.auth  # type: ignore
+        import proto  # type: ignore
+        from google.cloud import aiplatform  # type: ignore
+        from google.cloud.aiplatform_v1beta1.types import (
+            content as gapic_content_types,  # type: ignore
+        )
+        from google.protobuf import json_format  # type: ignore
+        from google.protobuf.struct_pb2 import Value  # type: ignore
+        from vertexai.language_models import CodeGenerationModel, TextGenerationModel
+        from vertexai.preview.generative_models import (
+            GenerationConfig,
+            GenerativeModel,
+            Part,
+        )
         from vertexai.preview.language_models import (
             ChatModel,
             CodeChatModel,
             InputOutputTextPair,
         )
-        from vertexai.language_models import TextGenerationModel, CodeGenerationModel
-        from vertexai.preview.generative_models import (
-            GenerativeModel,
-            Part,
-            GenerationConfig,
-        )
-        from google.cloud import aiplatform  # type: ignore
-        from google.protobuf import json_format  # type: ignore
-        from google.protobuf.struct_pb2 import Value  # type: ignore
-        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
-        import google.auth  # type: ignore
-        import proto  # type: ignore

         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         print_verbose(

@@ -1412,8 +1379,8 @@ def embedding(
             message="vertexai import failed please run `pip install google-cloud-aiplatform`",
         )

-    from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
     import google.auth  # type: ignore
+    from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

     ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
     try:

@@ -21,6 +21,7 @@ import litellm.litellm_core_utils.litellm_logging
 from litellm import verbose_logger
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.llms.prompt_templates.factory import convert_url_to_base64
 from litellm.llms.vertex_ai import _gemini_convert_messages_with_history
 from litellm.types.llms.openai import (
     ChatCompletionResponseMessage,
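
This one-line import is the crux of the PR title: the Google AI Studio / Vertex httpx path can now turn a plain `https://` image URL into base64 before sending it to Gemini. The helper below is an illustrative stand-in for that conversion, not litellm's code:

```python
import base64

import requests


def fetch_image_as_base64_sketch(image_url: str) -> dict:
    """Illustrative: download an image URL and inline it for Gemini."""
    response = requests.get(image_url)
    response.raise_for_status()
    mime_type = response.headers.get("content-type", "image/jpeg")
    data = base64.b64encode(response.content).decode("utf-8")
    # Shaped like the inline_data part Gemini accepts.
    return {"inline_data": {"mime_type": mime_type, "data": data}}
```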

@@ -568,8 +568,6 @@ async def test_gemini_pro_vision(provider, sync_mode):
         # DO Not DELETE this ASSERT
         # Google counts the prompt tokens for us, we should ensure we use the tokens from the orignal response
         assert prompt_tokens == 263  # the gemini api returns 263 to us
-
-        # assert False
     except litellm.RateLimitError as e:
         pass
     except Exception as e:

@@ -1152,38 +1150,44 @@ async def test_vertexai_aembedding():
 #         # raise e
 # test_gemini_pro_vision_stream()

-# def test_gemini_pro_vision_async():
-#     try:
-#         litellm.set_verbose = True
-#         litellm.num_retries=0
-#         async def test():
-#             resp = await litellm.acompletion(
-#                 model = "vertex_ai/gemini-pro-vision",
-#                 messages=[
-#                     {
-#                         "role": "user",
-#                         "content": [
-#                             {
-#                                 "type": "text",
-#                                 "text": "Whats in this image?"
-#                             },
-#                             {
-#                                 "type": "image_url",
-#                                 "image_url": {
-#                                     "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-#                                 }
-#                             }
-#                         ]
-#                     }
-#                 ],
-#             )
-#             print("async response gemini pro vision")
-#             print(resp)
-#         asyncio.run(test())
-#     except Exception as e:
-#         import traceback
-#         traceback.print_exc()
-#         raise e
+
+def test_gemini_pro_vision_async():
+    try:
+        litellm.set_verbose = True
+        litellm.num_retries = 0
+
+        async def test():
+            load_vertex_ai_credentials()
+            resp = await litellm.acompletion(
+                model="vertex_ai/gemini-pro-vision",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": "Whats in this image?"},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                                },
+                            },
+                        ],
+                    }
+                ],
+            )
+            print("async response gemini pro vision")
+            print(resp)
+
+        asyncio.run(test())
+    except litellm.RateLimitError:
+        pass
+    except Exception as e:
+        import traceback
+
+        traceback.print_exc()
+        raise e
+

 # test_gemini_pro_vision_async()

@@ -694,8 +694,10 @@ def test_completion_claude_3_base64():
         pytest.fail(f"An exception occurred - {str(e)}")


-@pytest.mark.skip(reason="issue getting wikipedia images in ci/cd")
-def test_completion_claude_3_function_plus_image():
+@pytest.mark.parametrize(
+    "model", ["gemini/gemini-1.5-flash"]  # "claude-3-sonnet-20240229",
+)
+def test_completion_function_plus_image(model):
     litellm.set_verbose = True

     image_content = [

@@ -703,7 +705,7 @@ def test_completion_claude_3_function_plus_image():
         {
             "type": "image_url",
             "image_url": {
-                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
             },
         },
     ]

@@ -719,7 +721,7 @@ def test_completion_claude_3_function_plus_image():
                 "type": "object",
                 "properties": {
                     "location": {
-                        "type": "text",
+                        "type": "string",
                         "description": "The city and state, e.g. San Francisco, CA",
                     },
                     "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},

@@ -739,7 +741,7 @@ def test_completion_claude_3_function_plus_image():
     ]

     response = completion(
-        model="claude-3-sonnet-20240229",
+        model=model,
         messages=[image_message],
         tool_choice=tool_choice,
         tools=tools,

@@ -39,7 +39,7 @@ class FileDataType(TypedDict):

 class BlobType(TypedDict):
     mime_type: Required[str]
-    data: Required[bytes]
+    data: Required[str]


 class PartType(TypedDict, total=False):
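
`BlobType.data` flips from raw `bytes` to a base64 `str`, matching what `convert_to_anthropic_image_obj` now feeds it. In miniature:

```python
import base64

raw = b"\x89PNG\r\n\x1a\n"
data_before = raw                                   # bytes (old field type)
data_after = base64.b64encode(raw).decode("utf-8")  # base64 str (new field type)
assert isinstance(data_after, str)
```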

@@ -971,3 +971,14 @@ class TranscriptionResponse(OpenAIObject):
         except:
             # if using pydantic v1
             return self.dict()
+
+
+class GenericImageParsingChunk(TypedDict):
+    # {
+    #     "type": "base64",
+    #     "media_type": f"image/{image_format}",
+    #     "data": base64_data,
+    # }
+    type: str
+    media_type: str
+    data: str
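
Since `GenericImageParsingChunk` is a `TypedDict`, keyword construction (as used in the factory hunk above) produces an ordinary dict at runtime:

```python
from typing import TypedDict


class GenericImageParsingChunk(TypedDict):
    type: str
    media_type: str
    data: str


chunk = GenericImageParsingChunk(type="base64", media_type="image/jpeg", data="abc=")
assert chunk == {"type": "base64", "media_type": "image/jpeg", "data": "abc="}
```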

@@ -2647,7 +2647,7 @@ def get_optional_params(
         if presence_penalty is not None:
             optional_params["presencePenalty"] = {"scale": presence_penalty}
     elif (
-        custom_llm_provider == "palm" or custom_llm_provider == "gemini"
+        custom_llm_provider == "palm"
     ):  # https://developers.generativeai.google/tutorials/curl_quickstart
         ## check if unsupported param passed in
         supported_params = get_supported_openai_params(

@@ -2694,7 +2694,7 @@ def get_optional_params(
         print_verbose(
             f"(end) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK - optional_params: {optional_params}"
         )
-    elif custom_llm_provider == "vertex_ai_beta":
+    elif custom_llm_provider == "vertex_ai_beta" or custom_llm_provider == "gemini":
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider=custom_llm_provider
         )

@@ -3726,7 +3726,7 @@ def get_supported_openai_params(
         elif request_type == "embeddings":
             return litellm.DatabricksEmbeddingConfig().get_supported_openai_params()
     elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
-        return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
+        return litellm.VertexAIConfig().get_supported_openai_params()
     elif custom_llm_provider == "vertex_ai":
         if request_type == "chat_completion":
             return litellm.VertexAIConfig().get_supported_openai_params()
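
Net effect of these last three hunks: `gemini` leaves the hardcoded `palm` parameter list behind and shares the Vertex AI mapping (`VertexAIConfig` / the `vertex_ai_beta` branch). A simplified sketch of the dispatch, with a stand-in parameter list:

```python
def get_supported_openai_params_sketch(custom_llm_provider: str) -> list:
    # Stand-in for litellm.VertexAIConfig().get_supported_openai_params();
    # the real list is defined by the config class, not hardcoded here.
    vertex_params = ["temperature", "top_p", "max_tokens", "stream", "tools", "stop"]
    if custom_llm_provider in ("palm", "gemini"):
        # previously: ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
        return vertex_params
    if custom_llm_provider in ("vertex_ai", "vertex_ai_beta"):
        return vertex_params
    raise ValueError(f"unsupported provider: {custom_llm_provider}")


assert "tools" in get_supported_openai_params_sketch("gemini")
```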