From 160357d54cbdb7050a536f8c18f32315ad42e155 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Sat, 16 Nov 2024 20:05:24 -0800
Subject: [PATCH] (fix) Azure AI Studio - using `image_url` in content with
 both text and image_url  (#6774)

* use helper _audio_or_image_in_message_content

* update azure ai transformation

* test_azure_ai_with_image_url
---
 litellm/llms/azure_ai/chat/transformation.py  | 20 +++-
 litellm/llms/prompt_templates/common_utils.py | 14 ++-
 tests/llm_translation/test_azure_ai.py        | 97 +++++++++++++++++++
 3 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/litellm/llms/azure_ai/chat/transformation.py b/litellm/llms/azure_ai/chat/transformation.py
index 9767282fb..d8924fbb9 100644
--- a/litellm/llms/azure_ai/chat/transformation.py
+++ b/litellm/llms/azure_ai/chat/transformation.py
@@ -3,7 +3,10 @@ from typing import List, Optional, Tuple
 import litellm
 from litellm._logging import verbose_logger
 from litellm.llms.OpenAI.openai import OpenAIConfig
-from litellm.llms.prompt_templates.common_utils import convert_content_list_to_str
+from litellm.llms.prompt_templates.common_utils import (
+    _audio_or_image_in_message_content,
+    convert_content_list_to_str,
+)
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.utils import ProviderField
@@ -27,8 +30,21 @@ class AzureAIStudioConfig(OpenAIConfig):
             ),
         ]
 
-    def _transform_messages(self, messages: List[AllMessageValues]) -> List:
+    def _transform_messages(
+        self,
+        messages: List[AllMessageValues],
+    ) -> List:
+        """
+        - Azure AI Studio doesn't support content as a list. This handles:
+            1. Transforms list content to a string.
+            2. If message contains an image or audio, send as is (user-intended)
+        """
         for message in messages:
+
+            # Do nothing if the message contains an image or audio
+            if _audio_or_image_in_message_content(message):
+                continue
+
             texts = convert_content_list_to_str(message=message)
             if texts:
                 message["content"] = texts
diff --git a/litellm/llms/prompt_templates/common_utils.py b/litellm/llms/prompt_templates/common_utils.py
index a91ec2170..24cb7b451 100644
--- a/litellm/llms/prompt_templates/common_utils.py
+++ b/litellm/llms/prompt_templates/common_utils.py
@@ -41,7 +41,6 @@ def convert_content_list_to_str(message: AllMessageValues) -> str:
     """
     - handles scenario where content is list and not string
     - content list is just text, and no images
-    - if image passed in, then just return as is (user-intended)
 
     Motivation: mistral api + azure ai don't support content as a list
     """
@@ -59,6 +58,19 @@ def convert_content_list_to_str(message: AllMessageValues) -> str:
     return texts
 
 
+def _audio_or_image_in_message_content(message: AllMessageValues) -> bool:
+    """
+    Return True when list-form content holds an `image_url` or `input_audio` part.
+    """
+    message_content = message.get("content")
+    if not isinstance(message_content, list):  # str / None content has no media parts
+        return False
+    return any(
+        isinstance(c, dict) and c.get("type") in ("image_url", "input_audio")
+        for c in message_content
+    )
+
+
 def convert_openai_message_to_only_content_messages(
     messages: List[AllMessageValues],
 ) -> List[Dict[str, str]]:
diff --git a/tests/llm_translation/test_azure_ai.py b/tests/llm_translation/test_azure_ai.py
index 78e719c52..944e20148 100644
--- a/tests/llm_translation/test_azure_ai.py
+++ b/tests/llm_translation/test_azure_ai.py
@@ -11,6 +11,9 @@ from dotenv import load_dotenv
 import litellm.types
 import litellm.types.utils
 from litellm.llms.anthropic.chat import ModelResponseIterator
+import httpx
+import json
+from respx import MockRouter
 
 load_dotenv()
 import io
@@ -39,3 +42,97 @@ def test_map_azure_model_group(model_group_header, expected_model):
 
     config = AzureAICohereConfig()
     assert config._map_azure_model_group(model_group_header) == expected_model
+
+
+@pytest.mark.asyncio
+@pytest.mark.respx
+async def test_azure_ai_with_image_url(respx_mock: MockRouter):
+    """
+    Azure AI Studio must receive list content holding both text and image_url unchanged.
+
+    The endpoint is mocked with respx; the test asserts on the JSON body of the outgoing request.
+    """
+    litellm.set_verbose = True
+
+    # Canned chat.completion payload, modeled on an actual API response
+    mock_response = {
+        "id": "cmpl-53860ea1efa24d2883555bfec13d2254",
+        "choices": [
+            {
+                "finish_reason": "stop",
+                "index": 0,
+                "logprobs": None,
+                "message": {
+                    "content": "The image displays a graphic with the text 'LiteLLM' in black",
+                    "role": "assistant",
+                    "refusal": None,
+                    "audio": None,
+                    "function_call": None,
+                    "tool_calls": None,
+                },
+            }
+        ],
+        "created": 1731801937,
+        "model": "phi35-vision-instruct",
+        "object": "chat.completion",
+        "usage": {
+            "completion_tokens": 69,
+            "prompt_tokens": 617,
+            "total_tokens": 686,
+            "completion_tokens_details": None,
+            "prompt_tokens_details": None,
+        },
+    }
+
+    # Register a respx route for POSTs to the api_base that answers with the canned payload
+    mock_request = respx_mock.post(
+        "https://Phi-3-5-vision-instruct-dcvov.eastus2.models.ai.azure.com"
+    ).mock(return_value=httpx.Response(200, json=mock_response))
+
+    response = await litellm.acompletion(
+        model="azure_ai/Phi-3-5-vision-instruct-dcvov",
+        api_base="https://Phi-3-5-vision-instruct-dcvov.eastus2.models.ai.azure.com",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What is in this image?",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
+                        },
+                    },
+                ],
+            },
+        ],
+        api_key="fake-api-key",
+    )
+
+    # Verify the mocked route was actually hit
+    assert mock_request.called
+
+    # The content list (text + image_url) must appear in the request body as a list, not a string
+    request_body = json.loads(mock_request.calls[0].request.content)
+    assert request_body == {
+        "model": "Phi-3-5-vision-instruct-dcvov",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
+                        },
+                    },
+                ],
+            }
+        ],
+    }
+
+    print(f"response: {response}")