diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 81c746cce..1dbb4ecfa 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -177,7 +177,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if isinstance(request, ChatCompletionRequest):
             if media_present:
                 input_dict["messages"] = [
-                    await convert_message_to_openai_dict(m) for m in request.messages
+                    await convert_message_to_openai_dict(m, download=True)
+                    for m in request.messages
                 ]
             else:
                 input_dict["prompt"] = await chat_completion_request_to_prompt(
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index 7ee19fd7b..701b2ca3b 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -188,7 +188,7 @@ async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]:
 async def convert_image_content_to_url(
     media: ImageContentItem, download: bool = False, include_format: bool = True
 ) -> str:
-    if media.url and not download:
+    if media.url and (not download or media.url.uri.startswith("data")):
         return media.url.uri
 
     content, format = await localize_image_content(media)
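The two hunks work together: the vLLM adapter now passes `download=True` to `convert_message_to_openai_dict` so remote image URLs are fetched and inlined, while `convert_image_content_to_url` short-circuits when the URI is already a `data:` URI, so base64-embedded images are returned as-is instead of being treated as something to download. Below is a minimal sketch of the new guard condition; the `_URL` / `_ImageContentItem` dataclasses are simplified stand-ins for the real llama_stack types, not the actual API.

```python
# Sketch of the patched condition in convert_image_content_to_url,
# using hypothetical stand-in types instead of llama_stack's own.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _URL:
    uri: str


@dataclass
class _ImageContentItem:
    url: Optional[_URL] = None


def _should_return_uri_unchanged(media: _ImageContentItem, download: bool) -> bool:
    # Keep the URI untouched when no download was requested, or when the URI
    # is already an inline data URI (there is nothing remote to fetch).
    return bool(media.url) and (not download or media.url.uri.startswith("data"))


# Remote URL with download requested (the vLLM adapter's case): must be fetched.
assert not _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("https://example.com/cat.png")), download=True
)
# Remote URL without download: passed through unchanged.
assert _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("https://example.com/cat.png")), download=False
)
# Inline base64 data URI: never re-downloaded, even with download=True.
assert _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("data:image/png;base64,iVBORw0KGgo=")), download=True
)
```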