From 3a9468ce9b06dc066d20622c3e5e916163ba2218 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Fri, 17 Jan 2025 18:33:40 -0800
Subject: [PATCH] fix again vllm for non base64 (#818)

# What does this PR do?

- previous fix introduced regression for non base64 image
- add back download, and base64 check


## Test Plan

<img width="835" alt="image"
src="https://github.com/user-attachments/assets/b70bf725-035a-4b42-b492-53daaf71458a"
/>


## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
---
 llama_stack/providers/remote/inference/vllm/vllm.py     | 3 ++-
 llama_stack/providers/utils/inference/prompt_adapter.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 81c746cce..1dbb4ecfa 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -177,7 +177,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if isinstance(request, ChatCompletionRequest):
             if media_present:
                 input_dict["messages"] = [
-                    await convert_message_to_openai_dict(m) for m in request.messages
+                    await convert_message_to_openai_dict(m, download=True)
+                    for m in request.messages
                 ]
             else:
                 input_dict["prompt"] = await chat_completion_request_to_prompt(
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index 7ee19fd7b..701b2ca3b 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -188,7 +188,7 @@ async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]:
 async def convert_image_content_to_url(
     media: ImageContentItem, download: bool = False, include_format: bool = True
 ) -> str:
-    if media.url and not download:
+    if media.url and (not download or media.url.uri.startswith("data")):
         return media.url.uri
 
     content, format = await localize_image_content(media)