diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 81c746cce..1dbb4ecfa 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -177,7 +177,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if isinstance(request, ChatCompletionRequest):
             if media_present:
                 input_dict["messages"] = [
-                    await convert_message_to_openai_dict(m) for m in request.messages
+                    await convert_message_to_openai_dict(m, download=True)
+                    for m in request.messages
                 ]
             else:
                 input_dict["prompt"] = await chat_completion_request_to_prompt(
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index 7ee19fd7b..701b2ca3b 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -188,7 +188,7 @@ async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]:
 async def convert_image_content_to_url(
     media: ImageContentItem, download: bool = False, include_format: bool = True
 ) -> str:
-    if media.url and not download:
+    if media.url and (not download or media.url.uri.startswith("data")):
         return media.url.uri
 
     content, format = await localize_image_content(media)
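The two hunks work together: the vLLM adapter now passes `download=True` to `convert_message_to_openai_dict` so remote image URLs are fetched and inlined, while `convert_image_content_to_url` short-circuits when the URI is already a `data:` URI, so base64-embedded images are returned as-is instead of being treated as something to download. Below is a minimal sketch of the new guard condition; the `_URL` / `_ImageContentItem` dataclasses are simplified stand-ins for the real llama_stack types, not the actual API.

```python
# Sketch of the patched condition in convert_image_content_to_url,
# using hypothetical stand-in types instead of llama_stack's own.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _URL:
    uri: str


@dataclass
class _ImageContentItem:
    url: Optional[_URL] = None


def _should_return_uri_unchanged(media: _ImageContentItem, download: bool) -> bool:
    # Keep the URI untouched when no download was requested, or when the URI
    # is already an inline data URI (there is nothing remote to fetch).
    return bool(media.url) and (not download or media.url.uri.startswith("data"))


# Remote URL with download requested (the vLLM adapter's case): must be fetched.
assert not _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("https://example.com/cat.png")), download=True
)
# Remote URL without download: passed through unchanged.
assert _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("https://example.com/cat.png")), download=False
)
# Inline base64 data URI: never re-downloaded, even with download=True.
assert _should_return_uri_unchanged(
    _ImageContentItem(url=_URL("data:image/png;base64,iVBORw0KGgo=")), download=True
)
```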