llama-stack-mirror/llama_stack/models/llama/llama3_2/prompts_vision.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import textwrap
from pathlib import Path

from llama_stack.models.llama.datatypes import (
    RawMediaItem,
    RawMessage,
    RawTextItem,
)

from ..prompt_format import (
    TextCompletionContent,
    UseCase,
    llama3_1_builtin_tool_call_dialog,
    # llama3_1_builtin_tool_call_with_image_dialog,
    llama3_2_user_assistant_conversation,
)


def usecases():
    this_dir = Path(__file__).parent.parent.resolve()
    with open(this_dir / "scripts/resources/dog.jpg", "rb") as f:
        img = f.read()
    return [
        llama3_2_user_assistant_conversation(),
        UseCase(
            title="User and assistant conversation with Images",
            description="This example shows how to pass an image to the model as part of the messages.",
            dialogs=[
                [
                    RawMessage(
                        role="user",
                        content=[
                            RawMediaItem(data=img),
                            RawTextItem(text="Describe this image in two sentences"),
                        ],
                    )
                ],
            ],
            notes=textwrap.dedent(
                """
                - The `<|image|>` tag is used to indicate the presence of an image in the prompt.
                - The model is not an early-fusion model, so it does not actually translate the image into several tokens. Instead, the cross-attention layers take input "on the side" from a vision encoder.
                ![Image](mm-model.png)
                - It is important to position the `<|image|>` tag appropriately in the prompt. The image will only attend to the subsequent text tokens.
                - The `<|image|>` tag is part of the user message body, so it should only come after the header `<|start_header_id|>{role}<|end_header_id|>` in the message body.
                - We recommend using a single image in one prompt.
                """
            ),
        ),
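        # A minimal sketch (illustrative, not emitted by this file): per the
        # notes above, the dialog renders roughly as the token sequence below,
        # with <|image|> inside the user message body, after the header:
        #
        #   <|begin_of_text|><|start_header_id|>user<|end_header_id|>
        #
        #   <|image|>Describe this image in two sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>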
        UseCase(
            title="Builtin and Zero Shot Tool Calling",
            description=textwrap.dedent(
                """
                Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only.
                Use `Environment: ipython` to enable tools.
                Add `Tools: {{tool_name1}},{{tool_name2}}` for each of the builtin tools.
                The same builtin tools as Llama3.1 are available:
                - code_interpreter (for executing python code)
                - brave_search (to search the web)
                - wolfram_alpha (for querying wolfram alpha for mathematical questions)
                """,
            ),
            dialogs=[llama3_1_builtin_tool_call_dialog()],
            notes=textwrap.dedent(
                """
                - Note the `<|python_tag|>` before the `brave_search` function call.
                - The `<|eom_id|>` tag is used to indicate the end of the message.
                - As with Llama3.1, `code_interpreter` is not explicitly mentioned in `Tools:` but is enabled via `Environment: ipython`.
                - Tool calling does NOT currently work with images in the prompt.
                """
            ),
        ),
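        # A minimal sketch (illustrative, not emitted by this file) of a
        # builtin tool-call turn in this format; the query string is a
        # made-up example:
        #
        #   <|start_header_id|>system<|end_header_id|>
        #
        #   Environment: ipython
        #   Tools: brave_search, wolfram_alpha<|eot_id|>
        #   ...
        #   <|start_header_id|>assistant<|end_header_id|>
        #
        #   <|python_tag|>brave_search.call(query="current weather in Menlo Park")<|eom_id|>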
        # UseCase(
        #     title="Tool Calling for vision models",
        #     description=textwrap.dedent(
        #         """
        #         While Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only,
        #         they are not able to do tool calling when the prompt contains image inputs (along with text).
        #         The recommended way is to separate the image understanding from the tool calling into successive prompts.
        #         Here is an example of how that could be done:
        #         """,
        #     ),
        #     dialogs=[llama3_1_builtin_tool_call_with_image_dialog()],
        #     notes=textwrap.dedent(
        #         """
        #         - Instead of a single prompt (image understanding + tool call), we split into two prompts to achieve the same result.
        #         """
        #     ),
        # ),
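        # A minimal sketch (illustrative, not emitted by this file) of the
        # two-prompt split described in the disabled use case above: first an
        # image-understanding prompt, then a text-only tool-calling prompt.
        #
        #   Prompt 1 (no tools):  user: <|image|>Describe what is in this picture.
        #   Prompt 2 (Environment: ipython):  user: Search the web for <description from prompt 1>.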
        UseCase(
            title="Prompt format for base models",
            description=textwrap.dedent(
                """
                For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), the prompt format for a simple completion is as follows:
                """
            ),
            dialogs=[
                TextCompletionContent(content="The color of the sky is blue but sometimes it can also be"),
            ],
            notes="- Same as Llama3.1",
        ),
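        # A minimal sketch (illustrative, not emitted by this file): base-model
        # completion uses no message headers; the prompt is just the BOS token
        # followed by the text to complete:
        #
        #   <|begin_of_text|>The color of the sky is blue but sometimes it can also be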
        UseCase(
            title="Prompt format for base models with Image",
            description=textwrap.dedent(
                """
                For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), here is an example of how the text completion format looks with an image:
                """
            ),
            dialogs=[
                TextCompletionContent(
                    content=[
                        RawMediaItem(data=img),
                        RawTextItem(text="If I had to write a haiku for this one"),
                    ]
                ),
            ],
            notes="- Note the placement of the special tags `<|begin_of_text|>` and `<|image|>`",
        ),
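        # A minimal sketch (illustrative, not emitted by this file): with an
        # image, the <|image|> tag follows <|begin_of_text|> and precedes the
        # text it should attend to:
        #
        #   <|begin_of_text|><|image|>If I had to write a haiku for this one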
    ]
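

if __name__ == "__main__":
    # A minimal usage sketch: build every use case and print its title.
    # It assumes only the constructors used above; rendering the full
    # prompt-format document is handled elsewhere in the repo.
    for uc in usecases():
        print(getattr(uc, "title", type(uc).__name__))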