# What does this PR do?

Move around bits. This makes the copies from llama-models _much_ easier to maintain and ensures we don't entangle meta-reference specific tidbits into llama-models code even by accident.

Also, kills the meta-reference-quantized-gpu distro and rolls quantization deps into meta-reference-gpu.

## Test Plan

```
LLAMA_MODELS_DEBUG=1 \
  with-proxy llama stack run meta-reference-gpu \
  --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --env INFERENCE_CHECKPOINT_DIR=<DIR> \
  --env MODEL_PARALLEL_SIZE=4 \
  --env QUANTIZATION_TYPE=fp8_mixed
```

Start a server with and without quantization. Point integration tests to it using:

```
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config http://localhost:8321 --text-model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import textwrap
from io import BytesIO
from pathlib import Path
from typing import List

from ..datatypes import RawMediaItem, RawMessage, RawTextItem
from ..prompt_format import (
    Llama4UseCase,
    TextCompletionContent,
    UseCase,
)

THIS_DIR = Path(__file__).parent


def usecases(base_model: bool = False) -> List[UseCase | str]:
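    """Build the list of use cases (plus markdown section headers) rendered
    into the Llama 4 prompt-format documentation.

    If ``base_model`` is True, base-model text-completion examples are
    included ahead of the instruct-model examples.
    """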
    with open(THIS_DIR.parent / "resources/small_dog.jpg", "rb") as f:
        img_small_dog = f.read()
    with open(THIS_DIR.parent / "resources/dog.jpg", "rb") as f:
        img_dog = f.read()
    with open(THIS_DIR.parent / "resources/pasta.jpeg", "rb") as f:
        img_pasta = f.read()
    out = []
    out.extend(
        [
            textwrap.dedent(
                """
                # Llama 4 - Prompt Formats
                ## Tokens
                Here is a list of special tokens that are supported by Llama 4:
                - `<|begin_of_text|>`: Specifies the start of the prompt
                - `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
                - `<|header_start|>` and `<|header_end|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user and assistant].
                - `<|eot|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
                    - at the end of a direct interaction between the model and the user
                    - at the end of multiple interactions between the model and any available tools
                    This token signals to the executor that the model has finished generating a response.
                - `<|image_start|>` and `<|image_end|>`: These tokens enclose the image data in the prompt.
                - `<|patch|>`: This token represents a piece of the tile/image.
                - `<|tile_y_separator|>` and `<|tile_x_separator|>`: These tokens are used to separate the y and x tiles of an image
                - `<|image|>`: In the new architecture, this token now separates the regular sized image information from a downsized version of it that fits in a single tile. The longer side is used for calculating the scale factor and the rest is padded to fit the tile.
                """
            ),
            textwrap.dedent(
                """
                There are 3 different roles that are supported by Llama 4
                - `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
                - `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
                - `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `tool` and `user` prompts.
                """
            ),
        ]
    )

    if base_model:
        out.extend(
            [
                "# Llama 4 Base Model",
                Llama4UseCase(
                    title="Text completion - Paris information",
                    description="Text completion for Llama 4 base model uses this format.",
                    dialogs=[TextCompletionContent(content="The capital of France is Paris")],
                ),
                Llama4UseCase(
                    title="Text completion - The color of the sky",
                    description="Text completion for Llama 4 base model uses this format.",
                    dialogs=[
                        TextCompletionContent(content="The color of the sky is blue but sometimes it can also be")
                    ],
                    notes="",
                ),
                Llama4UseCase(
                    title="Text completion - Translation example",
                    description="Text completion for Llama 4 base model uses this format.",
                    dialogs=[
                        TextCompletionContent(
                            content="""apple is pomme,
bannana is banane,
cherry is"""
                        )
                    ],
                    notes="",
                ),
            ]
        )

    out.extend(
        [
            "# Llama 4 Instruct Model",
            Llama4UseCase(
                title="Simple User and assistant conversation",
description="Here is a regular multi-turn user assistant conversation and how its formatted.",
|
|
                dialogs=[
                    [
                        RawMessage(role="system", content="You are a helpful assistant"),
                        RawMessage(
                            role="user",
                            content="Answer who are you in the form of jeopardy?",
                        ),
                    ]
                ],
                notes="",
                max_gen_len=512,
            ),
            "# Image prompt format",
            Llama4UseCase(
                title="Single image prompt format - small image",
                description="This example passes an image that is smaller than the tile size, to show the tile separator tokens are not needed",
                dialogs=[
                    [
                        RawMessage(
                            role="user",
                            content=[
                                RawMediaItem(data=BytesIO(img_small_dog)),
                                RawTextItem(text="Describe this image in two sentences"),
                            ],
                        )
                    ]
                ],
                notes="""Notice the structure of the image section:
```
<|image_start|><|image|><|patch|>...<|patch|><|image_end|>
```
This is due to the image being smaller than the tile size.
""",
                max_gen_len=512,
            ),
            Llama4UseCase(
                title="Single image prompt format",
                description="Here is an example of how to pass an image to the model",
                dialogs=[
                    [
                        RawMessage(
                            role="user",
                            content=[
                                RawMediaItem(data=BytesIO(img_dog)),
                                RawTextItem(text="Describe this image in two sentences"),
                            ],
                        )
                    ]
                ],
                notes="""With a bigger image, the image will include the tile separator tokens. Additionally, the image tag now separates a scaled down version of the image from the regular sized image.
```
<|image_start|><|patch|>...<|patch|><|tile_x_separator|><|patch|>...<|patch|><|tile_y_separator|><|patch|>...<|patch|><|image|><|patch|>...<|patch|><|image_end|>
```
""",
                max_gen_len=1024,
            ),
            Llama4UseCase(
                title="Multiple images prompt format",
description="Here is an example of how to pass an image to the model",
|
|
                dialogs=[
                    [
                        RawMessage(
                            role="user",
                            content=[
                                RawMediaItem(data=BytesIO(img_dog)),
                                RawMediaItem(data=BytesIO(img_pasta)),
                                RawTextItem(text="Describe these images in two sentences"),
                            ],
                        )
                    ]
                ],
notes="With multiple images, each one is encapsulated in their corresponding image tags.",
|
|
                max_gen_len=4096,
            ),
            "# Tool calling\nWe are continuing the format for zero shot function calling used in previous versions of Llama. All available functions can be provided either in the system message or in the user message.",
            Llama4UseCase(
                title="Zero shot function calling - system message",
                dialogs=[
                    [
                        RawMessage(
                            role="system",
                            content="""You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.

If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.

Here is a list of functions in JSON format that you can invoke.

[
    {
        "name": "get_weather",
        "description": "Get weather info for places",
        "parameters": {
            "type": "dict",
            "required": [
                "city"
            ],
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The name of the city to get the weather for"
                },
                "metric": {
                    "type": "string",
                    "description": "The metric for weather. Options are: celsius, fahrenheit",
                    "default": "celsius"
                }
            }
        }
    }
""",
                        ),
                        RawMessage(
                            role="user",
                            content="What is the weather in SF and Seattle?",
                        ),
                    ]
                ],
                notes=textwrap.dedent(
                    """
                    - The output supports multiple, and parallel tool calls natively
                    - JSON format for defining the functions in the system prompt is similar to Llama3.1
                    """
                ),
            ),
            Llama4UseCase(
                title="Zero shot function calling - user message",
                description=textwrap.dedent(
                    """
                    Similar to the above example, you can also provide information for all the available tools in the user message.
                    """
                ),
                dialogs=[
                    [
                        RawMessage(
                            role="user",
                            content="""Questions: Can you retrieve the details for the user with the ID 7890, who has black as their special request?
Here is a list of functions in JSON format that you can invoke:
[
    {
        "name": "get_user_info",
        "description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
        "parameters": {
            "type": "dict",
            "required": [
                "user_id"
            ],
            "properties": {
                "user_id": {
                    "type": "integer",
                    "description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
                },
                "special": {
                    "type": "string",
                    "description": "Any special information or parameters that need to be considered while fetching user details.",
                    "default": "none"
                }
            }
        }
    }
]

Should you decide to return the function call(s), put them in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]

You SHOULD NOT include any other text in the response.""",
                        ),
                    ]
                ],
                notes=textwrap.dedent(
                    """
                    - The tool call format for the model is the same whether your function calls are provided in the system or user message.
                    """
                ),
            ),
            Llama4UseCase(
                title="Tool calling with custom formats",
                description=textwrap.dedent(
                    """
                    Here is an example of how you could also write custom instructions for model to do zero shot tool calling.
                    In this example, we define a custom tool calling format using the `<function>` tag.
                    """
                ),
                dialogs=[
                    [
                        RawMessage(
                            role="user",
                            content="""You have access to the following functions:\nUse the function 'trending_songs' to 'Returns the trending songs on a Music site':\n{"name": "trending_songs", "description": "Returns the trending songs on a Music site", "parameters": {"genre": {"description": "The genre of the songs to return", "param_type": "str", "required": false}, "n": {"description": "The number of songs to return", "param_type": "int", "required": true}}}\n\nThink very carefully before calling functions.\nIf you choose to call a function ONLY reply in the following format with no prefix or suffix:\n\n<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- If looking for real time information use relevant functions before falling back to brave_search
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line<|eot_id|>""",
                        ),
                        RawMessage(
                            role="user",
                            content="Use tools to get latest trending songs",
                        ),
                    ]
                ],
            ),
        ]
    )

    return out
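For context on how this file is consumed, here is a minimal, hypothetical sketch of stitching the `usecases()` output into a markdown report. The actual rendering helpers live in `prompt_format.py` and are not shown here, so the sketch relies only on what is visible above: the returned list mixes plain markdown strings with `Llama4UseCase` objects constructed with `title`, `description`, and `notes`. The module path `.prompts`, the `render_report` helper, and the attribute access on `Llama4UseCase` are assumptions, not part of this PR.

```
# Hypothetical usage sketch (not part of the PR): render usecases() into markdown.
# Assumes this file is importable as .prompts and that Llama4UseCase exposes the
# keyword arguments it is constructed with (title, description, notes) as attributes.
from .prompts import usecases


def render_report(base_model: bool = False) -> str:
    sections = []
    for item in usecases(base_model=base_model):
        if isinstance(item, str):
            # Plain strings are already markdown headers / prose sections.
            sections.append(item)
        else:
            # For a use case, emit its title and prose; the real tooling in
            # prompt_format.py would also tokenize the dialogs and show the
            # resulting prompt, which is omitted here.
            parts = [f"## {item.title}", getattr(item, "description", "") or ""]
            if getattr(item, "notes", ""):
                parts.append(item.notes)
            sections.append("\n\n".join(p for p in parts if p))
    return "\n\n".join(sections)


# Example: print(render_report(base_model=True))
```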