feat: introduce llama4 support (#1877)

As the title says: adds Llama 4 support. Details are in the README and elsewhere.
Ashwin Bharambe 2025-04-05 11:53:35 -07:00 committed by GitHub
parent 23a99a4b22
commit b8f1561956
61 changed files with 205,222 additions and 6,439 deletions


@@ -0,0 +1,216 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import math
from typing import Any, Callable, Dict, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
from ..args import VisionArgs
from .encoder import VisionEncoder
class PixelShuffle(nn.Module):
def __init__(self, ps_ratio):
super().__init__()
self.ps_ratio = ps_ratio
def forward(self, x):
# x: [B, N, C], N = number of patches
assert self.ps_ratio is not None, "ps_ratio is required for pixel shuffle"
assert x.dim() == 3, "pixel shuffle requires encoded patches [B, N, C]"
hh = ww = int(math.sqrt(x.shape[1]))
x = x.reshape(x.shape[0], hh, ww, -1)
x = pixel_shuffle_op(x, ps_ratio=self.ps_ratio)
pixel_shuffle_patches = x.reshape(x.shape[0], -1, x.shape[-1])
return pixel_shuffle_patches
def pixel_shuffle_op(input_x, ps_ratio):
n, w, h, c = input_x.size()
input_x = input_x.view(n, w, int(h * ps_ratio), int(c / ps_ratio))
input_x = input_x.permute(0, 2, 1, 3).contiguous()
input_x = input_x.view(
n,
int(h * ps_ratio),
int(w * ps_ratio),
int(c / (ps_ratio * ps_ratio)),
)
input_x = input_x.permute(0, 2, 1, 3).contiguous()
return input_x
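# Shape sketch (illustrative values, not from this diff): with ps_ratio=0.5 and a
# 16x16 patch grid, pixel shuffle folds spatial tokens into channels:
#
#     x = torch.randn(2, 256, 1024)        # [B, N=16*16, C]
#     y = PixelShuffle(ps_ratio=0.5)(x)
#     assert y.shape == (2, 64, 4096)      # N -> N * ps_ratio**2, C -> C / ps_ratio**2
#
# The total element count is unchanged: the sequence shrinks 4x while the channel
# dim grows 4x, which is what lets the adapter below hand fewer, wider tokens to
# the language model.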
class SimpleMLP(torch.nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
bias: bool = True,
dropout: float = 0.0,
act_layer: Callable = nn.GELU,
):
super().__init__()
# layers
self.c_fc = ColumnParallelLinear(
dim,
hidden_dim,
bias=bias,
gather_output=False,
)
self.c_proj = RowParallelLinear(
hidden_dim,
hidden_dim,
bias=bias,
input_is_parallel=True,
)
self.non_linearity = act_layer()
self.dropout = dropout
def forward(self, x):
hidden = self.c_fc(x)
hidden = self.non_linearity(hidden)
hidden = F.dropout(hidden, p=self.dropout, training=self.training)
return self.non_linearity(self.c_proj(hidden))
class PixelShuffleMLP(torch.nn.Module):
def __init__(
self,
ps_ratio: float,
input_dim: int,
output_dim: int = 4096,
add_fc: bool = False,
):
super().__init__()
self.pixel_shuffle = PixelShuffle(ps_ratio)
self.mlp = SimpleMLP(
int(input_dim // (ps_ratio**2)),
output_dim,
bias=False,
dropout=0.0,
act_layer=nn.GELU,
)
self.fc = nn.Identity()
if add_fc:
self.fc = ColumnParallelLinear(
output_dim,
output_dim,
bias=False,
)
def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
encoded_patches = self.pixel_shuffle(encoded_patches)
return self.fc(self.mlp(encoded_patches))
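# Adapter shape sketch, assuming (hypothetically) input_dim=1408, ps_ratio=0.5 and
# output_dim=4096: pixel shuffle maps [B, N, 1408] -> [B, N // 4, 5632], and
# SimpleMLP(5632 -> 4096) then yields [B, N // 4, 4096], i.e. one projected
# embedding per shuffled patch at the language-model width.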
class VisionEmbeddings(torch.nn.Module):
def __init__(self, args: VisionArgs):
super().__init__()
self.args = args
image_size = args.image_size
patch_size = args.patch_size
self.vision_encoder = VisionEncoder(
image_size=(image_size.height, image_size.width),
patch_size=(patch_size.height, patch_size.width),
dim=args.dim,
layers=args.n_layers,
heads=args.n_heads,
mlp_ratio=args.mlp_ratio,
)
self.vision_encoder = self.vision_encoder.to(torch.bfloat16)
self.vision_adapter = PixelShuffleMLP(
ps_ratio=args.pixel_shuffle_ratio,
input_dim=args.dim,
output_dim=args.output_dim,
)
self.output_dim = args.output_dim
self._register_load_state_dict_pre_hook(self.load_hook)
def load_hook(
self,
state_dict: Dict[str, Any],
prefix: str,
local_metadata: Dict[str, Any],
strict: bool = True,
missing_keys: List[str] = None,
unexpected_keys: List[str] = None,
error_msgs: List[str] = None,
return_state_dict: bool = False,
) -> None:
original_sd = self.state_dict()
for k in state_dict:
if k.startswith(prefix) and len(state_dict[k].shape) == 1 and state_dict[k].shape[0] == 0:
state_dict[k] = state_dict[k].reshape(original_sd[k[len(prefix) :]].shape)
def _get_empty_sequence(self, h):
return torch.zeros(
h.shape[0],
h.shape[1],
self.output_dim,
device=h.device,
dtype=h.dtype,
)
    # image_batch is batched; each batch sample contains a list of images, so this is List[List[torch.Tensor]]
# each image is a tensor of shape [num_tiles, C, H, W]
def forward(
self,
image_batch: List[List[torch.Tensor]],
image_mask: torch.Tensor,
h_ref: torch.Tensor,
) -> torch.Tensor:
images_flattened = [image for sample in image_batch for image in sample]
images_flattened = torch.vstack(images_flattened).unsqueeze(1).to(h_ref.dtype).to(h_ref.device)
embedding = self.vision_encoder(images_flattened)
projected_embedding = self.vision_adapter(embedding)
h_image = self._get_empty_sequence(h_ref)
return scatter_embeddings(image_batch, image_mask, h_image, projected_embedding)
def scatter_embeddings(image_batch, image_mask, h_image, encoded_patches_proj):
# If dynamic transform is used and the batch contains 2 images (where image_1 has 2 chunks and image_2 has 3 chunks),
# `num_images_per_sequence` now records the number of chunks per image as `[2, 3]`.
# `encoded_patches_proj.split` will then split the image chunks into 2 groups: `[image_1_chunks, image_2_chunks]`.
num_images_per_sequence = [sum(image.size(0) for image in sample_images) for sample_images in image_batch]
assert not torch.isnan(encoded_patches_proj).any()
assert sum(num_images_per_sequence) == encoded_patches_proj.size(0), (
f"{sum(num_images_per_sequence)=} != {encoded_patches_proj.shape=}"
)
encoded_patches_list = encoded_patches_proj.split(num_images_per_sequence, dim=0)
for index in range(h_image.size(0)):
encoded_patches_per_sample = encoded_patches_list[index]
sample_image_mask = image_mask[index]
if encoded_patches_per_sample.numel() == 0:
continue
encoded_patches_per_sample = encoded_patches_per_sample.contiguous().view(
-1, encoded_patches_per_sample.size(-1)
)
n_tokens_to_fill = sample_image_mask.sum()
assert n_tokens_to_fill <= encoded_patches_per_sample.size(0)
h_image[index].masked_scatter_(
sample_image_mask.expand(-1, h_image.size(-1)),
encoded_patches_per_sample[:n_tokens_to_fill],
)
return h_image
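# How the scatter lines up, with hypothetical sizes: for one sequence of length 8
# whose prompt contains a single 4-token image, image_mask[index] is an [8, 1] bool
# mask with True at the 4 image positions, and encoded_patches_per_sample is
# [4, output_dim]. masked_scatter_ copies those rows, in order, into exactly the
# masked positions of h_image[index]; all non-image positions remain zero.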


@@ -0,0 +1,411 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import fairscale.nn.model_parallel.initialize as fs_init
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
from torch import einsum
from ..args import ModelArgs
from ..model import Attention
class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""
def forward(self, x: torch.Tensor):
x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
return x
class ColumnParallelConv2dPatch(torch.nn.Module):
"""Conv2D Patching layer with model parallelism.
Column parallel over unfolded input.
Arguments:
in_channels: Input channels.
out_channels: Output channels.
kernel_size: Size of convolution kernel.
stride (default 1): Stride for convolution.
bias (default False): Use bias in Conv2d.
Input: (bsz, in_channels, height, width)
Output: (bsz, num_tokens, out_channels)
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]],
stride: Union[int, Tuple[int, int]],
bias: Optional[bool] = False,
) -> None:
super().__init__()
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
self._linear = ColumnParallelLinear(
in_channels * kernel_size[0] * kernel_size[1],
out_channels,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self._unfold(x)
x = x.permute(0, 2, 1)
x = self._linear(x)
return x
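# As used below (kernel_size == stride == patch_size), this is equivalent to a
# strided Conv2d patchify, just expressed as unfold + matmul so the projection can
# be column-parallel. A rough shape sketch, assuming 336x336 inputs and 14x14
# patches (illustrative numbers):
#
#     x = torch.randn(4, 3, 336, 336)   # (bsz, in_channels, H, W)
#     # unfold -> (4, 3*14*14, 576); permute -> (4, 576, 588); linear -> (4, 576, out_channels)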
class _FeedForward(torch.nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
dropout: float,
act_layer: Callable = nn.GELU,
):
super().__init__()
# layers
self.c_fc = ColumnParallelLinear(
dim,
hidden_dim,
bias=True,
gather_output=False,
init_method=lambda x: x,
)
self.c_proj = RowParallelLinear(
hidden_dim,
dim,
bias=True,
input_is_parallel=True,
init_method=lambda x: x,
)
self.non_linearity = act_layer()
self.dropout = dropout
def forward(self, x):
hidden = self.c_fc(x)
hidden = self.non_linearity(hidden)
hidden = F.dropout(hidden, p=self.dropout, training=self.training)
return self.c_proj(hidden)
class _TransformerBlock(nn.Module):
def __init__(
self,
d_model: int,
n_head: int,
mlp_ratio: float = 4.0,
act_layer: Callable = nn.GELU,
gated: bool = False,
):
super().__init__()
assert d_model % n_head == 0
self.n_heads = n_head
self.head_dim = d_model // self.n_heads
attn_args = ModelArgs(
dim=d_model,
head_dim=self.head_dim,
n_heads=self.n_heads,
n_kv_heads=self.n_heads,
)
self.attn = Attention(attn_args, use_rope=True, use_qk_norm=False, add_bias=True)
self.ln_1 = LayerNorm(d_model)
self.mlp = _FeedForward(
dim=d_model,
hidden_dim=int(mlp_ratio * d_model),
dropout=0.0,
act_layer=act_layer,
)
self.ln_2 = LayerNorm(d_model)
self.gated = gated
if gated:
self.gate_attn = nn.Parameter(torch.zeros(1))
self.gate_ffn = nn.Parameter(torch.zeros(1))
def attention(
self,
x: torch.Tensor,
freq_cis: Optional[torch.Tensor] = None,
):
return self.attn(x=x, start_pos=0, freqs_cis=freq_cis)
def forward(
self,
x: torch.Tensor,
mask: Optional[torch.Tensor] = None,
freq_cis: Optional[torch.Tensor] = None,
):
_gate_attn = 1 if not self.gated else self.gate_attn.tanh()
_gate_ffn = 1 if not self.gated else self.gate_ffn.tanh()
x = x + _gate_attn * self.attention(self.ln_1(x), freq_cis=freq_cis)
x = x + _gate_ffn * self.mlp(self.ln_2(x))
return x
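# Gating note: gate_attn and gate_ffn are initialized to zero, so a gated block
# starts as an identity mapping (tanh(0) == 0) and learns how much of each
# sub-layer to mix in; with gated=False both gates are fixed at 1 and this reduces
# to a standard pre-norm transformer block.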
class _Transformer(nn.Module):
def __init__(
self,
dim: int,
layers: int,
heads: int,
mlp_ratio: float = 4.0,
act_layer: Callable = nn.GELU,
gated: bool = False,
):
super().__init__()
self.resblocks = nn.ModuleList(
[
_TransformerBlock(
d_model=dim,
n_head=heads,
mlp_ratio=mlp_ratio,
act_layer=act_layer,
gated=gated,
)
for _ in range(layers)
]
)
def forward(self, x: torch.Tensor, return_intermediate=None, mask=None, freq_cis=None):
out = []
for idx, r in enumerate(self.resblocks):
if return_intermediate is not None and idx in return_intermediate:
out.append(x)
x = r(x, mask=mask, freq_cis=freq_cis)
if return_intermediate is not None:
return x, torch.stack(out, dim=-1)
return x
class PackingIndex:
Z = 0 # Z (time) coordinate of the token in the original sample
Y = 1 # Y (height) coordinate of the token in the original sample
X = 2 # X (width) coordinate of the token in the original sample
TIME = 3 # Total number of time units (frames) in the original sample
HEIGHT = 4 # Height of the original sample
WIDTH = 5 # Width of the original sample
# USE INDEX TO CHECK THE TYPE OF THE TOKEN (see ID fields below)
IDX = 6 # Full index of the token in the original sample (x + y * w + z * w * h)
BATCH_IDX = 7 # Which batch element this token belongs to. Note the batch idx of padding tokens is BATCH_SIZE
# Total size of the enum, remember to update this!
NUM_METADATA = 8
# Note: For padding tokens IDX = -1
# For cls tokens, IDX = -2
ID_CLS_TOKEN = -2
ID_PAD_TOKEN = -1
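# Illustration with a hypothetical 2x2 patch grid: img_idx becomes
# [0, 1, 2, 3, ID_CLS_TOKEN]. For the four patch tokens the packed metadata carries
# X = [0, 1, 0, 1], Y = [0, 0, 1, 1] and WIDTH = HEIGHT = 2, while
# IDX = [0, 1, 2, 3, -2]; the IDX < 0 entries are what the encoder uses below to
# zero out RoPE for cls (and padding) tokens.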
class VisionEncoder(nn.Module):
def __init__(
self,
image_size: Tuple[int, int],
patch_size: Tuple[int, int],
dim: int,
layers: int,
heads: int,
mlp_ratio: float,
in_channels: int = 3,
):
super().__init__()
self.image_size = image_size
self.patch_size = patch_size
self.grid_size = (
self.image_size[0] // self.patch_size[0],
self.image_size[1] // self.patch_size[1],
)
self.conv1 = ColumnParallelConv2dPatch(
in_channels=in_channels,
out_channels=dim,
kernel_size=patch_size,
stride=patch_size,
bias=False,
)
scale = dim**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(dim))
self.positional_embedding_vlm = nn.Parameter(
scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, dim)
)
self.ln_pre = LayerNorm(dim)
self.ln_post = LayerNorm(dim)
self.transformer = _Transformer(
dim,
layers,
heads,
mlp_ratio,
act_layer=nn.GELU,
)
# NOTE: hack for the fixed res
image_h, image_w = self.image_size
patch_h, patch_w = self.patch_size
idx_h, idx_w = image_h // patch_h, image_w // patch_w
img_idx = torch.arange(image_h * image_w // (patch_h * patch_w), dtype=torch.int32)
img_idx = img_idx.reshape(idx_h * idx_w, 1)
img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
img_idx[-1, -1] = PackingIndex.ID_CLS_TOKEN
packed_img_idx = torch.empty(
img_idx.shape[0],
img_idx.shape[1],
PackingIndex.NUM_METADATA - 1,
dtype=torch.int32,
)
packed_img_idx[:, :, PackingIndex.Y] = img_idx // idx_w
packed_img_idx[:, :, PackingIndex.X] = img_idx % idx_w
packed_img_idx[:, :, PackingIndex.HEIGHT].fill_(idx_h)
packed_img_idx[:, :, PackingIndex.WIDTH].fill_(idx_w)
packed_img_idx[:, :, PackingIndex.IDX] = img_idx
packed_img_idx = packed_img_idx.reshape(1, -1, PackingIndex.NUM_METADATA - 1)
self.packed_img_idx = packed_img_idx # for positional embedding load hook
# compute rope freqs
rope_freq = self.get_rope_freqs(dim // heads // 2)
freqs_x = self.compute_rope_freqs(rope_freq, packed_img_idx[:, :, PackingIndex.X] + 1)
freqs_y = self.compute_rope_freqs(rope_freq, packed_img_idx[:, :, PackingIndex.Y] + 1)
freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
# disable RoPE for padding and cls tokens
freqs = freqs.masked_fill(packed_img_idx[:, :, PackingIndex.IDX, None] < 0, 0)
# compute complex freqs
self.freq_cis = torch.view_as_complex(torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
# xlf automatically broadcasts
self.freq_cis = self.freq_cis.squeeze(0)
self.n_heads = heads // fs_init.get_model_parallel_world_size()
self._register_load_state_dict_pre_hook(self.load_hook)
def get_rope_freqs(self, dim, theta=10000):
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
return freqs
@torch.amp.autocast("cuda", enabled=False)
def compute_rope_freqs(self, freqs, t):
freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
freqs = freqs.repeat_interleave(2, dim=-1)
return freqs
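    # Shape note for the 2D RoPE above: with head_dim = dim // heads, get_rope_freqs
    # receives head_dim // 2 and returns head_dim // 4 base frequencies; each of
    # freqs_x / freqs_y expands to [1, ntok, head_dim // 2], and after concatenation
    # and the [..., ::2] stride the complex freq_cis is [ntok, head_dim // 2], so
    # half of each head's rotary dims rotate with the patch's X coordinate and half with Y.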
def load_hook(
self,
state_dict: Dict[str, Any],
prefix: str,
local_metadata: Dict[str, Any],
strict: bool = True,
missing_keys: List[str] = None,
unexpected_keys: List[str] = None,
error_msgs: List[str] = None,
return_state_dict: bool = False,
) -> None:
orig_pos_embed = state_dict.get(prefix + "positional_embedding")
if orig_pos_embed is not None and orig_pos_embed.shape[-2:] != self.positional_embedding_vlm.shape[-2:]:
raise ValueError(
f"Positional embedding shape {orig_pos_embed.shape} does not match expected shape {self.positional_embedding_vlm.shape}"
)
batch_size, token_per_image, _ = self.packed_img_idx.shape
# Input points for idx are [x, y, w, h]
idx = self.packed_img_idx.reshape(batch_size * token_per_image, 1, -1)
total_windows, window_size, _ = idx.shape
# Grid values are [-1, 1] and coords are w, h
grid = (
(idx[:, :, [PackingIndex.X, PackingIndex.Y]] / idx[:, :, [PackingIndex.WIDTH, PackingIndex.HEIGHT]]) * 2 - 1
)[None, ...]
# In this mode, cls token has no position embedding
if orig_pos_embed is not None:
posemb = (
orig_pos_embed[1:].view(1, self.grid_size[0], self.grid_size[1], -1).permute(0, 3, 1, 2).contiguous()
)
posemb = posemb.to(device=grid.device, dtype=grid.dtype)
sample = F.grid_sample(
posemb, grid, padding_mode="zeros"
) # padding tokens / class token will get zero for posemb
sample = sample.view(-1, total_windows, window_size).permute(1, 2, 0).contiguous()
sample = torch.where(
idx[:, :, PackingIndex.IDX, None] == PackingIndex.ID_CLS_TOKEN,
orig_pos_embed[0].view(1, 1, -1).to(device=sample.device, dtype=sample.dtype),
sample,
)
new_pos_embed = sample.reshape(batch_size, token_per_image, -1)
state_dict[prefix + "positional_embedding_vlm"] = new_pos_embed.squeeze(0)
if return_state_dict:
return state_dict
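    # In effect, this hook re-expresses the checkpoint's (grid_h * grid_w + 1) learned
    # positional table in the packed-token layout used here: the grid part is sampled
    # with grid_sample at each token's normalized (x, y) coordinate, the cls token
    # keeps the checkpoint's cls embedding, and the result is written back into the
    # state dict as positional_embedding_vlm.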
def apply_class_embedding(self, x):
x = torch.cat(
[
x,
self.class_embedding.to(x.dtype)
+ torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
],
dim=1,
) # shape = [*, grid ** 2 + 1, width]
return x
def forward(self, images: torch.Tensor) -> torch.Tensor:
# NOTE: in Llama4 bsz=bsz*num_tiles, num_chunks=1
if images.ndim == 5:
num_concurrent_media = 1
bsz, num_chunks, nch, h, w = images.shape
else:
bsz, num_concurrent_media, num_chunks, nch, h, w = images.shape
images = images.reshape(bsz * num_concurrent_media * num_chunks, nch, h, w)
# patch embedding
x = images.reshape(bsz * num_concurrent_media * num_chunks, nch, h, w)
x = self.conv1(x) # shape = [*, width, grid ** 2]
_, ntok, dim = x.shape
x = x.reshape(bsz * num_concurrent_media * num_chunks, ntok, dim)
# apply cls token
x = self.apply_class_embedding(x)
ntok += 1
# apply position embeddings
if self.positional_embedding_vlm is not None:
x = x + self.positional_embedding_vlm.to(x.dtype)
x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)
x = self.ln_pre(x)
x = x.view(bsz * num_concurrent_media, -1, dim)
freq_cis = self.freq_cis.to(images.device)
tf_output = self.transformer(
x,
freq_cis=freq_cis,
)
int_x = None
if isinstance(tf_output, tuple):
x, int_x = tf_output
else:
x = tf_output
x = self.ln_post(x)
# remove cls token output
x = x[:, :-1, :]
# add and output x + int_x features
if int_x is not None:
int_x = int_x[:, :-1, :, :]
int_x = int_x.reshape(bsz * num_concurrent_media, ntok - 1, -1)
x = torch.cat([x, int_x], dim=-1)
return x
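# End-to-end shape sketch for one tile, assuming (hypothetically) 336x336 images,
# 14x14 patches and pixel_shuffle_ratio=0.5: conv1 patchifies to [B, 576, dim],
# the cls token makes it [B, 577, dim], the transformer and ln_post keep that
# shape, and dropping the cls token returns [B, 576, dim]. VisionEmbeddings then
# runs the adapter, which pixel-shuffles to 12x12 = 144 tokens and projects each
# to output_dim, so every input tile contributes 144 image tokens to the sequence.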