mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-29 15:23:51 +00:00
Added Hadamard transform for SpinQuant
This commit is contained in:
parent
afae4e3d8e
commit
93472042f8
2 changed files with 103 additions and 2 deletions
|
@ -35,12 +35,11 @@ from termcolor import cprint
|
||||||
|
|
||||||
from llama_stack.apis.inference import * # noqa: F403
|
from llama_stack.apis.inference import * # noqa: F403
|
||||||
|
|
||||||
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
|
|
||||||
|
|
||||||
from llama_stack.distribution.utils.model_utils import model_local_dir
|
from llama_stack.distribution.utils.model_utils import model_local_dir
|
||||||
from llama_stack.providers.utils.inference.prompt_adapter import (
|
from llama_stack.providers.utils.inference.prompt_adapter import (
|
||||||
chat_completion_request_to_messages,
|
chat_completion_request_to_messages,
|
||||||
)
|
)
|
||||||
|
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
Fp8QuantizationConfig,
|
Fp8QuantizationConfig,
|
||||||
|
@ -159,6 +158,16 @@ class Llama:
|
||||||
model = Transformer(model_args)
|
model = Transformer(model_args)
|
||||||
model = convert_to_int4_quantized_model(model, model_args, config)
|
model = convert_to_int4_quantized_model(model, model_args, config)
|
||||||
model.load_state_dict(state_dict, strict=True)
|
model.load_state_dict(state_dict, strict=True)
|
||||||
|
|
||||||
|
if config.quantization.spinquant:
|
||||||
|
# Add a wrapper for adding hadamard transform for spinquant.
|
||||||
|
# This needs to be done after loading the state dict otherwise an error will be raised while
|
||||||
|
# loading the state dict.
|
||||||
|
from .quantization.hadamard_utils import (
|
||||||
|
add_hadamard_transform_for_spinquant,
|
||||||
|
)
|
||||||
|
|
||||||
|
add_hadamard_transform_for_spinquant(model)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Currently int4 and fp8 are the only supported quantization methods."
|
"Currently int4 and fp8 are the only supported quantization methods."
|
||||||
|
|
|
@ -0,0 +1,92 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
import math
|
||||||
|
import re
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
    """Apply the normalized Hadamard transform along the last dimension of `x`.

    The input is multiplied by the n x n Walsh-Hadamard matrix scaled by
    1/sqrt(n), where n is the size of the last dimension. Implemented as the
    fast Walsh-Hadamard transform: log2(n) butterfly passes, each scaled by
    1/sqrt(2), so the cost is O(n log n) instead of a dense matmul.

    Args:
        x: Input tensor whose last dimension must be a power of two.

    Returns:
        A tensor of the same shape as `x` containing the transformed values.

    Raises:
        ValueError: If the last dimension of `x` is not a power of two.
    """
    *_, n = x.shape
    m = int(math.log2(n))
    # Previously an `assert`, which is stripped under `python -O`; raise
    # explicitly so the power-of-two requirement is always enforced.
    if n != 1 << m:
        raise ValueError(f"last dimension must be a power of 2, got {n}")
    x = x[..., None]
    inv_sqrt2 = 0.5**0.5
    for _ in range(m):
        # One butterfly pass: pairwise sums and differences along dim -2,
        # re-packed into the trailing dimension.
        top = x[..., ::2, :] + x[..., 1::2, :]
        bot = x[..., ::2, :] - x[..., 1::2, :]
        x = torch.cat((top, bot), dim=-1)
        x *= inv_sqrt2
    return x.squeeze(-2)
|
||||||
|
|
||||||
|
|
||||||
|
class HadamardModule(torch.nn.Module):
    """Applies the Hadamard transform over groups of the input's last dimension.

    Args:
        group_size: Size of the groups the last dimension is split into
            before the Hadamard transform is applied.
    """

    def __init__(self, group_size: int) -> None:
        super().__init__()
        self.group_size = group_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # When the last dimension already equals the group size, transform
        # the tensor directly with no reshaping.
        if self.group_size == x.shape[-1]:
            return hadamard_transform(x)
        # Otherwise split the last dimension into groups of `group_size`,
        # transform each group independently, then restore the caller's shape.
        original_shape = x.shape
        grouped = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
        transformed = hadamard_transform(grouped)
        return transformed.reshape(original_shape)
|
||||||
|
|
||||||
|
|
||||||
|
def add_hadamard_transform_for_spinquant(
    model: torch.nn.Module, prefix: str = ""
) -> None:
    """Wrap each FFN output projection in `model` with a Hadamard transform.

    Recursively traverses the model's children looking for modules whose
    qualified name matches "layers.<digits>.feed_forward.w2". Each match is
    replaced in place with an `nn.Sequential` consisting of a `HadamardModule`
    followed by the original layer, so the Hadamard transform is applied to
    the layer's input.

    See the `SpinQuant <https://arxiv.org/abs/2405.16406>`_ paper for details.

    Args:
        model: An instance of 'torch.nn.Module' (e.g., Transformer model),
            modified in place.
        prefix: Qualified-name prefix accumulated during recursion; external
            callers normally leave this as "".

    Returns:
        None
    """
    # Dots are escaped so "." matches the literal name separator rather than
    # any character (an unescaped pattern could match unintended names).
    pattern_last_linear_ffn = r"layers\.\d+\.feed_forward\.w2"
    for module_name, module in model.named_children():
        child_full_name = prefix + "." + module_name
        if re.search(pattern_last_linear_ffn, child_full_name):
            # The transform must see the layer's input, so it precedes the
            # original linear module in the Sequential.
            new_module = nn.Sequential(
                HadamardModule(group_size=module.in_features), module
            )
            setattr(model, module_name, new_module)
        else:
            add_hadamard_transform_for_spinquant(
                module, (prefix + "." if prefix else prefix) + module_name
            )
|
Loading…
Add table
Add a link
Reference in a new issue