Fix precommit check after moving to ruff (#927)

The lint check on the main branch is failing. This fixes the lint check after we moved to ruff in https://github.com/meta-llama/llama-stack/pull/921. We need to move to a `ruff.toml` file, as well as fix and ignore some additional checks. Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-02 09:46:45 -05:00 · 2025-02-02 09:46:45 -05:00 · 34ab7a3b6c
commit 34ab7a3b6c
parent 4773092dd1
217 changed files with 981 additions and 2681 deletions
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/loader.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/loader.py
@ -63,12 +63,8 @@ def convert_to_fp8_quantized_model(
    # Move weights to GPU with quantization
    if llama_model.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
        log.info("Loading fp8 scales...")
-        fp8_scales_path = os.path.join(
-            checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
-        )
-        assert os.path.isfile(
-            fp8_scales_path
-        ), f"fp8_scales_path not found for rank {get_model_parallel_rank()}"
+        fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt")
+        assert os.path.isfile(fp8_scales_path), f"fp8_scales_path not found for rank {get_model_parallel_rank()}"
        fp8_scales = torch.load(fp8_scales_path, weights_only=True)

        for block in model.layers:
@ -81,9 +77,7 @@ def convert_to_fp8_quantized_model(
                    param = getattr(block.feed_forward, key)
                    param.weight = load_fp8(
                        param.weight,
-                        fp8_scales[
-                            f"{block.layer_id}_feed_forward.{key}_{get_model_parallel_rank()}"
-                        ],
+                        fp8_scales[f"{block.layer_id}_feed_forward.{key}_{get_model_parallel_rank()}"],
                        fp8_activation_scale_ub,
                    )
    else:
@ -172,9 +166,7 @@ class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
        if prefix + "zeros" not in state_dict:
            # Zero-point may not be saved in the state dict. In this case, we assume it's zero.
            assert prefix + "scales" in state_dict
-            state_dict[prefix + "zeros"] = torch.zeros_like(
-                state_dict[prefix + "scales"]
-            )
+            state_dict[prefix + "zeros"] = torch.zeros_like(state_dict[prefix + "scales"])

    def forward(self, input_: torch.Tensor) -> torch.Tensor:
        module_out = super().forward(input_)
@ -229,9 +221,7 @@ class Int8WeightLinear(torch.nn.Linear):
        bias: Whether to use bias.
    """

-    def __init__(
-        self, in_features: int, out_features: int, bias: bool = True, device=None
-    ) -> None:
+    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None) -> None:
        super().__init__(in_features, out_features, bias, device=device)

        self._register_load_state_dict_pre_hook(self.load_hook)
@ -295,9 +285,7 @@ def _prepare_model_int4_weight_int8_dynamic_activation(
            del module
            setattr(model, module_name, quantized_module)
        else:
-            _prepare_model_int4_weight_int8_dynamic_activation(
-                module, group_size, lora_rank, lora_scale
-            )
+            _prepare_model_int4_weight_int8_dynamic_activation(module, group_size, lora_rank, lora_scale)

    return model

@ -321,9 +309,7 @@ def convert_to_int4_quantized_model(

    group_size = model_args.quantization_args.group_size
    if group_size is None:
-        raise ValueError(
-            "'group_size' cannot be None in 'quantization_args'. Please specify it."
-        )
+        raise ValueError("'group_size' cannot be None in 'quantization_args'. Please specify it.")

    if model_args.lora_args is None:
        # Certain quantized models (e.g., SpinQuant) may not have LoRA.
@ -333,8 +319,6 @@ def convert_to_int4_quantized_model(
        lora_rank = model_args.lora_args.rank
        lora_scale = model_args.lora_args.scale

-    _prepare_model_int4_weight_int8_dynamic_activation(
-        model, group_size, lora_rank, lora_scale
-    )
+    _prepare_model_int4_weight_int8_dynamic_activation(model, group_size, lora_rank, lora_scale)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    return model.to(device)