fix: Mirror llama4 rope scaling fixes, small model simplify (#1917)

See:
- https://github.com/meta-llama/llama-models/pull/322
- https://github.com/meta-llama/llama-models/pull/320
Ashwin Bharambe 2025-04-09 11:28:45 -07:00 committed by GitHub
parent 770b38f8b5
commit e2299291c4
2 changed files with 36 additions and 28 deletions


@@ -70,6 +70,9 @@ class ModelArgs(BaseModel):
    attention_chunk_size: Optional[int] = None
    rope_theta: float = 500000
    use_scaled_rope: bool = False
    rope_scaling_factor: Optional[float] = None
    rope_high_freq_factor: Optional[float] = None
    nope_layer_interval: Optional[int] = None  # No position encoding in every n layers
    use_qk_norm: bool = False
    # Set to True to enable inference-time temperature tuning (useful for very long context)
@@ -92,4 +95,14 @@ class ModelArgs(BaseModel):
            f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
        )
        assert self.dim % self.n_heads == 0, f"dim ({self.dim}) must be divisible by n_heads ({self.n_heads})"
        if self.use_scaled_rope:
            # NOTE: ideally these values should have come from params.json. However, we have
            # shipped the models everywhere. Only Llama-4-Scout uses scaled rope and needs these
            # specific values.
            if self.rope_scaling_factor is None:
                self.rope_scaling_factor = 16
            if self.rope_high_freq_factor is None:
                self.rope_high_freq_factor = 1
        return self
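
Taken together: a config with use_scaled_rope=True that leaves both fields unset ends up with the Scout values (16 and 1), which would then feed the frequency scaling, e.g. via the sketch after the first hunk. The base frequencies and head_dim below are illustrative assumptions, not values from this diff; only rope_theta=500000 comes from the code above.

head_dim = 128  # assumption, not from this diff
base_freqs = [500000.0 ** (-2 * i / head_dim) for i in range(head_dim // 2)]
scaled_freqs = apply_rope_scaling_sketch(
    base_freqs,
    rope_scaling_factor=16,   # default filled in by the validator above
    rope_high_freq_factor=1,  # default filled in by the validator above
)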