mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-23 00:52:26 +00:00
Fix DPOAlignmentConfig schema to use correct DPO parameters
- Replace incorrect PPO-like parameters (reward_scale, reward_clip, epsilon, gamma) - Add proper DPO parameters: beta (KL coefficient) and loss_type - Update spec to reflect the correct schema
This commit is contained in:
parent
477bcd4d09
commit
37875a1985
2 changed files with 13 additions and 15 deletions
|
|
@ -106,10 +106,8 @@ class RLHFAlgorithm(Enum):
|
|||
|
||||
@json_schema_type
class DPOAlignmentConfig(BaseModel):
    """Configuration for Direct Preference Optimization (DPO) alignment.

    DPO optimizes a policy directly from preference pairs and has no reward
    model, so PPO-style parameters (reward_scale, reward_clip, epsilon,
    gamma) do not apply and are intentionally absent from this schema.

    :param beta: KL-regularization coefficient controlling how far the
        policy may drift from the reference model (higher = stay closer).
    :param loss_type: Variant of the DPO objective to optimize. One of
        "sigmoid" (standard DPO), "hinge", "ipo", or "kto_pair".
        Defaults to "sigmoid".
    """

    beta: float
    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue