Fix DPOAlignmentConfig schema to use correct DPO parameters

- Replace incorrect PPO-like parameters (reward_scale, reward_clip, epsilon, gamma)
- Add proper DPO parameters: beta (KL coefficient) and loss_type
- Update spec to reflect the correct schema
2025-12-22 20:40:00 +00:00 · 2025-07-17 19:55:44 +00:00 · 2025-07-17 19:55:44 +00:00 · 37875a1985
commit 37875a1985
parent 477bcd4d09
2 changed files with 13 additions and 15 deletions
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,20 +10111,20 @@ components:
    DPOAlignmentConfig:
      type: object
      properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
          type: number
+        loss_type:
+          type: string
+          enum:
+            - sigmoid
+            - hinge
+            - ipo
+            - kto_pair
+          default: sigmoid
      additionalProperties: false
      required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
      title: DPOAlignmentConfig
    DataConfig:
      type: object
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -106,10 +106,8 @@ class RLHFAlgorithm(Enum):

@json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"


@json_schema_type