Fix DPOAlignmentConfig schema to use correct DPO parameters

- Replace incorrect PPO-like parameters (reward_scale, reward_clip, epsilon, gamma)
- Add proper DPO parameters: beta (KL coefficient) and loss_type
- Update spec to reflect the correct schema
2025-12-22 22:39:41 +00:00 · 2025-07-17 19:55:44 +00:00 · 2025-07-17 19:55:44 +00:00 · 37875a1985
commit 37875a1985
parent 477bcd4d09
2 changed files with 13 additions and 15 deletions
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,20 +10111,20 @@ components:
    DPOAlignmentConfig:
      type: object
      properties:
-        reward_scale:
+        beta:
          type: number
        reward_clip:
          type: number
        epsilon:
          type: number
        gamma:
          type: number
        loss_type:
          type: string
          enum:
            - sigmoid
            - hinge
            - ipo
            - kto_pair
          default: sigmoid
      additionalProperties: false
      required:
-        - reward_scale
+        - beta
-        - reward_clip
+        - loss_type
        - epsilon
        - gamma
      title: DPOAlignmentConfig
    DataConfig:
      type: object
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -106,10 +106,8 @@ class RLHFAlgorithm(Enum):
@json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
+    beta: float
-    reward_clip: float
+    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"
    epsilon: float
    gamma: float
@json_schema_type