fix: DPOAlignmentConfig schema to use correct DPO parameters (#2804)

# What does this PR do?

This PR fixes the `DPOAlignmentConfig` schema to use the correct Direct Preference Optimization (DPO) parameters.

The current schema incorrectly uses PPO-inspired parameters (`reward_scale`, `reward_clip`, `epsilon`, `gamma`) that are not part of the DPO algorithm. This PR updates it to use the standard DPO parameters:

- `beta`: The KL divergence coefficient that controls deviation from the reference model
- `loss_type`: The type of DPO loss function (sigmoid, hinge, ipo, kto_pair)

These parameters align with standard DPO implementations like HuggingFace's TRL library.

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-43-83.ec2.internal>
2025-12-03 09:53:45 +00:00 · 2025-07-18 14:56:00 -04:00 · 2025-07-18 14:56:00 -04:00 · 874b1cb00f
commit 874b1cb00f
parent d994305f0a
3 changed files with 40 additions and 28 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -14470,28 +14470,31 @@
            "DPOAlignmentConfig": {
                "type": "object",
                "properties": {
-                    "reward_scale": {
-                        "type": "number"
-                    },
-                    "reward_clip": {
-                        "type": "number"
-                    },
-                    "epsilon": {
+                    "beta": {
                        "type": "number"
                    },
-                    "gamma": {
-                        "type": "number"
+                    "loss_type": {
+                        "$ref": "#/components/schemas/DPOLossType",
+                        "default": "sigmoid"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "reward_scale",
-                    "reward_clip",
-                    "epsilon",
-                    "gamma"
+                    "beta",
+                    "loss_type"
                ],
                "title": "DPOAlignmentConfig"
            },
+            "DPOLossType": {
+                "type": "string",
+                "enum": [
+                    "sigmoid",
+                    "hinge",
+                    "ipo",
+                    "kto_pair"
+                ],
+                "title": "DPOLossType"
+            },
            "DataConfig": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,21 +10111,24 @@ components:
    DPOAlignmentConfig:
      type: object
      properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
          type: number
+        loss_type:
+          $ref: '#/components/schemas/DPOLossType'
+          default: sigmoid
      additionalProperties: false
      required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
      title: DPOAlignmentConfig
+    DPOLossType:
+      type: string
+      enum:
+        - sigmoid
+        - hinge
+        - ipo
+        - kto_pair
+      title: DPOLossType
    DataConfig:
      type: object
      properties:
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -104,12 +104,18 @@ class RLHFAlgorithm(Enum):
    dpo = "dpo"


+@json_schema_type
+class DPOLossType(Enum):
+    sigmoid = "sigmoid"
+    hinge = "hinge"
+    ipo = "ipo"
+    kto_pair = "kto_pair"
+
+
@json_schema_type
class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: DPOLossType = DPOLossType.sigmoid


@json_schema_type