fix: DPOAlignmentConfig schema to use correct DPO parameters (#2804)

# What does this PR do?

This PR fixes the `DPOAlignmentConfig` schema to use the correct Direct Preference Optimization (DPO) parameters. The current schema incorrectly uses PPO-inspired parameters (`reward_scale`, `reward_clip`, `epsilon`, `gamma`) that are not part of the DPO algorithm.

This PR updates it to use the standard DPO parameters:

- `beta`: The KL divergence coefficient that controls deviation from the reference model
- `loss_type`: The type of DPO loss function (sigmoid, hinge, ipo, kto_pair)

These parameters align with standard DPO implementations like HuggingFace's TRL library.

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-43-83.ec2.internal>
2025-12-03 09:53:45 +00:00 · 2025-07-18 14:56:00 -04:00 · 2025-07-18 14:56:00 -04:00 · 874b1cb00f
commit 874b1cb00f
parent d994305f0a
3 changed files with 40 additions and 28 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -14470,28 +14470,31 @@
            "DPOAlignmentConfig": {
                "type": "object",
                "properties": {
-                    "reward_scale": {
+                    "beta": {
                        "type": "number"
                    },
-                    "reward_clip": {
-                        "type": "number"
-                    },
-                    "epsilon": {
-                        "type": "number"
-                    },
-                    "gamma": {
-                        "type": "number"
+                    "loss_type": {
+                        "$ref": "#/components/schemas/DPOLossType",
+                        "default": "sigmoid"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "reward_scale",
-                    "reward_clip",
-                    "epsilon",
-                    "gamma"
+                    "beta",
+                    "loss_type"
                ],
                "title": "DPOAlignmentConfig"
            },
+            "DPOLossType": {
+                "type": "string",
+                "enum": [
+                    "sigmoid",
+                    "hinge",
+                    "ipo",
+                    "kto_pair"
+                ],
+                "title": "DPOLossType"
+            },
            "DataConfig": {
                "type": "object",
                "properties": {