From 874b1cb00f5d3190b2e23a1edaca95b59ee98320 Mon Sep 17 00:00:00 2001
From: Nehanth Narendrula
Date: Fri, 18 Jul 2025 14:56:00 -0400
Subject: [PATCH] fix: DPOAlignmentConfig schema to use correct DPO parameters
 (#2804)

# What does this PR do?

This PR fixes the `DPOAlignmentConfig` schema to use the correct Direct Preference Optimization (DPO) parameters.

The current schema incorrectly uses PPO-inspired parameters (`reward_scale`, `reward_clip`, `epsilon`, `gamma`) that are not part of the DPO algorithm. This PR updates it to use the standard DPO parameters:

- `beta`: The KL divergence coefficient that controls deviation from the reference model
- `loss_type`: The type of DPO loss function (sigmoid, hinge, ipo, kto_pair)

These parameters align with standard DPO implementations like HuggingFace's TRL library.

---------

Co-authored-by: Ubuntu
---
 docs/_static/llama-stack-spec.html               | 29 ++++++++++---------
 docs/_static/llama-stack-spec.yaml               | 25 +++++++++-------
 .../apis/post_training/post_training.py          | 14 ++++++---
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index db5c57821..d7801ba1c 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -14470,28 +14470,31 @@
       "DPOAlignmentConfig": {
         "type": "object",
         "properties": {
-          "reward_scale": {
+          "beta": {
             "type": "number"
           },
-          "reward_clip": {
-            "type": "number"
-          },
-          "epsilon": {
-            "type": "number"
-          },
-          "gamma": {
-            "type": "number"
+          "loss_type": {
+            "$ref": "#/components/schemas/DPOLossType",
+            "default": "sigmoid"
           }
         },
         "additionalProperties": false,
         "required": [
-          "reward_scale",
-          "reward_clip",
-          "epsilon",
-          "gamma"
+          "beta",
+          "loss_type"
         ],
         "title": "DPOAlignmentConfig"
       },
+      "DPOLossType": {
+        "type": "string",
+        "enum": [
+          "sigmoid",
+          "hinge",
+          "ipo",
+          "kto_pair"
+        ],
+        "title": "DPOLossType"
+      },
       "DataConfig": {
         "type": "object",
         "properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 29ba9dede..be02e1e42 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,21 +10111,24 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
           type: number
+        loss_type:
+          $ref: '#/components/schemas/DPOLossType'
+          default: sigmoid
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
       title: DPOAlignmentConfig
+    DPOLossType:
+      type: string
+      enum:
+        - sigmoid
+        - hinge
+        - ipo
+        - kto_pair
+      title: DPOLossType
     DataConfig:
       type: object
       properties:
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index b196c8a17..f6860ea4b 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -104,12 +104,18 @@ class RLHFAlgorithm(Enum):
     dpo = "dpo"
 
 
+@json_schema_type
+class DPOLossType(Enum):
+    sigmoid = "sigmoid"
+    hinge = "hinge"
+    ipo = "ipo"
+    kto_pair = "kto_pair"
+
+
 @json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: DPOLossType = DPOLossType.sigmoid
 
 
 @json_schema_type
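
For reference, a minimal usage sketch of the schema introduced by this patch. It assumes only the fields added above (`beta`, `loss_type`) and that `DPOAlignmentConfig` / `DPOLossType` are importable from `llama_stack.apis.post_training` (they are defined in `post_training.py`; the exact import path may differ in your install).

```python
# Minimal sketch of the post-patch DPO config. `beta` is the KL-divergence
# coefficient that controls deviation from the reference model; `loss_type`
# selects the DPO loss variant and defaults to sigmoid per the schema above.
from llama_stack.apis.post_training import DPOAlignmentConfig, DPOLossType

# Default loss variant (sigmoid, the schema default).
default_cfg = DPOAlignmentConfig(beta=0.1)

# Explicitly selecting a different loss variant, e.g. IPO.
ipo_cfg = DPOAlignmentConfig(beta=0.1, loss_type=DPOLossType.ipo)
```

These field names mirror TRL's DPO configuration, where the corresponding arguments are likewise called `beta` and `loss_type`, which is the alignment this PR aims for.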