From 874b1cb00f5d3190b2e23a1edaca95b59ee98320 Mon Sep 17 00:00:00 2001
From: Nehanth Narendrula
Date: Fri, 18 Jul 2025 14:56:00 -0400
Subject: [PATCH] fix: DPOAlignmentConfig schema to use correct DPO parameters
 (#2804)

# What does this PR do?

This PR fixes the `DPOAlignmentConfig` schema to use the correct Direct Preference Optimization (DPO) parameters.

The current schema incorrectly uses PPO-inspired parameters (`reward_scale`, `reward_clip`, `epsilon`, `gamma`) that are not part of the DPO algorithm. This PR updates it to use the standard DPO parameters:

- `beta`: The KL divergence coefficient that controls deviation from the reference model
- `loss_type`: The type of DPO loss function (sigmoid, hinge, ipo, kto_pair)

These parameters align with standard DPO implementations like HuggingFace's TRL library.

---------

Co-authored-by: Ubuntu
---
 docs/_static/llama-stack-spec.html               | 29 ++++++++++---------
 docs/_static/llama-stack-spec.yaml               | 25 +++++++++-------
 .../apis/post_training/post_training.py          | 14 ++++++---
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index db5c57821..d7801ba1c 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -14470,28 +14470,31 @@
       "DPOAlignmentConfig": {
         "type": "object",
         "properties": {
-          "reward_scale": {
+          "beta": {
             "type": "number"
           },
-          "reward_clip": {
-            "type": "number"
-          },
-          "epsilon": {
-            "type": "number"
-          },
-          "gamma": {
-            "type": "number"
+          "loss_type": {
+            "$ref": "#/components/schemas/DPOLossType",
+            "default": "sigmoid"
           }
         },
         "additionalProperties": false,
         "required": [
-          "reward_scale",
-          "reward_clip",
-          "epsilon",
-          "gamma"
+          "beta",
+          "loss_type"
         ],
         "title": "DPOAlignmentConfig"
       },
+      "DPOLossType": {
+        "type": "string",
+        "enum": [
+          "sigmoid",
+          "hinge",
+          "ipo",
+          "kto_pair"
+        ],
+        "title": "DPOLossType"
+      },
       "DataConfig": {
         "type": "object",
         "properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 29ba9dede..be02e1e42 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,21 +10111,24 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
           type: number
+        loss_type:
+          $ref: '#/components/schemas/DPOLossType'
+          default: sigmoid
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
       title: DPOAlignmentConfig
+    DPOLossType:
+      type: string
+      enum:
+        - sigmoid
+        - hinge
+        - ipo
+        - kto_pair
+      title: DPOLossType
     DataConfig:
       type: object
       properties:
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index b196c8a17..f6860ea4b 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -104,12 +104,18 @@ class RLHFAlgorithm(Enum):
     dpo = "dpo"
 
 
+@json_schema_type
+class DPOLossType(Enum):
+    sigmoid = "sigmoid"
+    hinge = "hinge"
+    ipo = "ipo"
+    kto_pair = "kto_pair"
+
+
 @json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: DPOLossType = DPOLossType.sigmoid
 
 
 @json_schema_type
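
For reference, a minimal usage sketch of the schema introduced by this patch. It assumes only the fields added above (`beta`, `loss_type`) and that `DPOAlignmentConfig` / `DPOLossType` are importable from `llama_stack.apis.post_training` (they are defined in `post_training.py`; the exact import path may differ in your install).

```python
# Minimal sketch of the post-patch DPO config. `beta` is the KL-divergence
# coefficient that controls deviation from the reference model; `loss_type`
# selects the DPO loss variant and defaults to sigmoid per the schema above.
from llama_stack.apis.post_training import DPOAlignmentConfig, DPOLossType

# Default loss variant (sigmoid, the schema default).
default_cfg = DPOAlignmentConfig(beta=0.1)

# Explicitly selecting a different loss variant, e.g. IPO.
ipo_cfg = DPOAlignmentConfig(beta=0.1, loss_type=DPOLossType.ipo)
```

These field names mirror TRL's DPO configuration, where the corresponding arguments are likewise called `beta` and `loss_type`, which is the alignment this PR aims for.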