From 37875a1985c99f688afca1824c8b1590413c9b3a Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 17 Jul 2025 19:55:44 +0000
Subject: [PATCH] Fix DPOAlignmentConfig schema to use correct DPO parameters

- Replace incorrect PPO-like parameters (reward_scale, reward_clip, epsilon, gamma)
- Add proper DPO parameters: beta (KL coefficient) and loss_type
- Update spec to reflect the correct schema
---
 docs/_static/llama-stack-spec.yaml            | 22 +++++++++----------
 .../apis/post_training/post_training.py       |  6 ++---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 29ba9dede..16d5dd41a 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10111,20 +10111,20 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-        reward_clip:
-          type: number
-        epsilon:
-          type: number
-        gamma:
+        beta:
           type: number
+        loss_type:
+          type: string
+          enum:
+            - sigmoid
+            - hinge
+            - ipo
+            - kto_pair
+          default: sigmoid
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
+        - beta
+        - loss_type
       title: DPOAlignmentConfig
     DataConfig:
       type: object
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index b196c8a17..ce6448951 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -106,10 +106,8 @@ class RLHFAlgorithm(Enum):
 
 @json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    beta: float
+    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"
 
 
 @json_schema_type