diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 6a8945bd1..f9af10165 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -15078,22 +15078,6 @@
             "DPOAlignmentConfig": {
                 "type": "object",
                 "properties": {
-                    "reward_scale": {
-                        "type": "number",
-                        "description": "Scaling factor for the reward signal"
-                    },
-                    "reward_clip": {
-                        "type": "number",
-                        "description": "Maximum absolute value for reward clipping"
-                    },
-                    "epsilon": {
-                        "type": "number",
-                        "description": "Small value added for numerical stability"
-                    },
-                    "gamma": {
-                        "type": "number",
-                        "description": "Discount factor for future rewards"
-                    },
                     "beta": {
                         "type": "number",
                         "description": "Temperature parameter for the DPO loss"
@@ -15106,10 +15090,6 @@
                 },
                 "additionalProperties": false,
                 "required": [
-                    "reward_scale",
-                    "reward_clip",
-                    "epsilon",
-                    "gamma",
                     "beta",
                     "loss_type"
                 ],
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index f1bb40dc1..d2c41b2bf 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -11163,20 +11163,6 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-          description: Scaling factor for the reward signal
-        reward_clip:
-          type: number
-          description: >-
-            Maximum absolute value for reward clipping
-        epsilon:
-          type: number
-          description: >-
-            Small value added for numerical stability
-        gamma:
-          type: number
-          description: Discount factor for future rewards
         beta:
           type: number
           description: Temperature parameter for the DPO loss
@@ -11186,10 +11172,6 @@
           description: The type of loss function to use for DPO
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
        - beta
        - loss_type
      title: DPOAlignmentConfig
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index 9170cba51..c16221289 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -193,18 +193,10 @@ class DPOLossType(Enum):
 class DPOAlignmentConfig(BaseModel):
     """Configuration for Direct Preference Optimization (DPO) alignment.
 
-    :param reward_scale: Scaling factor for the reward signal
-    :param reward_clip: Maximum absolute value for reward clipping
-    :param epsilon: Small value added for numerical stability
-    :param gamma: Discount factor for future rewards
     :param beta: Temperature parameter for the DPO loss
     :param loss_type: The type of loss function to use for DPO
     """
 
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
     beta: float
     loss_type: DPOLossType = DPOLossType.sigmoid
 
diff --git a/tests/integration/post_training/test_post_training.py b/tests/integration/post_training/test_post_training.py
index 839b9b1f2..002da1160 100644
--- a/tests/integration/post_training/test_post_training.py
+++ b/tests/integration/post_training/test_post_training.py
@@ -195,10 +195,6 @@ class TestPostTraining:
         algorithm_config = DPOAlignmentConfig(
             beta=0.1,
             loss_type=DPOLossType.sigmoid,  # Default loss type
-            reward_scale=1.0,  # Scaling factor for reward signal (neutral scaling)
-            reward_clip=5.0,  # Maximum absolute value for reward clipping (prevents extreme values)
-            epsilon=1e-8,  # Small value for numerical stability
-            gamma=1.0,
         )
         data_config = DataConfig(
             dataset_id=dataset.identifier,
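
For reference, a minimal usage sketch of `DPOAlignmentConfig` after this change, assuming the public import path `llama_stack.apis.post_training` used elsewhere in the repo; only `beta` is required now, and `loss_type` defaults to `DPOLossType.sigmoid`:

```python
# Minimal sketch (import path assumed from llama_stack/apis/post_training/post_training.py).
from llama_stack.apis.post_training import DPOAlignmentConfig, DPOLossType

# After this change the config carries only the parameters DPO actually uses:
# beta (temperature of the DPO loss) and loss_type (defaults to sigmoid).
algorithm_config = DPOAlignmentConfig(
    beta=0.1,
    loss_type=DPOLossType.sigmoid,
)
```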