diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 6a8945bd1..f9af10165 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -15078,22 +15078,6 @@
"DPOAlignmentConfig": {
"type": "object",
"properties": {
- "reward_scale": {
- "type": "number",
- "description": "Scaling factor for the reward signal"
- },
- "reward_clip": {
- "type": "number",
- "description": "Maximum absolute value for reward clipping"
- },
- "epsilon": {
- "type": "number",
- "description": "Small value added for numerical stability"
- },
- "gamma": {
- "type": "number",
- "description": "Discount factor for future rewards"
- },
"beta": {
"type": "number",
"description": "Temperature parameter for the DPO loss"
@@ -15106,10 +15090,6 @@
},
"additionalProperties": false,
"required": [
- "reward_scale",
- "reward_clip",
- "epsilon",
- "gamma",
"beta",
"loss_type"
],
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index f1bb40dc1..d2c41b2bf 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -11163,20 +11163,6 @@ components:
DPOAlignmentConfig:
type: object
properties:
- reward_scale:
- type: number
- description: Scaling factor for the reward signal
- reward_clip:
- type: number
- description: >-
- Maximum absolute value for reward clipping
- epsilon:
- type: number
- description: >-
- Small value added for numerical stability
- gamma:
- type: number
- description: Discount factor for future rewards
beta:
type: number
description: Temperature parameter for the DPO loss
@@ -11186,10 +11172,6 @@ components:
description: The type of loss function to use for DPO
additionalProperties: false
required:
- - reward_scale
- - reward_clip
- - epsilon
- - gamma
- beta
- loss_type
title: DPOAlignmentConfig
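Under the trimmed schema above, a conforming DPOAlignmentConfig body carries only the two remaining fields. A minimal illustrative sketch (values are arbitrary; the enum is assumed to serialize as its lowercase member name, e.g. "sigmoid"):

# Illustrative payload fragment that validates against the updated schema:
# only "beta" and "loss_type" are accepted (additionalProperties is false),
# and both appear in the schema's "required" list.
dpo_alignment_config = {
    "beta": 0.1,             # temperature parameter for the DPO loss
    "loss_type": "sigmoid",  # assumed string form of DPOLossType.sigmoid
}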
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index 9170cba51..c16221289 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -193,18 +193,10 @@ class DPOLossType(Enum):
class DPOAlignmentConfig(BaseModel):
"""Configuration for Direct Preference Optimization (DPO) alignment.
- :param reward_scale: Scaling factor for the reward signal
- :param reward_clip: Maximum absolute value for reward clipping
- :param epsilon: Small value added for numerical stability
- :param gamma: Discount factor for future rewards
:param beta: Temperature parameter for the DPO loss
:param loss_type: The type of loss function to use for DPO
"""
- reward_scale: float
- reward_clip: float
- epsilon: float
- gamma: float
beta: float
loss_type: DPOLossType = DPOLossType.sigmoid
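With reward_scale, reward_clip, epsilon, and gamma removed, constructing the model reduces to the DPO-specific parameters. A minimal sketch of the post-change usage, assuming the classes are importable from the module shown above:

from llama_stack.apis.post_training.post_training import DPOAlignmentConfig, DPOLossType

# Only the DPO-specific parameters remain; loss_type falls back to the
# sigmoid default when omitted.
config = DPOAlignmentConfig(beta=0.1)
assert config.loss_type == DPOLossType.sigmoid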
diff --git a/tests/integration/post_training/test_post_training.py b/tests/integration/post_training/test_post_training.py
index 839b9b1f2..002da1160 100644
--- a/tests/integration/post_training/test_post_training.py
+++ b/tests/integration/post_training/test_post_training.py
@@ -195,10 +195,6 @@ class TestPostTraining:
algorithm_config = DPOAlignmentConfig(
beta=0.1,
loss_type=DPOLossType.sigmoid, # Default loss type
- reward_scale=1.0, # Scaling factor for reward signal (neutral scaling)
- reward_clip=5.0, # Maximum absolute value for reward clipping (prevents extreme values)
- epsilon=1e-8, # Small value for numerical stability
- gamma=1.0,
)
data_config = DataConfig(
dataset_id=dataset.identifier,
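For context on the remaining beta parameter: in the standard DPO formulation (Rafailov et al., 2023), beta scales the policy-versus-reference log-probability ratios inside the sigmoid loss, which is why it is described as a temperature. A background sketch only, not taken from this repository's provider code:

import torch
import torch.nn.functional as F

def dpo_sigmoid_loss(chosen_logratio: torch.Tensor, rejected_logratio: torch.Tensor, beta: float = 0.1) -> torch.Tensor:
    # chosen_logratio = log pi_theta(y_w|x) - log pi_ref(y_w|x); likewise for the rejected completion.
    # Larger beta sharpens the preference margin, smaller beta softens it.
    return -F.logsigmoid(beta * (chosen_logratio - rejected_logratio)).mean()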