Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-12 04:50:39 +00:00
fix: remove unused DPO parameters from schema and tests (#2988)
# What does this PR do?
I removed these DPO parameters from the schema in [this PR](https://github.com/meta-llama/llama-stack/pull/2804), but I may not have done it correctly, since they were reintroduced in commit cb7354a9ce (diff-4e9a8cb358213d6118c4b6ec2a76d0367af06441bf0717e13a775ade75e2061dR15081), likely due to a pre-commit hook.
I've made the changes again, and the pre-commit hook automatically
updated the spec sheet.
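
For quick reference, the trimmed `DPOAlignmentConfig` keeps only `beta` and `loss_type` (see the diffs below). A minimal construction sketch, assuming the class is importable from llama_stack's post-training API module (the import path is not shown in this diff):

```python
# Sketch only: the import path below is an assumption, not confirmed by this commit.
from llama_stack.apis.post_training import DPOAlignmentConfig, DPOLossType

# After this change, only the parameters the DPO implementation actually uses remain.
config = DPOAlignmentConfig(
    beta=0.1,                       # temperature parameter for the DPO loss
    loss_type=DPOLossType.sigmoid,  # default loss type
)
# reward_scale, reward_clip, epsilon, and gamma are no longer part of the schema.
```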
Parent: 5c33bc1353
Commit: 3a574ef23c
4 changed files with 0 additions and 50 deletions
docs/_static/llama-stack-spec.html (vendored): 20 deletions

@@ -15078,22 +15078,6 @@
       "DPOAlignmentConfig": {
         "type": "object",
         "properties": {
-          "reward_scale": {
-            "type": "number",
-            "description": "Scaling factor for the reward signal"
-          },
-          "reward_clip": {
-            "type": "number",
-            "description": "Maximum absolute value for reward clipping"
-          },
-          "epsilon": {
-            "type": "number",
-            "description": "Small value added for numerical stability"
-          },
-          "gamma": {
-            "type": "number",
-            "description": "Discount factor for future rewards"
-          },
           "beta": {
             "type": "number",
             "description": "Temperature parameter for the DPO loss"
@@ -15106,10 +15090,6 @@
         },
         "additionalProperties": false,
         "required": [
-          "reward_scale",
-          "reward_clip",
-          "epsilon",
-          "gamma",
           "beta",
           "loss_type"
         ],
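
Because `additionalProperties` is `false` on this schema, removing the four properties also means a request body that still sends them will fail validation against the updated spec. A quick sketch of checking a payload against the trimmed schema with the `jsonschema` package (the schema literal below is condensed from the context lines above; `loss_type` is simplified to a plain string here, while the real spec ties it to `DPOLossType`):

```python
# Illustrative check against a condensed copy of the trimmed DPOAlignmentConfig schema.
from jsonschema import ValidationError, validate

dpo_alignment_config_schema = {
    "type": "object",
    "properties": {
        "beta": {"type": "number"},
        "loss_type": {"type": "string"},  # simplified; the spec defines this via DPOLossType
    },
    "additionalProperties": False,
    "required": ["beta", "loss_type"],
}

# Only beta and loss_type are required (and allowed) now.
validate({"beta": 0.1, "loss_type": "sigmoid"}, dpo_alignment_config_schema)

# A payload still carrying one of the removed parameters is rejected,
# since additionalProperties is false.
try:
    validate({"beta": 0.1, "loss_type": "sigmoid", "gamma": 1.0}, dpo_alignment_config_schema)
except ValidationError as err:
    print(err.message)
```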
docs/_static/llama-stack-spec.yaml (vendored): 18 deletions

@@ -11163,20 +11163,6 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-          description: Scaling factor for the reward signal
-        reward_clip:
-          type: number
-          description: >-
-            Maximum absolute value for reward clipping
-        epsilon:
-          type: number
-          description: >-
-            Small value added for numerical stability
-        gamma:
-          type: number
-          description: Discount factor for future rewards
         beta:
           type: number
           description: Temperature parameter for the DPO loss
@@ -11186,10 +11172,6 @@ components:
           description: The type of loss function to use for DPO
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
         - beta
         - loss_type
       title: DPOAlignmentConfig
@@ -193,18 +193,10 @@ class DPOLossType(Enum):
 class DPOAlignmentConfig(BaseModel):
     """Configuration for Direct Preference Optimization (DPO) alignment.

-    :param reward_scale: Scaling factor for the reward signal
-    :param reward_clip: Maximum absolute value for reward clipping
-    :param epsilon: Small value added for numerical stability
-    :param gamma: Discount factor for future rewards
     :param beta: Temperature parameter for the DPO loss
     :param loss_type: The type of loss function to use for DPO
     """

-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
     beta: float
     loss_type: DPOLossType = DPOLossType.sigmoid

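
The description strings in both spec files match the `:param` lines in this docstring, which is consistent with the PR note that the pre-commit hook regenerates the spec from the Python definitions. A self-contained sketch of the model as it stands after this change (assuming Pydantic v2 and a string-valued enum; only the `sigmoid` member is visible in this diff):

```python
# Minimal standalone sketch mirroring the trimmed model above; the enum value and
# the Pydantic version are assumptions, not taken from this diff.
from enum import Enum

from pydantic import BaseModel


class DPOLossType(Enum):
    sigmoid = "sigmoid"  # other members exist in the real enum but are not shown here


class DPOAlignmentConfig(BaseModel):
    """Configuration for Direct Preference Optimization (DPO) alignment.

    :param beta: Temperature parameter for the DPO loss
    :param loss_type: The type of loss function to use for DPO
    """

    beta: float
    loss_type: DPOLossType = DPOLossType.sigmoid


# The two remaining fields are all the schema now requires.
print(DPOAlignmentConfig(beta=0.1).model_dump())
# {'beta': 0.1, 'loss_type': <DPOLossType.sigmoid: 'sigmoid'>}
```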
@@ -195,10 +195,6 @@ class TestPostTraining:
         algorithm_config = DPOAlignmentConfig(
             beta=0.1,
             loss_type=DPOLossType.sigmoid,  # Default loss type
-            reward_scale=1.0,  # Scaling factor for reward signal (neutral scaling)
-            reward_clip=5.0,  # Maximum absolute value for reward clipping (prevents extreme values)
-            epsilon=1e-8,  # Small value for numerical stability
-            gamma=1.0,
         )
         data_config = DataConfig(
             dataset_id=dataset.identifier,