fix: remove unused DPO parameters from schema and tests (#2988)

# What does this PR do?

I removed these DPO parameters from the schema in [this
PR](https://github.com/meta-llama/llama-stack/pull/2804), but I may not
have done it correctly, since they were reintroduced in [this
commit](https://github.com/meta-llama/llama-stack/commit/cb7354a9ce#diff-4e9a8cb358213d6118c4b6ec2a76d0367af06441bf0717e13a775ade75e2061dR15081)—likely
due to a pre-commit hook.

I've removed the parameters again, and the pre-commit hook automatically
regenerated the API spec files (JSON and YAML) to match.
This commit is contained in:
Nehanth Narendrula 2025-07-31 12:11:08 -04:00 committed by GitHub
parent 5c33bc1353
commit 3a574ef23c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 0 additions and 50 deletions

View file

@ -15078,22 +15078,6 @@
"DPOAlignmentConfig": {
"type": "object",
"properties": {
"reward_scale": {
"type": "number",
"description": "Scaling factor for the reward signal"
},
"reward_clip": {
"type": "number",
"description": "Maximum absolute value for reward clipping"
},
"epsilon": {
"type": "number",
"description": "Small value added for numerical stability"
},
"gamma": {
"type": "number",
"description": "Discount factor for future rewards"
},
"beta": {
"type": "number",
"description": "Temperature parameter for the DPO loss"
@ -15106,10 +15090,6 @@
},
"additionalProperties": false,
"required": [
"reward_scale",
"reward_clip",
"epsilon",
"gamma",
"beta",
"loss_type"
],

View file

@ -11163,20 +11163,6 @@ components:
DPOAlignmentConfig:
type: object
properties:
reward_scale:
type: number
description: Scaling factor for the reward signal
reward_clip:
type: number
description: >-
Maximum absolute value for reward clipping
epsilon:
type: number
description: >-
Small value added for numerical stability
gamma:
type: number
description: Discount factor for future rewards
beta:
type: number
description: Temperature parameter for the DPO loss
@ -11186,10 +11172,6 @@ components:
description: The type of loss function to use for DPO
additionalProperties: false
required:
- reward_scale
- reward_clip
- epsilon
- gamma
- beta
- loss_type
title: DPOAlignmentConfig

View file

@ -193,18 +193,10 @@ class DPOLossType(Enum):
class DPOAlignmentConfig(BaseModel):
"""Configuration for Direct Preference Optimization (DPO) alignment.
:param reward_scale: Scaling factor for the reward signal
:param reward_clip: Maximum absolute value for reward clipping
:param epsilon: Small value added for numerical stability
:param gamma: Discount factor for future rewards
:param beta: Temperature parameter for the DPO loss
:param loss_type: The type of loss function to use for DPO
"""
reward_scale: float
reward_clip: float
epsilon: float
gamma: float
beta: float
loss_type: DPOLossType = DPOLossType.sigmoid

View file

@ -195,10 +195,6 @@ class TestPostTraining:
algorithm_config = DPOAlignmentConfig(
beta=0.1,
loss_type=DPOLossType.sigmoid, # Default loss type
reward_scale=1.0, # Scaling factor for reward signal (neutral scaling)
reward_clip=5.0, # Maximum absolute value for reward clipping (prevents extreme values)
epsilon=1e-8, # Small value for numerical stability
gamma=1.0,
)
data_config = DataConfig(
dataset_id=dataset.identifier,