Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-12 04:50:39 +00:00)
fix: remove unused DPO parameters from schema and tests (#2988)
# What does this PR do?
I removed these unused DPO parameters from the schema in [this
PR](https://github.com/meta-llama/llama-stack/pull/2804), but the removal
did not stick: they were reintroduced in commit cb7354a9ce, most likely by a
pre-commit hook that regenerated the spec. I've made the change again, and
this time the pre-commit hook regenerated the spec files to match.
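
With this change, `DPOAlignmentConfig` carries only the two fields the DPO implementation actually consumes. A minimal sketch of constructing it, mirroring the updated test below; the import path is assumed from the repo's post-training API layout and may differ by version:

```python
# Hedged sketch: import path assumed, not verified against this revision
from llama_stack.apis.post_training import DPOAlignmentConfig, DPOLossType

algorithm_config = DPOAlignmentConfig(
    beta=0.1,                       # temperature parameter for the DPO loss
    loss_type=DPOLossType.sigmoid,  # default loss type
)
```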
This commit is contained in:
Parent: 5c33bc1353
Commit: 3a574ef23c
4 changed files with 0 additions and 50 deletions
docs/_static/llama-stack-spec.html (vendored): 20 deletions
@@ -15078,22 +15078,6 @@
       "DPOAlignmentConfig": {
         "type": "object",
         "properties": {
-          "reward_scale": {
-            "type": "number",
-            "description": "Scaling factor for the reward signal"
-          },
-          "reward_clip": {
-            "type": "number",
-            "description": "Maximum absolute value for reward clipping"
-          },
-          "epsilon": {
-            "type": "number",
-            "description": "Small value added for numerical stability"
-          },
-          "gamma": {
-            "type": "number",
-            "description": "Discount factor for future rewards"
-          },
           "beta": {
             "type": "number",
             "description": "Temperature parameter for the DPO loss"
@@ -15106,10 +15090,6 @@
           },
           "additionalProperties": false,
           "required": [
-            "reward_scale",
-            "reward_clip",
-            "epsilon",
-            "gamma",
             "beta",
             "loss_type"
           ],
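For anyone consuming the generated spec, here is a minimal sketch (assuming the `jsonschema` package) of validating a payload against the trimmed schema. The fragment is hand-copied from the fields that remain and is illustrative, not the full generated document:

```python
from jsonschema import ValidationError, validate

# Illustrative fragment of the trimmed DPOAlignmentConfig schema
DPO_ALIGNMENT_CONFIG = {
    "type": "object",
    "properties": {
        "beta": {"type": "number", "description": "Temperature parameter for the DPO loss"},
        "loss_type": {"description": "The type of loss function to use for DPO"},
    },
    "additionalProperties": False,
    "required": ["beta", "loss_type"],
}

validate({"beta": 0.1, "loss_type": "sigmoid"}, DPO_ALIGNMENT_CONFIG)  # passes

try:
    # reward_scale is gone, so "additionalProperties": false now rejects it
    validate(
        {"beta": 0.1, "loss_type": "sigmoid", "reward_scale": 1.0},
        DPO_ALIGNMENT_CONFIG,
    )
except ValidationError as err:
    print(err.message)
```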
docs/_static/llama-stack-spec.yaml (vendored): 18 deletions
@@ -11163,20 +11163,6 @@ components:
     DPOAlignmentConfig:
       type: object
       properties:
-        reward_scale:
-          type: number
-          description: Scaling factor for the reward signal
-        reward_clip:
-          type: number
-          description: >-
-            Maximum absolute value for reward clipping
-        epsilon:
-          type: number
-          description: >-
-            Small value added for numerical stability
-        gamma:
-          type: number
-          description: Discount factor for future rewards
         beta:
           type: number
           description: Temperature parameter for the DPO loss
@@ -11186,10 +11172,6 @@ components:
           description: The type of loss function to use for DPO
       additionalProperties: false
       required:
-        - reward_scale
-        - reward_clip
-        - epsilon
-        - gamma
         - beta
         - loss_type
       title: DPOAlignmentConfig
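A quick way to confirm the regenerated YAML stays trimmed, sketched here under the assumption that the spec follows the standard OpenAPI `components.schemas` layout, that PyYAML is installed, and that the script runs from a repo checkout:

```python
import yaml

with open("docs/_static/llama-stack-spec.yaml") as f:
    spec = yaml.safe_load(f)

props = spec["components"]["schemas"]["DPOAlignmentConfig"]["properties"]
removed = {"reward_scale", "reward_clip", "epsilon", "gamma"}
assert removed.isdisjoint(props), f"stale keys: {removed & set(props)}"
assert {"beta", "loss_type"} <= set(props)
```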
@@ -193,18 +193,10 @@ class DPOLossType(Enum):
 class DPOAlignmentConfig(BaseModel):
     """Configuration for Direct Preference Optimization (DPO) alignment.
 
-    :param reward_scale: Scaling factor for the reward signal
-    :param reward_clip: Maximum absolute value for reward clipping
-    :param epsilon: Small value added for numerical stability
-    :param gamma: Discount factor for future rewards
     :param beta: Temperature parameter for the DPO loss
     :param loss_type: The type of loss function to use for DPO
     """
 
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
     beta: float
     loss_type: DPOLossType = DPOLossType.sigmoid
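For readers without a checkout, a self-contained replica of the post-change model; the enum is pared down to the single member visible in the diff, so other loss types are omitted and the `"sigmoid"` value string is an assumption:

```python
from enum import Enum

from pydantic import BaseModel


class DPOLossType(Enum):
    sigmoid = "sigmoid"  # value string assumed; other members omitted


class DPOAlignmentConfig(BaseModel):
    """Configuration for Direct Preference Optimization (DPO) alignment.

    :param beta: Temperature parameter for the DPO loss
    :param loss_type: The type of loss function to use for DPO
    """

    beta: float
    loss_type: DPOLossType = DPOLossType.sigmoid


print(DPOAlignmentConfig(beta=0.1).model_dump())
# {'beta': 0.1, 'loss_type': <DPOLossType.sigmoid: 'sigmoid'>}
```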
@@ -195,10 +195,6 @@ class TestPostTraining:
         algorithm_config = DPOAlignmentConfig(
             beta=0.1,
             loss_type=DPOLossType.sigmoid,  # Default loss type
-            reward_scale=1.0,  # Scaling factor for reward signal (neutral scaling)
-            reward_clip=5.0,  # Maximum absolute value for reward clipping (prevents extreme values)
-            epsilon=1e-8,  # Small value for numerical stability
-            gamma=1.0,
         )
         data_config = DataConfig(
             dataset_id=dataset.identifier,
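If pinning the remaining defaults is useful, a small follow-up test along these lines (the test name is hypothetical) would do it:

```python
def test_dpo_alignment_config_defaults():
    # beta is the only required field now; loss_type falls back to sigmoid
    cfg = DPOAlignmentConfig(beta=0.1)
    assert cfg.loss_type is DPOLossType.sigmoid
```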