mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-22 22:39:41 +00:00
Fix DPOAlignmentConfig schema to use correct DPO parameters
- Replace incorrect PPO-like parameters (reward_scale, reward_clip, epsilon, gamma)
- Add proper DPO parameters: beta (KL coefficient) and loss_type
- Update spec to reflect the correct schema
This commit is contained in:
parent
477bcd4d09
commit
37875a1985
2 changed files with 13 additions and 15 deletions
22
docs/_static/llama-stack-spec.yaml
vendored
22
docs/_static/llama-stack-spec.yaml
vendored
|
|
@@ -10111,20 +10111,20 @@ components:
|
||||||
DPOAlignmentConfig:
|
DPOAlignmentConfig:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
reward_scale:
|
beta:
|
||||||
type: number
|
|
||||||
reward_clip:
|
|
||||||
type: number
|
|
||||||
epsilon:
|
|
||||||
type: number
|
|
||||||
gamma:
|
|
||||||
type: number
|
type: number
|
||||||
|
loss_type:
|
||||||
|
type: string
|
||||||
|
enum:
|
||||||
|
- sigmoid
|
||||||
|
- hinge
|
||||||
|
- ipo
|
||||||
|
- kto_pair
|
||||||
|
default: sigmoid
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- reward_scale
|
- beta
|
||||||
- reward_clip
|
- loss_type
|
||||||
- epsilon
|
|
||||||
- gamma
|
|
||||||
title: DPOAlignmentConfig
|
title: DPOAlignmentConfig
|
||||||
DataConfig:
|
DataConfig:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@@ -106,10 +106,8 @@ class RLHFAlgorithm(Enum):
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class DPOAlignmentConfig(BaseModel):
|
class DPOAlignmentConfig(BaseModel):
|
||||||
reward_scale: float
|
beta: float
|
||||||
reward_clip: float
|
loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"
|
||||||
epsilon: float
|
|
||||||
gamma: float
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue