mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-16 09:58:10 +00:00
updates to synth data apis
This commit is contained in:
parent
c9a75c4628
commit
157e5ddf2e
2 changed files with 123 additions and 75 deletions
58
simple_view/synthetic_data_generation.yml
Normal file
58
simple_view/synthetic_data_generation.yml
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
# Synthetic Data Generation API
|
||||||
|
== Schema ==
|
||||||
|
|
||||||
|
FilteringFunction:
|
||||||
|
name: str
|
||||||
|
params: json
|
||||||
|
|
||||||
|
SyntheticDataPoint:
|
||||||
|
custom_id: str
|
||||||
|
index: int
|
||||||
|
prompt: List[Message]
|
||||||
|
response: Message
|
||||||
|
logprob: float
|
||||||
|
score: float
|
||||||
|
|
||||||
|
SyntheticDataGenerationJob:
|
||||||
|
job_id: str # id provided by the api
|
||||||
|
created: string # format - date-time
|
||||||
|
status: string # enum (validating, running, completed, failed)
|
||||||
|
input_file_path: Path # jsonl style file where each row contains custom_id and message_list
|
||||||
|
success_file_path: Path # jsonl each line is SyntheticDataPoint
|
||||||
|
error_file_path: Path # file listing the custom_ids that failed, each with some error info
|
||||||
|
metadata: json
|
||||||
|
|
||||||
|
== Callsites ==
|
||||||
|
|
||||||
|
callsite:
|
||||||
|
/synthetic_data_gen/submit_job
|
||||||
|
request_type:
|
||||||
|
post
|
||||||
|
description:
|
||||||
|
Submit a job to generate synthetic data using llm + reward model scoring + filtering
|
||||||
|
request:
|
||||||
|
# batch inference params
|
||||||
|
model: str
|
||||||
|
prompt_file_path: Path # jsonl style file where each line is a json encoded List[Message] + custom_id
|
||||||
|
options: Options
|
||||||
|
num_generations: int
|
||||||
|
# reward model scoring params
|
||||||
|
reward_model: str
|
||||||
|
scoring_function: ScoringFunction
|
||||||
|
# filtering params
|
||||||
|
filtering_function: FilteringFunction
|
||||||
|
metadata: json
|
||||||
|
|
||||||
|
response:
|
||||||
|
synth_data_gen_job: SyntheticDataGenerationJob
|
||||||
|
|
||||||
|
callsite:
|
||||||
|
/synthetic_data_gen/job_status
|
||||||
|
request_type:
|
||||||
|
get
|
||||||
|
description:
|
||||||
|
Get status for an already submitted job
|
||||||
|
request:
|
||||||
|
job_id: str # unique identifier for the job
|
||||||
|
response:
|
||||||
|
synth_data_gen_job: SyntheticDataGenerationJob
|
|
@ -1,12 +1,12 @@
|
||||||
openapi: 3.0.0
|
openapi: 3.0.0
|
||||||
info:
|
info:
|
||||||
title: API for Synthetic Data Generation. This combines other services like batch inference and reward model scoring.
|
title: Synthetic Data Generation API
|
||||||
version: 0.0.1
|
version: 0.0.1
|
||||||
paths:
|
paths:
|
||||||
/synthetic_data_generation/submit_job:
|
/synthetic_data_gen/submit_job:
|
||||||
post:
|
post:
|
||||||
summary: Submit a job for synthetic data generation.
|
summary: Submit a job to generate synthetic data
|
||||||
description: Batch Inference > Reward Scoring > Filtering > Response
|
description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
|
||||||
requestBody:
|
requestBody:
|
||||||
required: true
|
required: true
|
||||||
content:
|
content:
|
||||||
|
@ -14,69 +14,47 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
# batch inference params
|
|
||||||
model:
|
model:
|
||||||
type: string
|
type: string
|
||||||
description: Model identifier for batch inference.
|
description: Model used for batch inference
|
||||||
prompts_path:
|
prompt_file_path:
|
||||||
type: string
|
type: string
|
||||||
description: Path to prompts, JSONL for batch inference
|
format: path
|
||||||
batch_size:
|
description: Path to the JSONL file containing message_lists and custom IDs
|
||||||
type: integer
|
options:
|
||||||
description: Number of prompts to process in each batch.
|
$ref: '#/components/schemas/Options'
|
||||||
# TODO: May-be put all these generation related params in a struct
|
|
||||||
temperature:
|
|
||||||
type: number
|
|
||||||
format: float
|
|
||||||
description: Temperature parameter for generation.
|
|
||||||
top_p:
|
|
||||||
type: number
|
|
||||||
format: float
|
|
||||||
description: Top-p parameter for generation.
|
|
||||||
max_gen_len:
|
|
||||||
type: integer
|
|
||||||
description: Maximum length of generated responses.
|
|
||||||
num_generations:
|
num_generations:
|
||||||
type: integer
|
type: integer
|
||||||
description: Number of generations per prompt.
|
description: Number of generations to produce
|
||||||
# reward model scoring params
|
|
||||||
reward_model:
|
reward_model:
|
||||||
type: string
|
type: string
|
||||||
description: Identifier for the reward model used for scoring.
|
description: Model used for scoring
|
||||||
scoring_function:
|
scoring_function:
|
||||||
type: string
|
$ref: '#/components/schemas/ScoringFunction'
|
||||||
description: Scoring function to apply.
|
|
||||||
# params for filtering responses
|
|
||||||
# filtering function will have a signature as
|
|
||||||
# def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
|
|
||||||
filtering_function:
|
filtering_function:
|
||||||
|
$ref: '#/components/schemas/FilteringFunction'
|
||||||
|
metadata:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
additionalProperties: true
|
||||||
name:
|
description: Additional metadata for the job
|
||||||
type: string
|
|
||||||
description: Name of the filtering function, can be a simple threshold or a pre-registered function.
|
|
||||||
params:
|
|
||||||
type: object
|
|
||||||
additionalProperties: true
|
|
||||||
description: JSON object containing parameters for the filtering function.
|
|
||||||
responses:
|
responses:
|
||||||
'200':
|
'200':
|
||||||
description: Job successfully created and processing.
|
description: Job successfully submitted
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/SyntheticDataGeneration'
|
$ref: '#/components/schemas/SyntheticDataGenerationJob'
|
||||||
|
|
||||||
/synthetic_data_generation/job_status:
|
/synthetic_data_gen/job_status:
|
||||||
get:
|
get:
|
||||||
summary: Get the status of a submitted job
|
summary: Get job status
|
||||||
description: Get the status of a submitted job
|
description: Get status for an already submitted job
|
||||||
parameters:
|
parameters:
|
||||||
- in: query
|
- in: query
|
||||||
name: job_id
|
name: job_id
|
||||||
required: true
|
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
|
required: true
|
||||||
description: Unique identifier for the job
|
description: Unique identifier for the job
|
||||||
responses:
|
responses:
|
||||||
'200':
|
'200':
|
||||||
|
@ -84,58 +62,70 @@ paths:
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/SyntheticDataGeneration'
|
$ref: '#/components/schemas/SyntheticDataGenerationJob'
|
||||||
'400':
|
|
||||||
description: Invalid job ID provided
|
|
||||||
'404':
|
|
||||||
description: Job not found
|
|
||||||
|
|
||||||
components:
|
components:
|
||||||
schemas:
|
schemas:
|
||||||
PromptResponseScore:
|
FilteringFunction:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
id:
|
name:
|
||||||
type: string
|
type: string
|
||||||
description: Carry forwarded from the user provided id from prompt.
|
description: Name of the filtering function
|
||||||
|
params:
|
||||||
|
type: object
|
||||||
|
additionalProperties: true
|
||||||
|
description: JSON object containing parameters for the filtering function
|
||||||
|
SyntheticDataPoint:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
custom_id:
|
||||||
|
type: string
|
||||||
|
description: Custom identifier for the data point
|
||||||
index:
|
index:
|
||||||
type: integer
|
type: integer
|
||||||
description: Index of the generation.
|
description: Index of the data point
|
||||||
prompt:
|
prompt:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
$ref: '#/components/schemas/Message'
|
$ref: '#/components/schemas/Message'
|
||||||
|
description: List of messages used as prompt
|
||||||
response:
|
response:
|
||||||
$ref: '#/components/schemas/Completion'
|
$ref: '#/components/schemas/Message'
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
format: float
|
||||||
|
description: Log probability of the response
|
||||||
score:
|
score:
|
||||||
type: number
|
type: number
|
||||||
format: float
|
format: float
|
||||||
description: Final score after filtering.
|
description: Score of the response based on the reward model
|
||||||
raw_score:
|
SyntheticDataGenerationJob:
|
||||||
type: number
|
|
||||||
format: float
|
|
||||||
description: Raw score from the reward model.
|
|
||||||
SyntheticDataGeneration:
|
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
job_id:
|
job_id:
|
||||||
type: string
|
type: string
|
||||||
description: Unique identifier for the job.
|
description: ID provided by the API
|
||||||
created:
|
created:
|
||||||
type: string
|
type: string
|
||||||
format: date-time
|
format: date-time
|
||||||
description: Timestamp when the job was created.
|
description: Timestamp when the job was created
|
||||||
status:
|
status:
|
||||||
type: string
|
type: string
|
||||||
description: Current status of the job, can indicate the stage or success/failure.
|
enum: [validating, running, completed, failed]
|
||||||
output_file_path:
|
description: Current status of the job
|
||||||
|
input_file_path:
|
||||||
type: string
|
type: string
|
||||||
description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
|
format: path
|
||||||
Message:
|
description: Path to the input JSONL file
|
||||||
type: object
|
success_file_path:
|
||||||
properties:
|
type: string
|
||||||
# As Defined in /batch_inference
|
format: path
|
||||||
Completion:
|
description: Path to the JSONL file containing successful results
|
||||||
type: object
|
error_file_path:
|
||||||
properties:
|
type: string
|
||||||
# As Defined in /batch_inference
|
format: path
|
||||||
|
description: Path to the JSONL file containing errors
|
||||||
|
metadata:
|
||||||
|
type: object
|
||||||
|
additionalProperties: true
|
||||||
|
description: Additional metadata about the job
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue