mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-16 09:58:10 +00:00
updates to synth data apis
This commit is contained in:
parent
c9a75c4628
commit
157e5ddf2e
2 changed files with 123 additions and 75 deletions
58
simple_view/synthetic_data_generation.yml
Normal file
58
simple_view/synthetic_data_generation.yml
Normal file
|
@ -0,0 +1,58 @@
|
|||
# Synthetic Data Generation API
|
||||
== Schema ==
|
||||
|
||||
FilteringFunction:
|
||||
name: str
|
||||
params: json
|
||||
|
||||
SyntheticDataPoint:
|
||||
custom_id: str
|
||||
index: int
|
||||
prompt: List[Message]
|
||||
response: Message
|
||||
logprob: float
|
||||
score: float
|
||||
|
||||
SyntheticDataGenerationJob:
|
||||
job_id: str # id provided by the api
|
||||
created: string # format - date-time
|
||||
status: string # enum (validating, running, completed, failed)
|
||||
input_file_path: Path # jsonl style file where each row contains custom_id and message_list
|
||||
success_file_path: Path # jsonl file where each line is a SyntheticDataPoint
|
||||
error_file_path: Path # jsonl file listing the custom_ids that failed, with error info
|
||||
metadata: json
|
||||
|
||||
== Callsites ==
|
||||
|
||||
callsite:
|
||||
/synthetic_data_gen/submit_job
|
||||
request_type:
|
||||
post
|
||||
description:
|
||||
Submit a job to generate synthetic data using llm + reward model scoring + filtering
|
||||
request:
|
||||
# batch inference params
|
||||
model: str
|
||||
prompt_file_path: Path # jsonl style file where each line is a json encoded List[Message] + custom_id
|
||||
options: Options
|
||||
num_generations: int
|
||||
# reward model scoring params
|
||||
reward_model: str
|
||||
scoring_function: ScoringFunction
|
||||
# filtering params
|
||||
filtering_function: FilteringFunction
|
||||
metadata: json
|
||||
|
||||
response:
|
||||
synth_data_gen_job: SyntheticDataGenerationJob
|
||||
|
||||
callsite:
|
||||
/synthetic_data_gen/job_status
|
||||
request_type:
|
||||
get
|
||||
description:
|
||||
Get status for an already submitted job
|
||||
request:
|
||||
job_id: str # unique identifier for the job
|
||||
response:
|
||||
synth_data_gen_job: SyntheticDataGenerationJob
|
|
@ -1,12 +1,12 @@
|
|||
openapi: 3.0.0
|
||||
info:
|
||||
title: API for Synthetic Data Generation. This combines other services like batch inference and reward model scoring.
|
||||
title: Synthetic Data Generation API
|
||||
version: 0.0.1
|
||||
paths:
|
||||
/synthetic_data_generation/submit_job:
|
||||
/synthetic_data_gen/submit_job:
|
||||
post:
|
||||
summary: Submit a job for synthetic data generation.
|
||||
description: Batch Inference > Reward Scoring > Filtering > Response
|
||||
summary: Submit a job to generate synthetic data
|
||||
description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
|
@ -14,69 +14,47 @@ paths:
|
|||
schema:
|
||||
type: object
|
||||
properties:
|
||||
# batch inference params
|
||||
model:
|
||||
type: string
|
||||
description: Model identifier for batch inference.
|
||||
prompts_path:
|
||||
description: Model used for batch inference
|
||||
prompt_file_path:
|
||||
type: string
|
||||
description: Path to prompts, JSONL for batch inference
|
||||
batch_size:
|
||||
type: integer
|
||||
description: Number of prompts to process in each batch.
|
||||
# TODO: Maybe put all these generation-related params in a struct
|
||||
temperature:
|
||||
type: number
|
||||
format: float
|
||||
description: Temperature parameter for generation.
|
||||
top_p:
|
||||
type: number
|
||||
format: float
|
||||
description: Top-p parameter for generation.
|
||||
max_gen_len:
|
||||
type: integer
|
||||
description: Maximum length of generated responses.
|
||||
format: path
|
||||
description: Path to the JSONL file containing message_lists and custom IDs
|
||||
options:
|
||||
$ref: '#/components/schemas/Options'
|
||||
num_generations:
|
||||
type: integer
|
||||
description: Number of generations per prompt.
|
||||
# reward model scoring params
|
||||
description: Number of generations to produce
|
||||
reward_model:
|
||||
type: string
|
||||
description: Identifier for the reward model used for scoring.
|
||||
description: Model used for scoring
|
||||
scoring_function:
|
||||
type: string
|
||||
description: Scoring function to apply.
|
||||
# params for filtering responses
|
||||
# filtering function will have a signature as
|
||||
# def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
|
||||
$ref: '#/components/schemas/ScoringFunction'
|
||||
filtering_function:
|
||||
$ref: '#/components/schemas/FilteringFunction'
|
||||
metadata:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
description: Name of the filtering function, can be a simple threshold or a pre-registered function.
|
||||
params:
|
||||
type: object
|
||||
additionalProperties: true
|
||||
description: JSON object containing parameters for the filtering function.
|
||||
additionalProperties: true
|
||||
description: Additional metadata for the job
|
||||
responses:
|
||||
'200':
|
||||
description: Job successfully created and processing.
|
||||
description: Job successfully submitted
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/SyntheticDataGeneration'
|
||||
$ref: '#/components/schemas/SyntheticDataGenerationJob'
|
||||
|
||||
/synthetic_data_generation/job_status:
|
||||
/synthetic_data_gen/job_status:
|
||||
get:
|
||||
summary: Get the status of a submitted job
|
||||
description: Get the status of a submitted job
|
||||
summary: Get job status
|
||||
description: Get status for an already submitted job
|
||||
parameters:
|
||||
- in: query
|
||||
name: job_id
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
required: true
|
||||
description: Unique identifier for the job
|
||||
responses:
|
||||
'200':
|
||||
|
@ -84,58 +62,70 @@ paths:
|
|||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/SyntheticDataGeneration'
|
||||
'400':
|
||||
description: Invalid job ID provided
|
||||
'404':
|
||||
description: Job not found
|
||||
|
||||
$ref: '#/components/schemas/SyntheticDataGenerationJob'
|
||||
components:
|
||||
schemas:
|
||||
PromptResponseScore:
|
||||
FilteringFunction:
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
name:
|
||||
type: string
|
||||
description: Carried forward from the user-provided id in the prompt.
|
||||
description: Name of the filtering function
|
||||
params:
|
||||
type: object
|
||||
additionalProperties: true
|
||||
description: JSON object containing parameters for the filtering function
|
||||
SyntheticDataPoint:
|
||||
type: object
|
||||
properties:
|
||||
custom_id:
|
||||
type: string
|
||||
description: Custom identifier for the data point
|
||||
index:
|
||||
type: integer
|
||||
description: Index of the generation.
|
||||
description: Index of the data point
|
||||
prompt:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/Message'
|
||||
description: List of messages used as prompt
|
||||
response:
|
||||
$ref: '#/components/schemas/Completion'
|
||||
$ref: '#/components/schemas/Message'
|
||||
logprob:
|
||||
type: number
|
||||
format: float
|
||||
description: Log probability of the response
|
||||
score:
|
||||
type: number
|
||||
format: float
|
||||
description: Final score after filtering.
|
||||
raw_score:
|
||||
type: number
|
||||
format: float
|
||||
description: Raw score from the reward model.
|
||||
SyntheticDataGeneration:
|
||||
description: Score of the response based on the reward model
|
||||
SyntheticDataGenerationJob:
|
||||
type: object
|
||||
properties:
|
||||
job_id:
|
||||
type: string
|
||||
description: Unique identifier for the job.
|
||||
description: ID provided by the API
|
||||
created:
|
||||
type: string
|
||||
format: date-time
|
||||
description: Timestamp when the job was created.
|
||||
description: Timestamp when the job was created
|
||||
status:
|
||||
type: string
|
||||
description: Current status of the job, can indicate the stage or success/failure.
|
||||
output_file_path:
|
||||
enum: [validating, running, completed, failed]
|
||||
description: Current status of the job
|
||||
input_file_path:
|
||||
type: string
|
||||
description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
|
||||
Message:
|
||||
type: object
|
||||
properties:
|
||||
# As Defined in /batch_inference
|
||||
Completion:
|
||||
type: object
|
||||
properties:
|
||||
# As Defined in /batch_inference
|
||||
format: path
|
||||
description: Path to the input JSONL file
|
||||
success_file_path:
|
||||
type: string
|
||||
format: path
|
||||
description: Path to the JSONL file containing successful results
|
||||
error_file_path:
|
||||
type: string
|
||||
format: path
|
||||
description: Path to the JSONL file containing errors
|
||||
metadata:
|
||||
type: object
|
||||
additionalProperties: true
|
||||
description: Additional metadata about the job
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue