updates to synth data apis

This commit is contained in:
Hardik Shah 2024-06-26 16:48:52 -07:00
parent c9a75c4628
commit 157e5ddf2e
2 changed files with 123 additions and 75 deletions

View file

@ -0,0 +1,58 @@
# Synthetic Data Generation API
== Schema ==
FilteringFunction:
name: str
params: json
SyntheticDataPoint:
custom_id: str
index: int
prompt: List[Message]
response: Message
logprob: float
score: float
SyntheticDataGenerationJob:
job_id: str # id provided by the api
created: str # format - date-time
status: str # enum (validating, running, completed, failed)
input_file_path: Path # jsonl style file where each row contains custom_id and message_list
success_file_path: Path # jsonl each line is SyntheticDataPoint
error_file_path: Path # jsonl file listing the custom_ids that failed, with error info for each
metadata: json
== Callsites ==
callsite:
/synthetic_data_gen/submit_job
request_type:
post
description:
Submit a job to generate synthetic data using llm + reward model scoring + filtering
request:
# batch inference params
model: str
prompt_file_path: Path # jsonl style file where each line is a json encoded List[Message] + custom_id
options: Options
num_generations: int
# reward model scoring params
reward_model: str
scoring_function: ScoringFunction
# filtering params
filtering_function: FilteringFunction
metadata: json
response:
synth_data_gen_job: SyntheticDataGenerationJob
callsite:
/synthetic_data_gen/job_status
request_type:
get
description:
Get status for an already submitted job
request:
job_id: str # unique identifier for the job
response:
synth_data_gen_job: SyntheticDataGenerationJob

View file

@ -1,12 +1,12 @@
openapi: 3.0.0
info:
title: API for Synthetic Data Generation. This combines other services like batch inference and reward model scoring.
title: Synthetic Data Generation API
version: 0.0.1
paths:
/synthetic_data_generation/submit_job:
/synthetic_data_gen/submit_job:
post:
summary: Submit a job for synthetic data generation.
description: Batch Inference > Reward Scoring > Filtering > Response
summary: Submit a job to generate synthetic data
description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
requestBody:
required: true
content:
@ -14,69 +14,47 @@ paths:
schema:
type: object
properties:
# batch inference params
model:
type: string
description: Model identifier for batch inference.
prompts_path:
description: Model used for batch inference
prompt_file_path:
type: string
description: Path to prompts, JSONL for batch inference
batch_size:
type: integer
description: Number of prompts to process in each batch.
# TODO: Maybe put all these generation-related params in a struct
temperature:
type: number
format: float
description: Temperature parameter for generation.
top_p:
type: number
format: float
description: Top-p parameter for generation.
max_gen_len:
type: integer
description: Maximum length of generated responses.
format: path
description: Path to the JSONL file containing message_lists and custom IDs
options:
$ref: '#/components/schemas/Options'
num_generations:
type: integer
description: Number of generations per prompt.
# reward model scoring params
description: Number of generations to produce
reward_model:
type: string
description: Identifier for the reward model used for scoring.
description: Model used for scoring
scoring_function:
type: string
description: Scoring function to apply.
# params for filtering responses
# filtering function will have a signature as
# def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
$ref: '#/components/schemas/ScoringFunction'
filtering_function:
type: object
properties:
name:
type: string
description: Name of the filtering function, can be a simple threshold or a pre-registered function.
params:
$ref: '#/components/schemas/FilteringFunction'
metadata:
type: object
additionalProperties: true
description: JSON object containing parameters for the filtering function.
description: Additional metadata for the job
responses:
'200':
description: Job successfully created and processing.
description: Job successfully submitted
content:
application/json:
schema:
$ref: '#/components/schemas/SyntheticDataGeneration'
$ref: '#/components/schemas/SyntheticDataGenerationJob'
/synthetic_data_generation/job_status:
/synthetic_data_gen/job_status:
get:
summary: Get the status of a submitted job
description: Get the status of a submitted job
summary: Get job status
description: Get status for an already submitted job
parameters:
- in: query
name: job_id
required: true
schema:
type: string
required: true
description: Unique identifier for the job
responses:
'200':
@ -84,58 +62,70 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/SyntheticDataGeneration'
'400':
description: Invalid job ID provided
'404':
description: Job not found
$ref: '#/components/schemas/SyntheticDataGenerationJob'
components:
schemas:
PromptResponseScore:
FilteringFunction:
type: object
properties:
id:
name:
type: string
description: Carried forward from the user-provided id in the prompt.
description: Name of the filtering function
params:
type: object
additionalProperties: true
description: JSON object containing parameters for the filtering function
SyntheticDataPoint:
type: object
properties:
custom_id:
type: string
description: Custom identifier for the data point
index:
type: integer
description: Index of the generation.
description: Index of the data point
prompt:
type: array
items:
$ref: '#/components/schemas/Message'
description: List of messages used as prompt
response:
$ref: '#/components/schemas/Completion'
$ref: '#/components/schemas/Message'
logprob:
type: number
format: float
description: Log probability of the response
score:
type: number
format: float
description: Final score after filtering.
raw_score:
type: number
format: float
description: Raw score from the reward model.
SyntheticDataGeneration:
description: Score of the response based on the reward model
SyntheticDataGenerationJob:
type: object
properties:
job_id:
type: string
description: Unique identifier for the job.
description: ID provided by the API
created:
type: string
format: date-time
description: Timestamp when the job was created.
description: Timestamp when the job was created
status:
type: string
description: Current status of the job, can indicate the stage or success/failure.
output_file_path:
enum: [validating, running, completed, failed]
description: Current status of the job
input_file_path:
type: string
description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
Message:
format: path
description: Path to the input JSONL file
success_file_path:
type: string
format: path
description: Path to the JSONL file containing successful results
error_file_path:
type: string
format: path
description: Path to the JSONL file containing errors
metadata:
type: object
properties:
# As Defined in /batch_inference
Completion:
type: object
properties:
# As Defined in /batch_inference
additionalProperties: true
description: Additional metadata about the job