updates to synth data apis

This commit is contained in:
Hardik Shah 2024-06-26 16:48:52 -07:00
parent c9a75c4628
commit 157e5ddf2e
2 changed files with 123 additions and 75 deletions

View file

@ -0,0 +1,58 @@
# Synthetic Data Generation API
== Schema ==
FilteringFunction:
name: str
params: json
SyntheticDataPoint:
custom_id: str
index: int
prompt: List[Message]
response: Message
logprob: float
score: float
SyntheticDataGenerationJob:
job_id: str # id provided by the api
created: str # format: date-time
status: str # enum (validating, running, completed, failed)
input_file_path: Path # jsonl style file where each row contains custom_id and message_list
success_file_path: Path # jsonl each line is SyntheticDataPoint
error_file_path: Path # jsonl file listing the custom_ids that failed, with error info for each
metadata: json
== Callsites ==
callsite:
/synthetic_data_gen/submit_job
request_type:
post
description:
Submit a job to generate synthetic data using llm + reward model scoring + filtering
request:
# batch inference params
model: str
prompt_file_path: Path # jsonl style file where each line is a json encoded List[Message] + custom_id
options: Options
num_generations: int
# reward model scoring params
reward_model: str
scoring_function: ScoringFunction
# filtering params
filtering_function: FilteringFunction
metadata: json
response:
synth_data_gen_job: SyntheticDataGenerationJob
callsite:
/synthetic_data_gen/job_status
request_type:
get
description:
Get status for an already submitted job
request:
job_id: str # unique identifier for the job
response:
synth_data_gen_job: SyntheticDataGenerationJob

View file

@ -1,12 +1,12 @@
openapi: 3.0.0 openapi: 3.0.0
info: info:
title: API for Synthetic Data Generation. This combines other services like batch inference and reward model scoring. title: Synthetic Data Generation API
version: 0.0.1 version: 0.0.1
paths: paths:
/synthetic_data_generation/submit_job: /synthetic_data_gen/submit_job:
post: post:
summary: Submit a job for synthetic data generation. summary: Submit a job to generate synthetic data
description: Batch Inference > Reward Scoring > Filtering > Response description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
requestBody: requestBody:
required: true required: true
content: content:
@ -14,69 +14,47 @@ paths:
schema: schema:
type: object type: object
properties: properties:
# batch inference params
model: model:
type: string type: string
description: Model identifier for batch inference. description: Model used for batch inference
prompts_path: prompt_file_path:
type: string type: string
description: Path to prompts, JSONL for batch inference format: path
batch_size: description: Path to the JSONL file containing message_lists and custom IDs
type: integer options:
description: Number of prompts to process in each batch. $ref: '#/components/schemas/Options'
# TODO: May-be put all these generation related params in a struct
temperature:
type: number
format: float
description: Temperature parameter for generation.
top_p:
type: number
format: float
description: Top-p parameter for generation.
max_gen_len:
type: integer
description: Maximum length of generated responses.
num_generations: num_generations:
type: integer type: integer
description: Number of generations per prompt. description: Number of generations to produce
# reward model scoring params
reward_model: reward_model:
type: string type: string
description: Identifier for the reward model used for scoring. description: Model used for scoring
scoring_function: scoring_function:
type: string $ref: '#/components/schemas/ScoringFunction'
description: Scoring function to apply.
# params for filtering responses
# filtering function will have a signature as
# def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
filtering_function: filtering_function:
$ref: '#/components/schemas/FilteringFunction'
metadata:
type: object type: object
properties: additionalProperties: true
name: description: Additional metadata for the job
type: string
description: Name of the filtering function, can be a simple threshold or a pre-registered function.
params:
type: object
additionalProperties: true
description: JSON object containing parameters for the filtering function.
responses: responses:
'200': '200':
description: Job successfully created and processing. description: Job successfully submitted
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/SyntheticDataGeneration' $ref: '#/components/schemas/SyntheticDataGenerationJob'
/synthetic_data_generation/job_status: /synthetic_data_gen/job_status:
get: get:
summary: Get the status of a submitted job summary: Get job status
description: Get the status of a submitted job description: Get status for an already submitted job
parameters: parameters:
- in: query - in: query
name: job_id name: job_id
required: true
schema: schema:
type: string type: string
required: true
description: Unique identifier for the job description: Unique identifier for the job
responses: responses:
'200': '200':
@ -84,58 +62,70 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/SyntheticDataGeneration' $ref: '#/components/schemas/SyntheticDataGenerationJob'
'400':
description: Invalid job ID provided
'404':
description: Job not found
components: components:
schemas: schemas:
PromptResponseScore: FilteringFunction:
type: object type: object
properties: properties:
id: name:
type: string type: string
description: Carry forwarded from the user provided id from prompt. description: Name of the filtering function
params:
type: object
additionalProperties: true
description: JSON object containing parameters for the filtering function
SyntheticDataPoint:
type: object
properties:
custom_id:
type: string
description: Custom identifier for the data point
index: index:
type: integer type: integer
description: Index of the generation. description: Index of the data point
prompt: prompt:
type: array type: array
items: items:
$ref: '#/components/schemas/Message' $ref: '#/components/schemas/Message'
description: List of messages used as prompt
response: response:
$ref: '#/components/schemas/Completion' $ref: '#/components/schemas/Message'
logprob:
type: number
format: float
description: Log probability of the response
score: score:
type: number type: number
format: float format: float
description: Final score after filtering. description: Score of the response based on the reward model
raw_score: SyntheticDataGenerationJob:
type: number
format: float
description: Raw score from the reward model.
SyntheticDataGeneration:
type: object type: object
properties: properties:
job_id: job_id:
type: string type: string
description: Unique identifier for the job. description: ID provided by the API
created: created:
type: string type: string
format: date-time format: date-time
description: Timestamp when the job was created. description: Timestamp when the job was created
status: status:
type: string type: string
description: Current status of the job, can indicate the stage or success/failure. enum: [validating, running, completed, failed]
output_file_path: description: Current status of the job
input_file_path:
type: string type: string
description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object. format: path
Message: description: Path to the input JSONL file
type: object success_file_path:
properties: type: string
# As Defined in /batch_inference format: path
Completion: description: Path to the JSONL file containing successful results
type: object error_file_path:
properties: type: string
# As Defined in /batch_inference format: path
description: Path to the JSONL file containing errors
metadata:
type: object
additionalProperties: true
description: Additional metadata about the job