diff --git a/simple_view/synthetic_data_generation.yml b/simple_view/synthetic_data_generation.yml
new file mode 100644
index 000000000..27c942c24
--- /dev/null
+++ b/simple_view/synthetic_data_generation.yml
@@ -0,0 +1,58 @@
+# Synthetic Data Generation API
+== Schema ==
+
+FilteringFunction:
+  name: str
+  params: json
+
+SyntheticDataPoint:
+  custom_id: str
+  index: int
+  prompt: List[Message]
+  response: Message
+  logprob: float
+  score: float
+
+SyntheticDataGenerationJob:
+  job_id: str # id provided by the api
+  created: string # format - date-time
+  status: string # enum (validating, running, completed, failed)
+  input_file_path: Path # jsonl style file where each row contains custom_id and message_list
+  success_file_path: Path # jsonl each line is SyntheticDataPoint
+  error_file_path: Path # custom_ids where we failed with some info
+  metadata: json
+
+== Callsites ==
+
+callsite:
+  /synthetic_data_gen/submit_job
+request_type:
+  post
+description:
+  Submit a job to generate synthetic data using llm + reward model scoring + filtering
+request:
+  # batch inference params
+  model: str
+  prompt_file_path: Path # jsonl style file where each line is a json encoded List[Message] + custom_id
+  options: Options
+  num_generations: int
+  # reward model scoring params
+  reward_model: str
+  scoring_function: ScoringFunction
+  # filtering params
+  filtering_function: FilteringFunction
+  metadata: json
+
+response:
+  synth_data_gen_job: SyntheticDataGenerationJob
+
+callsite:
+  /synthetic_data_gen/job_status
+request_type:
+  get
+description:
+  Get status for an already submitted job
+request:
+  job_id: str # unique identifier for the job
+response:
+  synth_data_gen_job: SyntheticDataGenerationJob
diff --git a/synthetic_data_generation.yaml b/synthetic_data_generation.yaml
index 81e8f4965..f11786f47 100644
--- a/synthetic_data_generation.yaml
+++ b/synthetic_data_generation.yaml
@@ -1,12 +1,12 @@
 openapi: 3.0.0
 info:
-  title: API for Synthetic Data Generation. This combines other serivces like batch inference and reward model scoring.
+  title: Synthetic Data Generation API
   version: 0.0.1
 paths:
-  /synthetic_data_generation/submit_job:
+  /synthetic_data_gen/submit_job:
     post:
-      summary: Submit a job for synthetic data generation.
-      description: Batch Inference > Reward Scoring > Filtering > Response
+      summary: Submit a job to generate synthetic data
+      description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
       requestBody:
         required: true
         content:
@@ -14,69 +14,47 @@ paths:
            schema:
              type: object
              properties:
-                # batch inference params
                model:
                  type: string
-                  description: Model identifier for batch inference.
-                prompts_path:
+                  description: Model used for batch inference
+                prompt_file_path:
                  type: string
-                  description: Path to prompts, JSONL for batch inference
-                batch_size:
-                  type: integer
-                  description: Number of prompts to process in each batch.
-                # TODO: May-be put all these generation related params in a struct
-                temperature:
-                  type: number
-                  format: float
-                  description: Temperature parameter for generation.
-                top_p:
-                  type: number
-                  format: float
-                  description: Top-p parameter for generation.
-                max_gen_len:
-                  type: integer
-                  description: Maximum length of generated responses.
+                  format: path
+                  description: Path to the JSONL file containing message_lists and custom IDs
+                options:
+                  $ref: '#/components/schemas/Options'
                num_generations:
                  type: integer
-                  description: Number of generations per prompt.
-                # reward model scoring params
+                  description: Number of generations to produce
                reward_model:
                  type: string
-                  description: Identifier for the reward model used for scoring.
+                  description: Model used for scoring
                scoring_function:
-                  type: string
-                  description: Scoring function to apply.
-                # params for filtering responses
-                # filtering function will have a signature as
-                # def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
+                  $ref: '#/components/schemas/ScoringFunction'
                filtering_function:
+                  $ref: '#/components/schemas/FilteringFunction'
+                metadata:
                  type: object
-                  properties:
-                    name:
-                      type: string
-                      description: Name of the filtering function, can be a simple threshold or a pre-registered function.
-                    params:
-                      type: object
-                      additionalProperties: true
-                      description: JSON object containing parameters for the filtering function.
+                  additionalProperties: true
+                  description: Additional metadata for the job
      responses:
        '200':
-          description: Job successfully created and processing.
+          description: Job successfully submitted
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'

-/synthetic_data_generation/job_status:
+  /synthetic_data_gen/job_status:
    get:
-      summary: Get the status of a submitted job
-      description: Get the status of a submitted job
+      summary: Get job status
+      description: Get status for an already submitted job
      parameters:
        - in: query
          name: job_id
-          required: true
          schema:
            type: string
+          required: true
          description: Unique identifier for the job
      responses:
        '200':
@@ -84,58 +62,70 @@
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
-        '400':
-          description: Invalid job ID provided
-        '404':
-          description: Job not found
-
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'
 components:
  schemas:
-    PromptResponseScore:
+    FilteringFunction:
      type: object
      properties:
-        id:
+        name:
          type: string
-          description: Carry forwarded from the user provided id from prompt.
+          description: Name of the filtering function
+        params:
+          type: object
+          additionalProperties: true
+          description: JSON object containing parameters for the filtering function
+    SyntheticDataPoint:
+      type: object
+      properties:
+        custom_id:
+          type: string
+          description: Custom identifier for the data point
        index:
          type: integer
-          description: Index of the generation.
+          description: Index of the data point
        prompt:
          type: array
          items:
            $ref: '#/components/schemas/Message'
+          description: List of messages used as prompt
        response:
-          $ref: '#/components/schemas/Completion'
+          $ref: '#/components/schemas/Message'
+        logprob:
+          type: number
+          format: float
+          description: Log probability of the response
        score:
          type: number
          format: float
-          description: Final score after filtering.
-        raw_score:
-          type: number
-          format: float
-          description: Raw score from the reward model.
-    SyntheticDataGeneration:
+          description: Score of the response based on the reward model
+    SyntheticDataGenerationJob:
      type: object
      properties:
        job_id:
          type: string
-          description: Unique identifier for the job.
+          description: ID provided by the API
        created:
          type: string
          format: date-time
-          description: Timestamp when the job was created.
+          description: Timestamp when the job was created
        status:
          type: string
-          description: Current status of the job, can indicate the stage or success/failure.
-        output_file_path:
+          enum: [validating, running, completed, failed]
+          description: Current status of the job
+        input_file_path:
          type: string
-          description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
-    Message:
-      type: object
-      properties:
-        # As Defined in /batch_inference
-    Completion:
-      type: object
-      properties:
-        # As Defined in /batch_inference
+          format: path
+          description: Path to the input JSONL file
+        success_file_path:
+          type: string
+          format: path
+          description: Path to the JSONL file containing successful results
+        error_file_path:
+          type: string
+          format: path
+          description: Path to the JSONL file containing errors
+        metadata:
+          type: object
+          additionalProperties: true
+          description: Additional metadata about the job