updates to synth data apis

2025-07-16 09:58:10 +00:00 · 2024-06-26 16:48:52 -07:00 · 2024-06-26 16:48:52 -07:00 · 157e5ddf2e
commit 157e5ddf2e
parent c9a75c4628
2 changed files with 123 additions and 75 deletions
--- a/simple_view/synthetic_data_generation.yml
+++ b/simple_view/synthetic_data_generation.yml
@@ -0,0 +1,58 @@
 # Synthetic Data Generation API
 == Schema ==
 FilteringFunction:
  name: str
  params: json
 SyntheticDataPoint:
  custom_id: str
  index: int
  prompt: List[Message]
  response: Message
  logprob: float
  score: float
 SyntheticDataGenerationJob:
  job_id: str  # id provided by the api
  created: string # format - date-time
  status: string  # enum (validating, running, completed, failed)
  input_file_path: Path  # jsonl style file where each row contains custom_id and message_list
  success_file_path: Path  # jsonl each line is SyntheticDataPoint
  error_file_path: Path  # custom_ids where we failed with some info
  metadata: json
 == Callsites ==
 callsite:
  /synthetic_data_gen/submit_job
 request_type:
  post
 description:
  Submit a job to generate synthetic data using llm + reward model scoring + filtering
 request:
  # batch inference params
  model: str
  prompt_file_path: Path  # jsonl style file where each line is a json encoded List[Message] + custom_id
  options: Options
  num_generations: int
  # reward model scoring params
  reward_model: str
  scoring_function: ScoringFunction
  # filtering params
  filtering_function: FilteringFunction
  metadata: json
 response:
  synth_data_gen_job: SyntheticDataGenerationJob
 callsite:
  /synthetic_data_gen/job_status
 request_type:
  get
 description:
  Get status for an already submitted job
 request:
  job_id: str  # unique identifier for the job
 response:
  synth_data_gen_job: SyntheticDataGenerationJob
--- a/synthetic_data_generation.yaml
+++ b/synthetic_data_generation.yaml
@@ -1,12 +1,12 @@
 openapi: 3.0.0
 info:
-  title: API for Synthetic Data Generation. This combines other serivces like batch inference and reward model scoring.
+  title: Synthetic Data Generation API
  version: 0.0.1
 paths:
-  /synthetic_data_generation/submit_job:
+  /synthetic_data_gen/submit_job:
    post:
-      summary: Submit a job for synthetic data generation.
+      summary: Submit a job to generate synthetic data
-      description: Batch Inference > Reward Scoring > Filtering > Response
+      description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
      requestBody:
        required: true
        content:
@@ -14,69 +14,47 @@ paths:
            schema:
              type: object
              properties:
                # batch inference params
                model:
                  type: string
-                  description: Model identifier for batch inference.
+                  description: Model used for batch inference
-                prompts_path:
+                prompt_file_path:
                  type: string
-                  description: Path to prompts, JSONL for batch inference
+                  format: path
-                batch_size:
+                  description: Path to the JSONL file containing message_lists and custom IDs
-                  type: integer
+                options:
-                  description: Number of prompts to process in each batch.
+                  $ref: '#/components/schemas/Options'
                # TODO: May-be put all these generation related params in a struct
                temperature:
                  type: number
                  format: float
                  description: Temperature parameter for generation.
                top_p:
                  type: number
                  format: float
                  description: Top-p parameter for generation.
                max_gen_len:
                  type: integer
                  description: Maximum length of generated responses.
                num_generations:
                  type: integer
-                  description: Number of generations per prompt.
+                  description: Number of generations to produce
                # reward model scoring params
                reward_model:
                  type: string
-                  description: Identifier for the reward model used for scoring.
+                  description: Model used for scoring
                scoring_function:
-                  type: string
+                  $ref: '#/components/schemas/ScoringFunction'
                  description: Scoring function to apply.
                # params for filtering responses
                # filtering function will have a signature as
                # def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
                filtering_function:
                  $ref: '#/components/schemas/FilteringFunction'
                metadata:
                  type: object
-                  properties:
+                  additionalProperties: true
-                    name:
+                  description: Additional metadata for the job
                      type: string
                      description: Name of the filtering function, can be a simple threshold or a pre-registered function.
                    params:
                      type: object
                      additionalProperties: true
                      description: JSON object containing parameters for the filtering function.
      responses:
        '200':
-          description: Job successfully created and processing.
+          description: Job successfully submitted
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'
-/synthetic_data_generation/job_status:
+  /synthetic_data_gen/job_status:
    get:
-      summary: Get the status of a submitted job
+      summary: Get job status
-      description: Get the status of a submitted job
+      description: Get status for an already submitted job
      parameters:
        - in: query
          name: job_id
          required: true
          schema:
            type: string
          required: true
          description: Unique identifier for the job
      responses:
        '200':
@@ -84,58 +62,70 @@ paths:
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'
        '400':
          description: Invalid job ID provided
        '404':
          description: Job not found
 components:
  schemas:
-    PromptResponseScore:
+    FilteringFunction:
      type: object
      properties:
-        id:
+        name:
          type: string
-          description: Carry forwarded from the user provided id from prompt.
+          description: Name of the filtering function
        params:
          type: object
          additionalProperties: true
          description: JSON object containing parameters for the filtering function
    SyntheticDataPoint:
      type: object
      properties:
        custom_id:
          type: string
          description: Custom identifier for the data point
        index:
          type: integer
-          description: Index of the generation.
+          description: Index of the data point
        prompt:
          type: array
          items:
            $ref: '#/components/schemas/Message'
          description: List of messages used as prompt
        response:
-          $ref: '#/components/schemas/Completion'
+          $ref: '#/components/schemas/Message'
        logprob:
          type: number
          format: float
          description: Log probability of the response
        score:
          type: number
          format: float
-          description: Final score after filtering.
+          description: Score of the response based on the reward model
-        raw_score:
+    SyntheticDataGenerationJob:
          type: number
          format: float
          description: Raw score from the reward model.
    SyntheticDataGeneration:
      type: object
      properties:
        job_id:
          type: string
-          description: Unique identifier for the job.
+          description: ID provided by the API
        created:
          type: string
          format: date-time
-          description: Timestamp when the job was created.
+          description: Timestamp when the job was created
        status:
          type: string
-          description: Current status of the job, can indicate the stage or success/failure.
+          enum: [validating, running, completed, failed]
-        output_file_path:
+          description: Current status of the job
        input_file_path:
          type: string
-          description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
+          format: path
-    Message:
+          description: Path to the input JSONL file
-      type: object
+        success_file_path:
-      properties:
+          type: string
-        # As Defined in /batch_inference
+          format: path
-    Completion:
+          description: Path to the JSONL file containing successful results
-      type: object
+        error_file_path:
-      properties:
+          type: string
-        # As Defined in /batch_inference
+          format: path
          description: Path to the JSONL file containing errors
        metadata:
          type: object
          additionalProperties: true
          description: Additional metadata about the job