updates to synth data apis

2025-12-05 18:27:22 +00:00 · 2024-06-26 16:48:52 -07:00 · 2024-06-26 16:48:52 -07:00 · 157e5ddf2e
commit 157e5ddf2e
parent c9a75c4628
2 changed files with 123 additions and 75 deletions
--- a/simple_view/synthetic_data_generation.yml
+++ b/simple_view/synthetic_data_generation.yml
@@ -0,0 +1,58 @@
+# Synthetic Data Generation API
+== Schema ==
+
+FilteringFunction:
+  name: str
+  params: json
+
+SyntheticDataPoint:
+  custom_id: str
+  index: int
+  prompt: List[Message]
+  response: Message
+  logprob: float
+  score: float
+
+SyntheticDataGenerationJob:
+  job_id: str  # id provided by the api
+  created: str  # format - date-time
+  status: str  # enum (validating, running, completed, failed)
+  input_file_path: Path  # jsonl style file where each row contains custom_id and message_list
+  success_file_path: Path  # jsonl each line is SyntheticDataPoint
+  error_file_path: Path  # jsonl file listing the custom_ids that failed, with error info
+  metadata: json
+
+== Callsites ==
+
+callsite:
+  /synthetic_data_gen/submit_job
+request_type:
+  post
+description:
+  Submit a job to generate synthetic data using llm + reward model scoring + filtering
+request:
+  # batch inference params
+  model: str
+  prompt_file_path: Path  # jsonl style file where each line is a json encoded List[Message] + custom_id
+  options: Options
+  num_generations: int
+  # reward model scoring params
+  reward_model: str
+  scoring_function: ScoringFunction
+  # filtering params
+  filtering_function: FilteringFunction
+  metadata: json
+
+response:
+  synth_data_gen_job: SyntheticDataGenerationJob
+
+callsite:
+  /synthetic_data_gen/job_status
+request_type:
+  get
+description:
+  Get status for an already submitted job
+request:
+  job_id: str  # unique identifier for the job
+response:
+  synth_data_gen_job: SyntheticDataGenerationJob
--- a/synthetic_data_generation.yaml
+++ b/synthetic_data_generation.yaml
@@ -1,12 +1,12 @@
 openapi: 3.0.0
 info:
-  title: API for Synthetic Data Generation. This combines other serivces like batch inference and reward model scoring.
+  title: Synthetic Data Generation API
  version: 0.0.1
 paths:
-  /synthetic_data_generation/submit_job:
+  /synthetic_data_gen/submit_job:
    post:
-      summary: Submit a job for synthetic data generation.
-      description: Batch Inference > Reward Scoring > Filtering > Response
+      summary: Submit a job to generate synthetic data
+      description: Submit a job to generate synthetic data using llm + reward model scoring + filtering
      requestBody:
        required: true
        content:
@@ -14,69 +14,47 @@ paths:
            schema:
              type: object
              properties:
-                # batch inference params
                model:
                  type: string
-                  description: Model identifier for batch inference.
-                prompts_path:
+                  description: Model used for batch inference
+                prompt_file_path:
                  type: string
-                  description: Path to prompts, JSONL for batch inference
-                batch_size:
-                  type: integer
-                  description: Number of prompts to process in each batch.
-                # TODO: May-be put all these generation related params in a struct
-                temperature:
-                  type: number
-                  format: float
-                  description: Temperature parameter for generation.
-                top_p:
-                  type: number
-                  format: float
-                  description: Top-p parameter for generation.
-                max_gen_len:
-                  type: integer
-                  description: Maximum length of generated responses.
+                  format: path
+                  description: Path to the JSONL file containing message_lists and custom IDs
+                options:
+                  $ref: '#/components/schemas/Options'
                num_generations:
                  type: integer
-                  description: Number of generations per prompt.
-                # reward model scoring params
+                  description: Number of generations to produce per prompt
                reward_model:
                  type: string
-                  description: Identifier for the reward model used for scoring.
+                  description: Model used for scoring
                scoring_function:
-                  type: string
-                  description: Scoring function to apply.
-                # params for filtering responses
-                # filtering function will have a signature as
-                # def filter_responses(List[PromptResponseScore]) --> List[PromptResponseScore]: ...
+                  $ref: '#/components/schemas/ScoringFunction'
                filtering_function:
-                  type: object
-                  properties:
-                    name:
-                      type: string
-                      description: Name of the filtering function, can be a simple threshold or a pre-registered function.
-                    params:
+                  $ref: '#/components/schemas/FilteringFunction'
+                metadata:
                  type: object
                  additionalProperties: true
-                      description: JSON object containing parameters for the filtering function.
+                  description: Additional metadata for the job
      responses:
        '200':
-          description: Job successfully created and processing.
+          description: Job successfully submitted
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'

-/synthetic_data_generation/job_status:
+  /synthetic_data_gen/job_status:
    get:
-      summary: Get the status of a submitted job
-      description: Get the status of a submitted job
+      summary: Get job status
+      description: Get status for an already submitted job
      parameters:
        - in: query
          name: job_id
-          required: true
          schema:
            type: string
+          required: true
          description: Unique identifier for the job
      responses:
        '200':
@@ -84,58 +62,70 @@ paths:
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/SyntheticDataGeneration'
-        '400':
-          description: Invalid job ID provided
-        '404':
-          description: Job not found
-
+                $ref: '#/components/schemas/SyntheticDataGenerationJob'
 components:
  schemas:
-    PromptResponseScore:
+    FilteringFunction:
      type: object
      properties:
-        id:
+        name:
          type: string
-          description: Carry forwarded from the user provided id from prompt.
+          description: Name of the filtering function
+        params:
+          type: object
+          additionalProperties: true
+          description: JSON object containing parameters for the filtering function
+    SyntheticDataPoint:
+      type: object
+      properties:
+        custom_id:
+          type: string
+          description: Custom identifier for the data point
        index:
          type: integer
-          description: Index of the generation.
+          description: Index of the data point
        prompt:
          type: array
          items:
            $ref: '#/components/schemas/Message'
+          description: List of messages used as prompt
        response:
-          $ref: '#/components/schemas/Completion'
+          $ref: '#/components/schemas/Message'
+        logprob:
+          type: number
+          format: float
+          description: Log probability of the response
        score:
          type: number
          format: float
-          description: Final score after filtering.
-        raw_score:
-          type: number
-          format: float
-          description: Raw score from the reward model.
-    SyntheticDataGeneration:
+          description: Score of the response based on the reward model
+    SyntheticDataGenerationJob:
      type: object
      properties:
        job_id:
          type: string
-          description: Unique identifier for the job.
+          description: ID provided by the API
        created:
          type: string
          format: date-time
-          description: Timestamp when the job was created.
+          description: Timestamp when the job was created
        status:
          type: string
-          description: Current status of the job, can indicate the stage or success/failure.
-        output_file_path:
+          enum: [validating, running, completed, failed]
+          description: Current status of the job
+        input_file_path:
          type: string
-          description: Path to the output jsonl file where each row is a json encoded PromptResponseScore object.
-    Message:
+          format: path
+          description: Path to the input JSONL file
+        success_file_path:
+          type: string
+          format: path
+          description: Path to the JSONL file containing successful results
+        error_file_path:
+          type: string
+          format: path
+          description: Path to the JSONL file containing errors
+        metadata:
          type: object
-      properties:
-        # As Defined in /batch_inference
-    Completion:
-      type: object
-      properties:
-        # As Defined in /batch_inference
+          additionalProperties: true
+          description: Additional metadata about the job