llama-stack-mirror/simple_view/synthetic_data_generation.yml

# Synthetic Data Generation API
== Schema ==

# Filter specification; used as the type of `filtering_function` in the
# /synthetic_data_gen/submit_job request.
FilteringFunction:
  name: str  # presumably selects which filter to apply - TODO confirm
  params: json  # free-form json; presumably arguments for the named filter - TODO confirm

# One generated sample. Each line of a job's success_file_path is one
# SyntheticDataPoint.
# NOTE(review): the Message type is not declared in the visible schema section.
SyntheticDataPoint:
  custom_id: str  # presumably matches the custom_id of the originating input row - TODO confirm
  index: int  # presumably distinguishes the num_generations outputs of one prompt - TODO confirm
  prompt: List[Message]
  response: Message
  logprob: float  # NOTE(review): producer not shown here; presumably from the generating model
  score: float  # NOTE(review): presumably the reward model / scoring_function output - confirm

# Describes a synthetic data generation job. Returned as `synth_data_gen_job`
# by both the submit_job and job_status callsites.
SyntheticDataGenerationJob:
  job_id: str  # id provided by the api
  created: str  # format - date-time
  status: str  # enum (validating, running, completed, failed)
  input_file_path: Path  # jsonl style file where each row contains custom_id and message_list
  success_file_path: Path  # jsonl style file where each line is a SyntheticDataPoint
  error_file_path: Path  # custom_ids where we failed with some info
  metadata: json  # free-form json; presumably echoes the request metadata - TODO confirm

== Callsites ==

# Endpoint: submit a new synthetic data generation job.
# The request groups its fields into batch inference, reward model scoring,
# and filtering parameters; the response carries the resulting job record.
callsite:
  /synthetic_data_gen/submit_job
request_type:
  post
description:
  Submit a job to generate synthetic data using llm + reward model scoring + filtering
request:
  # batch inference params
  model: str
  prompt_file_path: Path  # jsonl style file where each line is a json encoded List[Message] + custom_id
  options: Options  # NOTE(review): Options is not declared in the visible schema section - confirm source
  num_generations: int  # presumably the number of generations per prompt - TODO confirm
  # reward model scoring params
  reward_model: str
  scoring_function: ScoringFunction  # NOTE(review): ScoringFunction is not declared in the visible schema section
  # filtering params
  filtering_function: FilteringFunction
  metadata: json

response:
  synth_data_gen_job: SyntheticDataGenerationJob

# Endpoint: look up a previously submitted synthetic data generation job.
# Takes the job's id and returns the same job record type as submit_job.
callsite:
  /synthetic_data_gen/job_status
request_type:
  get
description:
  Get status for an already submitted job
request:
  job_id: str  # unique identifier for the job
response:
  synth_data_gen_job: SyntheticDataGenerationJob