Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 19:04:19 +00:00)
fine tuning apis
This commit is contained in:
parent 157e5ddf2e
commit 2478b76fbc
2 changed files with 400 additions and 0 deletions
fine_tuning.yaml (new file, 266 lines)
@@ -0,0 +1,266 @@
openapi: 3.0.0
info:
  title: Fine Tuning API
  version: 1.0.0
  description: API for managing fine tuning jobs for machine learning models.

paths:
  /fine_tuning/jobs/submit:
    post:
      summary: Submit a fine tuning job
      description: Submit a fine tuning job with the specified configuration.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Config'
      responses:
        200:
          description: Successfully submitted the fine tuning job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/FineTuningJob'

  /fine_tuning/jobs/status:
    get:
      summary: Gets last N fine tuning jobs
      description: Retrieve the status of the last N fine tuning jobs based on the provided job ID.
      parameters:
        - in: query
          name: job_id
          schema:
            type: string
          required: true
          description: The ID of the job to retrieve status for.
      responses:
        200:
          description: Successfully retrieved the job status.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/FineTuningJob'

  /fine_tuning/jobs/cancel:
    post:
      summary: Cancel provided job
      description: Cancel the fine tuning job with the specified job ID.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                job_id:
                  type: string
      responses:
        200:
          description: Successfully cancelled the fine tuning job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/FineTuningJob'

  /fine_tuning/jobs/tail:
    get:
      summary: Tail logs of a particular job
      description: Stream the logs of a particular job in real-time. This endpoint supports streaming responses.
      parameters:
        - in: query
          name: job_id
          schema:
            type: string
          required: true
          description: The ID of the job to tail logs for.
      responses:
        200:
          description: Streaming logs in real-time.
          content:
            application/x-ndjson:
              schema:
                type: object
                properties:
                  logs:
                    type: array
                    items:
                      $ref: '#/components/schemas/Log'
          headers:
            Content-Type:
              schema:
                type: string
                default: 'application/x-ndjson'
            Transfer-Encoding:
              schema:
                type: string
                default: 'chunked'

components:
  schemas:
    Message:
      # keep in sync with /chat_completion

    TrainingDataItem:
      type: object
      properties:
        dialog:
          type: array
          items:
            $ref: '#/components/schemas/Message'
        keep_loss:
          type: array
          items:
            type: boolean

    WandBLogger:
      type: object
      properties:
        project:
          type: string
          description: The project name in WandB where logs will be stored.

    DiskLogger:
      type: object
      properties:
        filename:
          type: string
          description: The filename where logs will be stored on disk.

    FullFineTuneOptions:
      type: object
      properties:
        enable_activation_checkpointing:
          type: boolean
          default: true
        memory_efficient_fsdp_wrap:
          type: boolean
          default: true
        fsdp_cpu_offload:
          type: boolean
          default: true

    LoraFineTuneOptions:
      type: object
      properties:
        lora_attn_modules:
          type: array
          items:
            type: string
        apply_lora_to_mlp:
          type: boolean
          default: false
        apply_lora_to_output:
          type: boolean
          default: false
        lora_rank:
          type: integer
        lora_alpha:
          type: integer

    FineTuningOptions:
      type: object
      properties:
        n_epochs:
          type: integer
        batch_size:
          type: integer
        lr:
          type: number
          format: float
        gradient_accumulation_steps:
          type: integer
        seed:
          type: integer
        shuffle:
          type: boolean
        custom_training_options:
          oneOf:
            - $ref: '#/components/schemas/FullFineTuneOptions'
            - $ref: '#/components/schemas/LoraFineTuneOptions'
          discriminator:
            propertyName: finetuning_type
        extras:
          # json to put other config overrides that are required by torchtune
          type: object
          additionalProperties: true

    Config:
      type: object
      properties:
        model:
          type: string
          description: The model identifier that you want to fine tune.
        data:
          type: string
          format: uri
          description: Path to the JSONL file with each row representing a TrainingDataItem.
        validation_data:
          type: string
          format: uri
          description: Path to the JSONL file used for validation metrics.
        fine_tuning_options:
          $ref: '#/components/schemas/FineTuningOptions'
        logger:
          oneOf:
            - $ref: '#/components/schemas/DiskLogger'
            - $ref: '#/components/schemas/WandBLogger'
          discriminator:
            propertyName: log_type
        overrides:
          # eg. --nproc_per_node 4 instead of default that we need to pass through to torchrun
          # when running locally
          type: string
          description: Custom override options for the fine tuning process.
        metadata:
          type: object
          additionalProperties: true

    FineTuningJob:
      type: object
      properties:
        job_id:
          type: string
          description: Unique identifier for the fine tuning job.
        created:
          type: string
          format: date-time
          description: The creation date and time of the job.
        finished_at:
          type: string
          format: date-time
          description: The completion date and time of the job.
        status:
          type: string
          enum: [validation, queued, running, failed, success, cancelled]
          description: The current status of the job.
        error_path:
          type: string
          format: uri
          description: Path to the error log file.
        checkpoints:
          type: array
          items:
            type: string
            format: uri
          description: List of paths to checkpoint files for various epochs.
        logs:
          type: string
          format: uri
          description: Path to the logs, either local or a WandB URI.
        input_config:
          $ref: '#/components/schemas/Config'
        metadata:
          type: object
          additionalProperties: true

    Log:
      type: object
      properties:
        message:
          type: string
          description: The log message.
        timestamp:
          type: string
          format: date-time
          description: The timestamp of the log message.
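The spec above leaves request construction entirely to the client, so it may help to see how the Config schema and its discriminated unions fit together in practice. The following is a minimal, hypothetical Python sketch (not part of this commit) that builds a LoRA fine tuning request and submits it to /fine_tuning/jobs/submit with the requests library. The base URL, model name, file paths, and option values are placeholders; the discriminator property names (finetuning_type, log_type) come from this spec, and the discriminator values ("lora", "disk") follow the mapping declared in the simple view file below.

# Hypothetical client sketch for the Fine Tuning API defined above.
# Assumes a server exposing these routes at BASE_URL; values are illustrative only.
import requests

BASE_URL = "http://localhost:8000"  # placeholder host

config = {
    "model": "llama-3-8b",                          # illustrative model identifier
    "data": "file:///data/train.jsonl",             # JSONL of TrainingDataItem rows
    "validation_data": "file:///data/valid.jsonl",
    "fine_tuning_options": {
        "n_epochs": 3,
        "batch_size": 8,
        "lr": 2e-4,
        "gradient_accumulation_steps": 4,
        "seed": 42,
        "shuffle": True,
        # Discriminated union; the spec names the property `finetuning_type`.
        "custom_training_options": {
            "finetuning_type": "lora",
            "lora_attn_modules": ["q_proj", "v_proj"],
            "apply_lora_to_mlp": False,
            "apply_lora_to_output": False,
            "lora_rank": 8,
            "lora_alpha": 16,
        },
        "extras": {},                                # torchtune overrides, if any
    },
    # Logger union, discriminated by `log_type`.
    "logger": {"log_type": "disk", "filename": "training.log"},
    "overrides": "--nproc_per_node 4",
    "metadata": {"experiment": "demo"},
}

resp = requests.post(f"{BASE_URL}/fine_tuning/jobs/submit", json=config)
resp.raise_for_status()
job = resp.json()  # a FineTuningJob
print(job["job_id"], job["status"])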
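Job management is split across the status and cancel routes. The sketch below, again hypothetical and reusing the same placeholder BASE_URL and job id, polls /fine_tuning/jobs/status until the job reaches a terminal state and cancels it if the caller interrupts; the terminal states are taken from the FineTuningJob status enum.

# Hypothetical polling/cancel helpers for the routes above; not part of this commit.
import time

import requests

BASE_URL = "http://localhost:8000"  # placeholder host
TERMINAL = {"failed", "success", "cancelled"}  # from the FineTuningJob status enum


def wait_for_job(job_id: str, poll_seconds: float = 30.0) -> dict:
    """Poll the status route until the job leaves validation/queued/running."""
    while True:
        resp = requests.get(
            f"{BASE_URL}/fine_tuning/jobs/status", params={"job_id": job_id}
        )
        resp.raise_for_status()
        job = resp.json()
        if job["status"] in TERMINAL:
            return job
        time.sleep(poll_seconds)


def cancel_job(job_id: str) -> dict:
    """POST the job_id to the cancel route and return the updated FineTuningJob."""
    resp = requests.post(
        f"{BASE_URL}/fine_tuning/jobs/cancel", json={"job_id": job_id}
    )
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    try:
        final = wait_for_job("job-123")  # placeholder job id
        print("finished with status:", final["status"])
    except KeyboardInterrupt:
        print(cancel_job("job-123")["status"])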
simple_view/fine_tuning.yml (new file, 134 lines)
@@ -0,0 +1,134 @@
# Fine Tuning APIs
== Schema ==

TrainingDataItem:
  dialog: List[Message]
  keep_loss: List[bool]


WandBLogger:
  project: str

DiskLogger:
  # log_dir will be pre-configured in environment
  filename: str

FullFineTuneOptions:
  enable_activation_checkpointing: True
  memory_efficient_fsdp_wrap: True
  fsdp_cpu_offload: True

LoraFineTuneOptions:
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

FineTuningOptions:
  n_epochs: int
  batch_size: int
  lr: float
  gradient_accumulation_steps: int
  seed: int
  shuffle: bool

  # Unions in OpenAPI with a reference field that can help disambiguate
  custom_training_options:
    discriminator:
      propertyName: fine_tuning_type
      mapping:
        fft: FullFineTuneOptions
        lora: LoraFineTuneOptions

  # other options that can be passed in
  extras: json

Config:
  model: str # model that you want to fine tune
  data: Path # jsonl with each row representing a TrainingDataItem
  validation_data: Path # same as data but to get validation metrics on

  # fine tuning args
  fine_tuning_options: FineTuningOptions

  # metric logging
  logger:
    discriminator:
      propertyName: log_type
      mapping:
        disk: DiskLogger
        wandb: WandBLogger

  # Override options
  # eg. --nproc_per_node 4 instead of defaults,
  # this might be impl specific and can allow for various customizations
  overrides: str
  metadata: json # to carry over to job details

FineTuningJob:
  job_id: str
  created: str # format date-time
  finished_at: str # format date-time
  status: str # enum - validation, queued, running, failed, success, cancelled
  error_path: Path # error logging
  checkpoints: List[Path] # checkpoints for various epochs
  logs: Path # local path / wandb uri
  input_config: Config # config used to submit this job
  metadata: json # carried over from user provided input

Log:
  message: string # The log message.
  timestamp: string # format: date-time

== Callsites ==

callsite:
  /fine_tuning/jobs/submit
request_type:
  post
description:
  Submit a fine tuning job
request:
  config: Config
response:
  fine_tuning_job: FineTuningJob


callsite:
  /fine_tuning/jobs/status
request_type:
  get
description:
  Gets last N fine tuning jobs
request:
  job_id: str
response:
  fine_tuning_job: FineTuningJob


callsite:
  /fine_tuning/jobs/cancel
request_type:
  post
description:
  Cancel provided job
request:
  job_id: str
response:
  fine_tuning_job: FineTuningJob


callsite:
  /fine_tuning/jobs/tail
request_type:
  get
description:
  Tail logs of a particular job
request:
  job_id: str
response:
  logs: List[Log]
streaming:
  enabled: True
  chunkSize: 1024
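The tail callsite is the only streaming response in the API: the OpenAPI file advertises application/x-ndjson with chunked transfer encoding, and the simple view marks streaming as enabled with a 1024-byte chunk size. A minimal consumer might look like the following sketch; it is hypothetical (not part of this commit), the base URL and job id are placeholders, and it simply assumes each NDJSON line carries the logs payload described above.

# Hypothetical NDJSON log tailing for /fine_tuning/jobs/tail; not part of this commit.
import json

import requests

BASE_URL = "http://localhost:8000"  # placeholder host


def tail_logs(job_id: str) -> None:
    """Stream newline-delimited JSON chunks and print each Log entry as it arrives."""
    with requests.get(
        f"{BASE_URL}/fine_tuning/jobs/tail",
        params={"job_id": job_id},
        stream=True,  # do not buffer the chunked response
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(chunk_size=1024, decode_unicode=True):
            if not line:
                continue  # skip keep-alive blank lines
            payload = json.loads(line)
            for log in payload.get("logs", []):
                print(log["timestamp"], log["message"])


if __name__ == "__main__":
    tail_logs("job-123")  # placeholder job id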