From 52a21ce78fb24e527b502f3aabf2672f82a68c40 Mon Sep 17 00:00:00 2001
From: Botao Chen
Date: Tue, 14 Jan 2025 19:19:38 -0800
Subject: [PATCH] Free up memory after post training finishes (#770)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## context

Currently, GPU memory remains occupied after training finishes. This PR
explicitly moves the model to CPU, deletes the last reference to it, and
clears the CUDA cache once training completes.

## test

Before the change, >6 GB of GPU memory is still occupied after training a
Llama 3.2 3B model.

After the change, GPU memory drops to ~1 GB after training the same model.

[Screenshot 2025-01-14 at 6 05 17 PM]
---
 .../torchtune/recipes/lora_finetuning_single_device.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
index 7543b1f4e..80e206ebb 100644
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import gc
 import logging
 import os
 import time
@@ -580,6 +581,12 @@ class LoraFinetuningSingleDevice:
             checkpoint.training_metrics = training_metrics
             checkpoints.append(checkpoint)
 
+        # clean up the memory after training finishes
+        self._model.to("cpu")
+        del self._model
+        gc.collect()
+        torch.cuda.empty_cache()
+
         return (memory_stats, checkpoints)
 
     async def validation(self) -> Tuple[float, float]:
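
## note

For readers who want to reproduce the effect outside the recipe, below is a
minimal sketch of the same cleanup sequence run against a throwaway model.
The `nn.Linear` stand-in and the `report` helper are illustrative additions,
not part of the patch; only the four calls in the middle mirror the lines
added to `lora_finetuning_single_device.py`.

```python
import gc

import torch
import torch.nn as nn


def report(stage: str) -> None:
    # memory_allocated: bytes held by live tensors; memory_reserved: bytes the
    # CUDA caching allocator keeps from the driver (what nvidia-smi reports).
    alloc_mb = torch.cuda.memory_allocated() / 1e6
    reserved_mb = torch.cuda.memory_reserved() / 1e6
    print(f"{stage}: allocated={alloc_mb:.0f} MB, reserved={reserved_mb:.0f} MB")


if torch.cuda.is_available():
    # ~400 MB of fp32 weights as a stand-in for the fine-tuned model.
    model = nn.Linear(10_000, 10_000, device="cuda")
    report("after load")

    # The sequence added by this patch: move weights off the GPU, drop the
    # last Python reference, garbage-collect, then return cached blocks to
    # the driver so other processes see the memory as free.
    model.to("cpu")
    del model
    gc.collect()
    torch.cuda.empty_cache()
    report("after cleanup")
```

`torch.cuda.empty_cache()` only releases blocks the allocator has already
reclaimed, which is why the patch drops the reference and runs `gc.collect()`
first; while the model is still reachable, its memory stays pinned.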