From 52a21ce78fb24e527b502f3aabf2672f82a68c40 Mon Sep 17 00:00:00 2001
From: Botao Chen
Date: Tue, 14 Jan 2025 19:19:38 -0800
Subject: [PATCH] Free up memory after post training finishes (#770)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## context

Currently, GPU memory remains occupied after training finishes. This PR
explicitly moves the model to CPU, deletes the last reference to it, and
clears the CUDA cache once training completes.

## test

Before the change, >6 GB of GPU memory is still occupied after training a
Llama 3.2 3B model.

After the change, GPU memory drops to ~1 GB after training the same model.

[Screenshot 2025-01-14 at 6 05 17 PM]
---
 .../torchtune/recipes/lora_finetuning_single_device.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
index 7543b1f4e..80e206ebb 100644
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import gc
 import logging
 import os
 import time
@@ -580,6 +581,12 @@ class LoraFinetuningSingleDevice:
             checkpoint.training_metrics = training_metrics
             checkpoints.append(checkpoint)
 
+        # clean up the memory after training finishes
+        self._model.to("cpu")
+        del self._model
+        gc.collect()
+        torch.cuda.empty_cache()
+
         return (memory_stats, checkpoints)
 
     async def validation(self) -> Tuple[float, float]:
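
## note

For readers who want to reproduce the effect outside the recipe, below is a
minimal sketch of the same cleanup sequence run against a throwaway model.
The `nn.Linear` stand-in and the `report` helper are illustrative additions,
not part of the patch; only the four calls in the middle mirror the lines
added to `lora_finetuning_single_device.py`.

```python
import gc

import torch
import torch.nn as nn


def report(stage: str) -> None:
    # memory_allocated: bytes held by live tensors; memory_reserved: bytes the
    # CUDA caching allocator keeps from the driver (what nvidia-smi reports).
    alloc_mb = torch.cuda.memory_allocated() / 1e6
    reserved_mb = torch.cuda.memory_reserved() / 1e6
    print(f"{stage}: allocated={alloc_mb:.0f} MB, reserved={reserved_mb:.0f} MB")


if torch.cuda.is_available():
    # ~400 MB of fp32 weights as a stand-in for the fine-tuned model.
    model = nn.Linear(10_000, 10_000, device="cuda")
    report("after load")

    # The sequence added by this patch: move weights off the GPU, drop the
    # last Python reference, garbage-collect, then return cached blocks to
    # the driver so other processes see the memory as free.
    model.to("cpu")
    del model
    gc.collect()
    torch.cuda.empty_cache()
    report("after cleanup")
```

`torch.cuda.empty_cache()` only releases blocks the allocator has already
reclaimed, which is why the patch drops the reference and runs `gc.collect()`
first; while the model is still reachable, its memory stays pinned.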