From 2250ab7274671f6d22a8bf83ab2091f4994f5609 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka
Date: Wed, 26 Feb 2025 18:12:11 -0500
Subject: [PATCH] fix: don't attempt to clean gpu memory up when device is cpu
 (#1191)

This is a follow-up to: https://github.com/meta-llama/llama-stack/pull/1140

Signed-off-by: Ihar Hrachyshka

# What does this PR do?

Avoid an unnecessary GPU memory cleanup attempt when the GPU is not used
for training.

## Test Plan

With CPU:

```
INFO 2025-02-26 16:43:56,267 torchtune.utils._logging:121: Model checkpoint of size 6.43 GB saved to /Users/ihrachys/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0/consolidated.00.pth
INFO 2025-02-26 16:43:56,274 torchtune.utils._logging:132: Adapter checkpoint of size 0.00 GB saved to /Users/ihrachys/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0/adapter/adapter.pth
model_file_path /Users/ihrachys/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0
```

With CUDA:

```
INFO 2025-02-26 21:39:24,314 torchtune.utils._logging:121: Model checkpoint of size 6.43 GB saved to /home/ec2-user/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0/consolidated.00.pth
INFO 2025-02-26 21:39:24,333 torchtune.utils._logging:132: Adapter checkpoint of size 0.00 GB saved to /home/ec2-user/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0/adapter/adapter.pth
model_file_path /home/ec2-user/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0
```

Signed-off-by: Ihar Hrachyshka
---
 .../torchtune/recipes/lora_finetuning_single_device.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
index 6e5ec6050..41387474f 100644
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@@ -549,10 +549,11 @@ class LoraFinetuningSingleDevice:
             checkpoints.append(checkpoint)
 
         # clean up the memory after training finishes
-        self._model.to("cpu")
+        if self._device.type != "cpu":
+            self._model.to("cpu")
+            torch.cuda.empty_cache()
         del self._model
         gc.collect()
-        torch.cuda.empty_cache()
 
         return (memory_stats, checkpoints)
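For reviewers who want to try the behavior outside the recipe, below is a
minimal, self-contained sketch of the guarded cleanup pattern this patch
introduces. The `device` and `model` variables are hypothetical stand-ins
for the recipe's `self._device` and `self._model`; the real attributes are
set up elsewhere in `LoraFinetuningSingleDevice`.

```python
import gc

import torch

# Hypothetical stand-ins for the recipe's self._device / self._model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.Linear(8, 8).to(device)

# ... training and checkpointing would happen here ...

# Clean up the memory after training finishes. Moving an already-CPU model
# to "cpu" is a no-op at best, and torch.cuda.empty_cache() has nothing to
# release when CUDA was never used, so both steps are guarded.
if device.type != "cpu":
    model.to("cpu")           # move the weights off the GPU first
    torch.cuda.empty_cache()  # return cached CUDA blocks to the driver
del model                     # drop the reference to the weights
gc.collect()                  # prompt Python to reclaim them now
```

Running the sketch on a CPU-only machine skips the guard entirely, which is
the point of the patch: only the reference drop and garbage collection are
needed when no GPU was involved.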