mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
Free up memory after post training finishes (#770)
## Context Currently, GPU memory remains occupied after training finishes. In this PR, we explicitly delete the model reference and clean up the memory after training finishes. ## Test Before the change, after training a Llama 3.2 3B model, >6GB of GPU memory was still occupied. After the change, after training a Llama 3.2 3B model, the GPU memory drops to ~1GB. <img width="156" alt="Screenshot 2025-01-14 at 6 05 17 PM" src="https://github.com/user-attachments/assets/45d212b1-a651-49f3-aad9-1c0a27fcebcf" />
This commit is contained in:
parent
b2b82d4a90
commit
52a21ce78f
1 changed file with 7 additions and 0 deletions
|
@ -4,6 +4,7 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
import gc
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
@ -580,6 +581,12 @@ class LoraFinetuningSingleDevice:
|
||||||
checkpoint.training_metrics = training_metrics
|
checkpoint.training_metrics = training_metrics
|
||||||
checkpoints.append(checkpoint)
|
checkpoints.append(checkpoint)
|
||||||
|
|
||||||
|
# clean up the memory after training finishes
|
||||||
|
self._model.to("cpu")
|
||||||
|
del self._model
|
||||||
|
gc.collect()
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
return (memory_stats, checkpoints)
|
return (memory_stats, checkpoints)
|
||||||
|
|
||||||
async def validation(self) -> Tuple[float, float]:
|
async def validation(self) -> Tuple[float, float]:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue