mirror of
				https://github.com/meta-llama/llama-stack.git
				synced 2025-10-25 09:05:37 +00:00 
			
		
		
		
	# What does this PR do? Adds an inline HF SFTTrainer provider. Alongside torchtune, this is a very popular option for running training jobs. The config allows a user to specify some key fields such as a model, chat_template, device, etc. The provider comes with one recipe, `finetune_single_device`, which works both with and without LoRA. Any model that is a valid HF identifier can be given, and the model will be pulled. This has been tested so far with the CPU and MPS device types, but should be compatible with CUDA out of the box. The provider processes the given dataset into the proper format, establishes the various steps per epoch, steps per save, and steps per eval, sets a sane SFTConfig, and runs n_epochs of training. If checkpoint_dir is None, no model is saved. If there is a checkpoint dir, a model is saved every `save_steps` and at the end of training. ## Test Plan Re-enabled the post_training integration test suite with a single test that loads the simpleqa dataset: https://huggingface.co/datasets/llamastack/simpleqa and a tiny granite model: https://huggingface.co/ibm-granite/granite-3.3-2b-instruct. The test now uses the llama stack client and the proper post_training API, and runs one step with a batch_size of 1. This test runs on CPU on the Ubuntu runner, so it needs to be a small batch and a single step. [//]: # (## Documentation) --------- Signed-off-by: Charlie Doern <cdoern@redhat.com>
		
			
				
	
	
		
			35 lines
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			35 lines
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright (c) Meta Platforms, Inc. and affiliates.
 | |
| # All rights reserved.
 | |
| #
 | |
| # This source code is licensed under the terms described in the LICENSE file in
 | |
| # the root directory of this source tree.
 | |
| 
 | |
| import gc
 | |
| 
 | |
| 
 | |
def evacuate_model_from_device(model, device: str):
    """Move a model off its accelerator and release cached device memory.

    The cleanup happens in three steps:

    1. Move the model to CPU if it is on a non-CPU device.
    2. Drop this function's reference to the model and run garbage
       collection.
    3. Ask the backend's caching allocator to release unused memory
       (CUDA, and MPS when the installed PyTorch exposes ``torch.mps``).

    Args:
        model: The PyTorch model to clear. ``None`` is tolerated so the
            function can be called unconditionally from cleanup paths.
        device: The device the model is currently on. Either a bare
            device type ('cuda', 'mps', 'cpu') or an indexed device such
            as 'cuda:0'. Any object whose ``str()`` has that form (for
            example a ``torch.device``) is accepted as well.

    Note:
        - ``del model`` only removes the local name. The model's memory
          is reclaimed only once the caller has dropped its own
          references too.
        - For CPU devices, nothing is moved and no cache is cleared; only
          garbage collection runs.
    """
    # Normalise "cuda:0" / torch.device("cuda", 0) down to the bare type so
    # indexed devices take the same cache-clearing path as plain "cuda".
    device_type = str(device).split(":", 1)[0]

    if model is not None and device_type != "cpu":
        model.to("cpu")

    del model
    gc.collect()

    if device_type == "cuda":
        # Imported lazily so torch is only required when the method is called.
        import torch

        torch.cuda.empty_cache()
    elif device_type == "mps":
        # Imported lazily so torch is only required when the method is called.
        import torch

        # torch.mps.empty_cache() only exists on PyTorch >= 2.0; skip quietly
        # on older versions rather than failing during cleanup.
        mps_module = getattr(torch, "mps", None)
        if mps_module is not None and hasattr(mps_module, "empty_cache"):
            mps_module.empty_cache()
 |