From 97d54778a37cd9e913eb614b49aaf4a85a859dea Mon Sep 17 00:00:00 2001
From: James Kunstle
Date: Thu, 20 Mar 2025 18:56:52 -0700
Subject: [PATCH] TMP test launching distributed training from inline provider

Signed-off-by: James Kunstle
---
 .../fullprecision_finetuning_multi_device.py | 17 ++++++++++++++---
 .../huggingface_ilab/recipes/train.py | 12 ++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 llama_stack/providers/inline/post_training/huggingface_ilab/recipes/train.py

diff --git a/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
index 57e41f753..0a3098aec 100644
--- a/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
+++ b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
@@ -1,4 +1,5 @@
 import asyncio
+import pathlib
 import tempfile
 import typing
 from asyncio import subprocess
@@ -288,9 +289,19 @@ class FullPrecisionFineTuning:
             set_subproc_ref_callback (Callable[[subprocess.Process], None]): Sets subprocess reference in 'Impl' class' ref to this job
         """

-        training_subproc = await asyncio.create_subprocess_shell(
-            'echo "yay Im running in a subprocess: $$"; sleep 5; echo "exiting subprocess $$"'
-        )
+        # train.py (the SPMD entry point) must live in the same directory as this file
+        train_file = pathlib.Path(__file__).resolve().parent / "train.py"
+        NGPU = 2
+
+        command = f"""
+        torchrun \
+            --nproc_per_node {NGPU} \
+            --rdzv_backend c10d \
+            --rdzv_endpoint="localhost:0" \
+            {train_file} \
+        """
+
+        training_subproc = await asyncio.create_subprocess_shell(cmd=command)
         set_subproc_ref_callback(training_subproc)
         await training_subproc.wait()
         set_status_callback(JobStatus.completed)
diff --git a/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/train.py b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/train.py
new file mode 100644
index 000000000..4c06e04c5
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/train.py
@@ -0,0 +1,12 @@
+import os
+import time
+
+CURRENT_LOCAL_RANK = os.getenv("LOCAL_RANK", "UNKNOWN")
+CURRENT_RANK = os.getenv("RANK", "UNKNOWN")
+CURRENT_WS = os.getenv("WORLD_SIZE", "UNKNOWN")
+
+print(f"Hello from training script! LR:({CURRENT_LOCAL_RANK}) R:({CURRENT_RANK}) WS:({CURRENT_WS})")
+
+for i in range(30):
+    print(f"LR:({CURRENT_LOCAL_RANK}) | {i}")
+    time.sleep(1)