diff --git a/hf-ilab-build.yaml b/hf-ilab-build.yaml
new file mode 100644
index 000000000..393646696
--- /dev/null
+++ b/hf-ilab-build.yaml
@@ -0,0 +1,25 @@
+version: "2"
+distribution_spec:
+  description: Use (an external) Ollama server for running LLM inference
+  providers:
+    inference:
+    - remote::ollama
+    vector_io:
+    - inline::faiss
+    safety:
+    - inline::llama-guard
+    telemetry:
+    - inline::meta-reference
+    agents:
+    - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - inline::localfs
+    scoring:
+    - inline::llm-as-judge
+    tool_runtime:
+    - remote::brave-search
+    post_training:
+    - inline::huggingface-ilab
+image_type: venv
diff --git a/llama_stack/providers/inline/post_training/huggingface_ilab/post_training.py b/llama_stack/providers/inline/post_training/huggingface_ilab/post_training.py
index 6f8f39e4b..e4ece902c 100644
--- a/llama_stack/providers/inline/post_training/huggingface_ilab/post_training.py
+++ b/llama_stack/providers/inline/post_training/huggingface_ilab/post_training.py
@@ -63,8 +63,10 @@ class HFilabPostTrainingImpl:
         if self.current_job is None:
             return True
 
-        finalized_job_states = [JobStatus.completed.value, JobStatus.failed.value]
-        if self.current_job.status in finalized_job_states:
+        finalized_job_states = [JobStatus.completed, JobStatus.failed]
+
+        # check most recent status of job.
+        if self.current_job.status[-1] in finalized_job_states:
             return True
 
         return False
@@ -87,7 +89,8 @@ class HFilabPostTrainingImpl:
         checkpoint_dir: Optional[str],
         algorithm_config: Optional[AlgorithmConfig],
     ) -> JSONResponse:
-        if not self.can_schedule_new_job():
+        if not await self.can_schedule_new_job():
+            # TODO: this status code isn't making its way up to the user. User just getting 500 from SDK.
             raise fastapi.HTTPException(
                 status_code=503,  # service unavailable, try again later.
                 detail="A tuning job is currently running; this could take a while.",
diff --git a/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
index c810ae5b7..57e41f753 100644
--- a/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
+++ b/llama_stack/providers/inline/post_training/huggingface_ilab/recipes/fullprecision_finetuning_multi_device.py
@@ -7,6 +7,7 @@ from typing import Callable
 
 import datasets
 import transformers
+from termcolor import cprint
 from transformers.configuration_utils import PretrainedConfig
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
@@ -72,6 +73,10 @@
     def logs_dir(self):
         return self.storage_dir / "logs"
 
+    @property
+    def hfcache_dir(self):
+        return self.storage_dir / "hf_cache"
+
     @staticmethod
     def check_model_arch_validated(model_config: PretrainedConfig) -> bool:
         """Check whether input model architecture from config is among the pre-validated architectures.
@@ -98,7 +103,9 @@
             PretrainedConfig: model config associated with model.
         """
         try:
-            model_config: PretrainedConfig = transformers.AutoConfig.from_pretrained(self.model_name_or_path)
+            model_config: PretrainedConfig = transformers.AutoConfig.from_pretrained(
+                self.model_name_or_path, cache_dir=self.hfcache_dir
+            )
         except OSError:
             print(
                 f"Attempted to load model config for ({self.model_name_or_path}) but failed. Model config will be loaded by `AutoConfig.from_pretrained()`"
@@ -115,7 +122,7 @@
         """
         try:
             tokenizer: SomePretrainedTokenizer = transformers.AutoTokenizer.from_pretrained(
-                self.model_name_or_path, use_fast=True
+                self.model_name_or_path, use_fast=True, cache_dir=self.hfcache_dir
             )
         except OSError:
             print(
@@ -150,6 +157,9 @@
             dataset_id=self.training_config.data_config.dataset_id, rows_in_page=-1
         )
         self.loaded_dataset = dataset.rows
+        cprint(
+            f"Dataset loaded! len: ({len(self.loaded_dataset)}), example row: ({self.loaded_dataset[0]})", color="cyan"
+        )
 
     def preflight(self, set_status_callback: Callable[[JobStatus], None]):
         """Set of checks that should run before any heavier-weight preprocessing runs to validate starting state.
@@ -175,19 +185,22 @@
             RuntimeError: If tokenizer doesn't have chat template available.
             OSError: Can be raised via this function if config or tokenizer not available via model's name.
         """
-        model_config = self.__try_load_config()
+        cprint("Loaded model config", color="cyan")
         if not self.check_model_arch_validated(model_config=model_config):
             # could raise Error if we need a strong check against this.
             print(
                 f"Input model ({self.model_name_or_path}) architecture ({model_config.architectures}) is not among validated architectures."
             )
+        cprint("Validated model config", color="cyan")
 
         model_tokenizer = self.__try_load_tokenizer()
+        cprint("Loaded model tokenizer", color="cyan")
 
         if not self.check_tokenizer_has_chat_template(model_tokenizer):
             raise RuntimeError(
                 f"Input model ({self.model_name_or_path})'s tokenizer ({model_tokenizer.__name__}) has no chat template from associated `tokenizer_config.json`"
             )
+        cprint("Validated model tokenizer", color="cyan")
 
         try:
             _ = model_tokenizer.apply_chat_template(self.loaded_dataset[0]["messages"])
@@ -198,6 +211,8 @@
                 )
                 raise
 
+        cprint("Model tokenizer applied template to row.", color="cyan")
+
         # Success! Preflight checks haven't caught any immediate problems.
         set_status_callback(JobStatus.scheduled)
 
@@ -221,7 +236,6 @@
         Returns:
             dict[str, list[int]]: Of shape {input_ids, labels, attention_mask, loss_mask}
         """
-        input_ids = tokenizer.apply_chat_template(conversation=sample, tokenize=True)
         input_ids = typing.cast(
             list[int], input_ids
         )
@@ -243,10 +257,17 @@
         """
         dataset = datasets.Dataset.from_list(self.loaded_dataset)
+        cprint(f"Dataset loaded. Example row: ({dataset[0]})", color="cyan")
 
         model_tok = self.__try_load_tokenizer()
+        cprint("Tokenizer loaded.", color="cyan")
 
         # NOTE: not implementing as batched for the moment; need to know how batching impacts memory usage on machine.
-        dataset = dataset.map(lambda x: self.__tokenize_and_generate_labels_and_mask(tokenizer=model_tok, sample=x))
+        dataset = dataset.map(
+            lambda x: self.__tokenize_and_generate_labels_and_mask(
+                tokenizer=model_tok, sample=x["messages"]
+            )  # TODO: get this key from input dataset schema
+        )
+        dataset = dataset.remove_columns(column_names=["messages"])
         return dataset
 
     def setup(self):
@@ -267,8 +288,8 @@
             set_subproc_ref_callback (Callable[[subprocess.Process], None]): Sets subprocess reference in 'Impl' class' ref to this job
         """
 
-        training_subproc = await asyncio.create_subprocess_exec(
-            "echo 'yay Im running in a subprocess: $$'; sleep 30; echo 'exiting process $$'"
+        training_subproc = await asyncio.create_subprocess_shell(
+            'echo "yay Im running in a subprocess: $$"; sleep 5; echo "exiting subprocess $$"'
         )
         set_subproc_ref_callback(training_subproc)
         await training_subproc.wait()
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index ca4dc59f7..2529f2c94 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -83,7 +83,7 @@
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="ollama",
-            pip_packages=["ollama", "aiohttp"],
+            pip_packages=["ollama", "aiohttp", "openai"],
            config_class="llama_stack.providers.remote.inference.ollama.OllamaImplConfig",
            module="llama_stack.providers.remote.inference.ollama",
        ),
diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py
index 4fd7122af..4e1ac4b3e 100644
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.post_training,
             provider_type="inline::torchtune",
-            pip_packages=["torch", "torchtune==0.5.0", "torchao==0.8.0", "numpy"],
+            pip_packages=["torch", "torchtune==0.5.0", "torchao==0.8.0", "numpy", "openai"],
             module="llama_stack.providers.inline.post_training.torchtune",
             config_class="llama_stack.providers.inline.post_training.torchtune.TorchtunePostTrainingConfig",
             api_dependencies=[
@@ -25,7 +25,7 @@
         InlineProviderSpec(
             api=Api.post_training,
             provider_type="inline::huggingface-ilab",
-            pip_packages=["torch", "transformers", "datasets", "numpy"],
+            pip_packages=["torch", "transformers", "datasets", "numpy", "openai"],
             module="llama_stack.providers.inline.post_training.huggingface_ilab",
             config_class="llama_stack.providers.inline.post_training.huggingface_ilab.HFilabPostTrainingConfig",
             api_dependencies=[
diff --git a/llama_stack/providers/utils/datasetio/url_utils.py b/llama_stack/providers/utils/datasetio/url_utils.py
index 386ee736d..7fad0ae5a 100644
--- a/llama_stack/providers/utils/datasetio/url_utils.py
+++ b/llama_stack/providers/utils/datasetio/url_utils.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import asyncio
 import base64
 import io
 from urllib.parse import unquote
@@ -17,12 +16,11 @@ from llama_stack.providers.utils.memory.vector_store import parse_data_url
 async def get_dataframe_from_uri(uri: str):
     df = None
     if uri.endswith(".csv"):
-        # Moving to its own thread to avoid io from blocking the eventloop
-        # This isn't ideal as it moves more then just the IO to a new thread
-        # but it is as close as we can easly get
-        df = await asyncio.to_thread(pandas.read_csv, uri)
+        df = pandas.read_csv(uri)
     elif uri.endswith(".xlsx"):
-        df = await asyncio.to_thread(pandas.read_excel, uri)
+        df = pandas.read_excel(uri)
+    elif uri.endswith(".jsonl"):
+        df = pandas.read_json(uri, lines=True)
     elif uri.startswith("data:"):
         parts = parse_data_url(uri)
         data = parts["data"]