diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index e61c0e4e4..48dad5cd2 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -8,7 +8,7 @@ from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Protocol
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Annotated
 
 from llama_stack.apis.common.content_types import URL
@@ -71,6 +71,7 @@ class TrainingConfig(BaseModel):
 
 @json_schema_type
 class LoraFinetuningConfig(BaseModel):
+    model_config = ConfigDict(extra="allow")
     type: Literal["LoRA"] = "LoRA"
     lora_attn_modules: List[str]
     apply_lora_to_mlp: bool
diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py
index b4b063144..4d10fcf3b 100644
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -26,7 +26,7 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.post_training,
             adapter=AdapterSpec(
                 adapter_type="nvidia",
-                pip_packages=["requests"],
+                pip_packages=["requests", "aiohttp"],
                 module="llama_stack.providers.remote.post_training.nvidia",
                 config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
             ),
diff --git a/llama_stack/providers/remote/post_training/nvidia/post_training.py b/llama_stack/providers/remote/post_training/nvidia/post_training.py
index b1221335e..f8766cdbb 100644
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@@ -14,7 +14,7 @@ from llama_stack.apis.post_training import (
     AlgorithmConfig,
     DPOAlignmentConfig,
     JobStatus,
-    PostTraining,
+    LoraFinetuningConfig,
     PostTrainingJob,
     PostTrainingJobArtifactsResponse,
     PostTrainingJobStatusResponse,
@@ -53,7 +53,11 @@ class ListNvidiaPostTrainingJobs(BaseModel):
     data: List[NvidiaPostTrainingJob]
 
 
-class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
+class NvidiaPostTrainingJobStatusResponse(PostTrainingJobStatusResponse):
+    model_config = ConfigDict(extra="allow")
+
+
+class NvidiaPostTrainingAdapter(ModelRegistryHelper):
     def __init__(self, config: NvidiaPostTrainingConfig):
         self.config = config
         self.headers = {}
@@ -146,7 +150,7 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
 
         return ListNvidiaPostTrainingJobs(data=jobs)
 
-    async def get_training_job_status(self, job_uuid: str) -> Optional[NvidiaPostTrainingJob]:
+    async def get_training_job_status(self, job_uuid: str) -> NvidiaPostTrainingJobStatusResponse:
         """Get the status of a customization job.
 
         Updated the base class return type from PostTrainingJobResponse to NvidiaPostTrainingJob.
@@ -175,10 +179,10 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
         api_status = response.pop("status").lower()
         mapped_status = STATUS_MAPPING.get(api_status, "unknown")
 
-        return NvidiaPostTrainingJob(
+        return NvidiaPostTrainingJobStatusResponse(
             status=JobStatus(mapped_status),
             job_uuid=job_uuid,
-            created_at=datetime.fromisoformat(response.pop("created_at")),
+            started_at=datetime.fromisoformat(response.pop("created_at")),
             updated_at=datetime.fromisoformat(response.pop("updated_at")),
             **response,
         )
@@ -188,10 +192,10 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
             method="POST", path=f"/v1/customization/jobs/{job_uuid}/cancel", params={"job_id": job_uuid}
         )
 
-    async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
         raise NotImplementedError("Job artifacts are not implemented yet")
 
-    async def get_post_training_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
+    async def get_post_training_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
         raise NotImplementedError("Job artifacts are not implemented yet")
 
     async def supervised_fine_tune(
@@ -389,14 +393,14 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
 
         # Handle LoRA-specific configuration
         if algorithm_config:
-            if isinstance(algorithm_config, dict) and algorithm_config.get("type") == "LoRA":
+            if isinstance(algorithm_config, LoraFinetuningConfig) and algorithm_config.type == "LoRA":
                 warn_unsupported_params(algorithm_config, supported_params["lora_config"], "LoRA config")
                 job_config["hyperparameters"]["lora"] = {
                     k: v
                     for k, v in {
-                        "adapter_dim": algorithm_config.get("adapter_dim"),
-                        "alpha": algorithm_config.get("alpha"),
-                        "adapter_dropout": algorithm_config.get("adapter_dropout"),
+                        "adapter_dim": getattr(algorithm_config, "adapter_dim", None),
+                        "alpha": getattr(algorithm_config, "alpha", None),
+                        "adapter_dropout": getattr(algorithm_config, "adapter_dropout", None),
                     }.items()
                     if v is not None
                 }
@@ -432,5 +436,5 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
         """Optimize a model based on preference data."""
         raise NotImplementedError("Preference optimization is not implemented yet")
 
-    async def get_training_job_container_logs(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]:
+    async def get_training_job_container_logs(self, job_uuid: str) -> PostTrainingJobStatusResponse:
         raise NotImplementedError("Job logs are not implemented yet")
diff --git a/tests/unit/providers/nvidia/test_parameters.py b/tests/unit/providers/nvidia/test_parameters.py
index db95a03c0..2cd5a42e2 100644
--- a/tests/unit/providers/nvidia/test_parameters.py
+++ b/tests/unit/providers/nvidia/test_parameters.py
@@ -61,6 +61,11 @@ class TestNvidiaParameters(unittest.TestCase):
             type="LoRA",
             adapter_dim=custom_adapter_dim,  # Custom value
             adapter_dropout=0.2,  # Custom value
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
         )
 
         data_config = TrainingConfigDataConfig(dataset_id="test-dataset", batch_size=16)
@@ -84,7 +89,7 @@ class TestNvidiaParameters(unittest.TestCase):
         self._assert_request_params(
             {
                 "hyperparameters": {
-                    "lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2},
+                    "lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2, "alpha": 16},
                     "epochs": 3,
"learning_rate": 0.0002, "batch_size": 16, @@ -98,7 +103,16 @@ class TestNvidiaParameters(unittest.TestCase): required_dataset_id = "required-dataset" required_job_uuid = "required-job" - algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=8) + algorithm_config = LoraFinetuningConfig( + type="LoRA", + adapter_dim=16, + adapter_dropout=0.1, + apply_lora_to_mlp=True, + apply_lora_to_output=True, + alpha=16, + rank=16, + lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + ) data_config = TrainingConfigDataConfig( dataset_id=required_dataset_id, # Required parameter @@ -173,7 +187,16 @@ class TestNvidiaParameters(unittest.TestCase): job_uuid="test-job", model="meta-llama/Llama-3.1-8B-Instruct", checkpoint_dir="test-dir", # Unsupported parameter - algorithm_config=LoraFinetuningConfig(type="LoRA"), + algorithm_config=LoraFinetuningConfig( + type="LoRA", + adapter_dim=16, + adapter_dropout=0.1, + apply_lora_to_mlp=True, + apply_lora_to_output=True, + alpha=16, + rank=16, + lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + ), training_config=training_config, logger_config={"test": "value"}, # Unsupported parameter hyperparam_search_config={"test": "value"}, # Unsupported parameter diff --git a/tests/unit/providers/nvidia/test_supervised_fine_tuning.py b/tests/unit/providers/nvidia/test_supervised_fine_tuning.py index dfdca39d1..ecb1cd0b8 100644 --- a/tests/unit/providers/nvidia/test_supervised_fine_tuning.py +++ b/tests/unit/providers/nvidia/test_supervised_fine_tuning.py @@ -102,7 +102,16 @@ class TestNvidiaPostTraining(unittest.TestCase): "ownership": {"created_by": "me", "access_policies": {}}, } - algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16, adapter_dropout=0.1) + algorithm_config = LoraFinetuningConfig( + type="LoRA", + adapter_dim=16, + adapter_dropout=0.1, + apply_lora_to_mlp=True, + apply_lora_to_output=True, + alpha=16, + rank=16, + lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + ) data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16) @@ -147,7 +156,7 @@ class TestNvidiaPostTraining(unittest.TestCase): "epochs": 2, "batch_size": 16, "learning_rate": 0.0001, - "lora": {"adapter_dim": 16, "adapter_dropout": 0.1}, + "lora": {"alpha": 16, "adapter_dim": 16, "adapter_dropout": 0.1}, }, }, ) @@ -277,7 +286,15 @@ class TestNvidiaPostTraining(unittest.TestCase): "output_model": "default/job-1234", } - algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16, adapter_dropout=0.1) + algorithm_config = LoraFinetuningConfig( + alpha=16, + rank=16, + type="LoRA", + adapter_dim=16, + adapter_dropout=0.1, + apply_lora_to_mlp=True, + apply_lora_to_output=True, + ) data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)