fix changes post merge

Author: raspawar
Date:   2025-03-21 18:09:17 +05:30
Parent: e95b1e9739
Commit: e4b39aacb8
5 changed files with 65 additions and 20 deletions

@@ -8,7 +8,7 @@ from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Protocol
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Annotated
 
 from llama_stack.apis.common.content_types import URL
@@ -71,6 +71,7 @@ class TrainingConfig(BaseModel):
 
 @json_schema_type
 class LoraFinetuningConfig(BaseModel):
+    model_config = ConfigDict(extra="allow")
     type: Literal["LoRA"] = "LoRA"
     lora_attn_modules: List[str]
     apply_lora_to_mlp: bool
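
Note: the new model_config = ConfigDict(extra="allow") line means LoraFinetuningConfig will accept and keep fields beyond the ones it declares, which is what lets LoRA knobs such as adapter_dim flow through to the NVIDIA adapter. A minimal sketch of that pydantic v2 behavior, using a stand-in class (StandInLoraConfig is illustrative, not the real model, and mirrors only the fields visible in this hunk):

    # Sketch only: StandInLoraConfig mirrors the fields shown in this hunk, not the real class.
    # Assumes pydantic v2 (ConfigDict).
    from typing import List, Literal

    from pydantic import BaseModel, ConfigDict


    class StandInLoraConfig(BaseModel):
        model_config = ConfigDict(extra="allow")  # undeclared fields are kept instead of rejected

        type: Literal["LoRA"] = "LoRA"
        lora_attn_modules: List[str]
        apply_lora_to_mlp: bool


    cfg = StandInLoraConfig(
        lora_attn_modules=["q_proj", "v_proj"],
        apply_lora_to_mlp=True,
        adapter_dim=16,  # not declared above; accepted because extra="allow"
    )
    print(getattr(cfg, "adapter_dim", None))  # 16
    print(getattr(cfg, "alpha", None))        # None, because it was never provided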

@@ -26,7 +26,7 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.post_training,
             adapter=AdapterSpec(
                 adapter_type="nvidia",
-                pip_packages=["requests"],
+                pip_packages=["requests", "aiohttp"],
                 module="llama_stack.providers.remote.post_training.nvidia",
                 config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
             ),

@@ -14,7 +14,7 @@ from llama_stack.apis.post_training import (
     AlgorithmConfig,
     DPOAlignmentConfig,
     JobStatus,
-    PostTraining,
+    LoraFinetuningConfig,
     PostTrainingJob,
     PostTrainingJobArtifactsResponse,
     PostTrainingJobStatusResponse,
@@ -53,7 +53,11 @@ class ListNvidiaPostTrainingJobs(BaseModel):
     data: List[NvidiaPostTrainingJob]
 
 
-class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
+class NvidiaPostTrainingJobStatusResponse(PostTrainingJobStatusResponse):
+    model_config = ConfigDict(extra="allow")
+
+
+class NvidiaPostTrainingAdapter(ModelRegistryHelper):
     def __init__(self, config: NvidiaPostTrainingConfig):
         self.config = config
         self.headers = {}
@@ -146,7 +150,7 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
         return ListNvidiaPostTrainingJobs(data=jobs)
 
-    async def get_training_job_status(self, job_uuid: str) -> Optional[NvidiaPostTrainingJob]:
+    async def get_training_job_status(self, job_uuid: str) -> NvidiaPostTrainingJobStatusResponse:
         """Get the status of a customization job.
 
         Updated the base class return type from PostTrainingJobResponse to NvidiaPostTrainingJob.
 
@@ -175,10 +179,10 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
         api_status = response.pop("status").lower()
         mapped_status = STATUS_MAPPING.get(api_status, "unknown")
 
-        return NvidiaPostTrainingJob(
+        return NvidiaPostTrainingJobStatusResponse(
             status=JobStatus(mapped_status),
             job_uuid=job_uuid,
-            created_at=datetime.fromisoformat(response.pop("created_at")),
+            started_at=datetime.fromisoformat(response.pop("created_at")),
             updated_at=datetime.fromisoformat(response.pop("updated_at")),
             **response,
         )
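
Note: the hunk above renames the raw created_at field to started_at and forwards every other key from the customization-service response via **response, which only works because NvidiaPostTrainingJobStatusResponse allows extra fields. A rough sketch of the same mapping step, assuming simplified stand-ins for JobStatus, STATUS_MAPPING, and the response payload (none of these are the exact llama_stack or service definitions):

    # Sketch only: JobStatusStub, STATUS_MAPPING_STUB and the response dict are simplified
    # stand-ins, not the exact llama_stack / NeMo service definitions.
    from datetime import datetime
    from enum import Enum

    from pydantic import BaseModel, ConfigDict


    class JobStatusStub(str, Enum):
        completed = "completed"
        in_progress = "in_progress"
        failed = "failed"


    STATUS_MAPPING_STUB = {"completed": "completed", "running": "in_progress", "failed": "failed"}


    class StatusResponseStub(BaseModel):
        model_config = ConfigDict(extra="allow")  # service-specific keys ride along

        job_uuid: str
        status: JobStatusStub
        started_at: datetime
        updated_at: datetime


    response = {
        "status": "RUNNING",
        "created_at": "2025-03-21T12:00:00",
        "updated_at": "2025-03-21T12:05:00",
        "steps_completed": 100,  # example of an extra, service-specific key
    }
    api_status = response.pop("status").lower()
    mapped_status = STATUS_MAPPING_STUB.get(api_status, "unknown")

    job = StatusResponseStub(
        status=JobStatusStub(mapped_status),
        job_uuid="job-1234",
        started_at=datetime.fromisoformat(response.pop("created_at")),
        updated_at=datetime.fromisoformat(response.pop("updated_at")),
        **response,  # steps_completed is preserved thanks to extra="allow"
    )
    print(job.steps_completed)  # 100
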
@@ -188,10 +192,10 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
             method="POST", path=f"/v1/customization/jobs/{job_uuid}/cancel", params={"job_id": job_uuid}
         )
 
-    async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
         raise NotImplementedError("Job artifacts are not implemented yet")
 
-    async def get_post_training_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
+    async def get_post_training_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
         raise NotImplementedError("Job artifacts are not implemented yet")
 
     async def supervised_fine_tune(
@@ -389,14 +393,14 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
 
         # Handle LoRA-specific configuration
         if algorithm_config:
-            if isinstance(algorithm_config, dict) and algorithm_config.get("type") == "LoRA":
+            if isinstance(algorithm_config, LoraFinetuningConfig) and algorithm_config.type == "LoRA":
                 warn_unsupported_params(algorithm_config, supported_params["lora_config"], "LoRA config")
                 job_config["hyperparameters"]["lora"] = {
                     k: v
                     for k, v in {
-                        "adapter_dim": algorithm_config.get("adapter_dim"),
-                        "alpha": algorithm_config.get("alpha"),
-                        "adapter_dropout": algorithm_config.get("adapter_dropout"),
+                        "adapter_dim": getattr(algorithm_config, "adapter_dim", None),
+                        "alpha": getattr(algorithm_config, "alpha", None),
+                        "adapter_dropout": getattr(algorithm_config, "adapter_dropout", None),
                     }.items()
                     if v is not None
                 }
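
Note: the comprehension above keeps only LoRA fields that were actually provided; getattr returns None for anything unset, so unset knobs never reach the customization job payload. A stand-alone illustration of the same filtering pattern (SimpleNamespace is just a placeholder for the parsed algorithm_config):

    # Stand-alone illustration of the None-filtering pattern; SimpleNamespace is a placeholder
    # for the parsed algorithm_config.
    from types import SimpleNamespace

    algorithm_config = SimpleNamespace(type="LoRA", adapter_dim=16, alpha=16)  # no adapter_dropout

    lora = {
        k: v
        for k, v in {
            "adapter_dim": getattr(algorithm_config, "adapter_dim", None),
            "alpha": getattr(algorithm_config, "alpha", None),
            "adapter_dropout": getattr(algorithm_config, "adapter_dropout", None),
        }.items()
        if v is not None
    }
    print(lora)  # {'adapter_dim': 16, 'alpha': 16} -- adapter_dropout is dropped
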
@@ -432,5 +436,5 @@ class NvidiaPostTrainingAdapter(PostTraining, ModelRegistryHelper):
         """Optimize a model based on preference data."""
         raise NotImplementedError("Preference optimization is not implemented yet")
 
-    async def get_training_job_container_logs(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]:
+    async def get_training_job_container_logs(self, job_uuid: str) -> PostTrainingJobStatusResponse:
         raise NotImplementedError("Job logs are not implemented yet")

@@ -61,6 +61,11 @@ class TestNvidiaParameters(unittest.TestCase):
             type="LoRA",
             adapter_dim=custom_adapter_dim,  # Custom value
             adapter_dropout=0.2,  # Custom value
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
         )
 
         data_config = TrainingConfigDataConfig(dataset_id="test-dataset", batch_size=16)
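
Note: the extra keyword arguments added throughout these test fixtures are needed because LoraFinetuningConfig declares fields such as lora_attn_modules and apply_lora_to_mlp without defaults, so pydantic rejects the old, shorter constructor calls. A minimal sketch of that validation behavior with a stand-in model that mirrors only the fields visible in this diff:

    # Sketch only: the stand-in model mirrors just the required fields visible in this diff;
    # the real LoraFinetuningConfig declares more.
    from typing import List, Literal

    from pydantic import BaseModel, ConfigDict, ValidationError


    class StandInLoraConfig(BaseModel):
        model_config = ConfigDict(extra="allow")

        type: Literal["LoRA"] = "LoRA"
        lora_attn_modules: List[str]  # required: no default
        apply_lora_to_mlp: bool       # required: no default


    try:
        StandInLoraConfig(type="LoRA", adapter_dim=8)  # old-style call: required fields missing
    except ValidationError as exc:
        print(exc.error_count())  # 2 -- lora_attn_modules and apply_lora_to_mlp are missing
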
@@ -84,7 +89,7 @@ class TestNvidiaParameters(unittest.TestCase):
         self._assert_request_params(
             {
                 "hyperparameters": {
-                    "lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2},
+                    "lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2, "alpha": 16},
                     "epochs": 3,
                     "learning_rate": 0.0002,
                     "batch_size": 16,
@@ -98,7 +103,16 @@ class TestNvidiaParameters(unittest.TestCase):
         required_dataset_id = "required-dataset"
         required_job_uuid = "required-job"
 
-        algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=8)
+        algorithm_config = LoraFinetuningConfig(
+            type="LoRA",
+            adapter_dim=16,
+            adapter_dropout=0.1,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        )
 
         data_config = TrainingConfigDataConfig(
             dataset_id=required_dataset_id,  # Required parameter
@@ -173,7 +187,16 @@ class TestNvidiaParameters(unittest.TestCase):
                 job_uuid="test-job",
                 model="meta-llama/Llama-3.1-8B-Instruct",
                 checkpoint_dir="test-dir",  # Unsupported parameter
-                algorithm_config=LoraFinetuningConfig(type="LoRA"),
+                algorithm_config=LoraFinetuningConfig(
+                    type="LoRA",
+                    adapter_dim=16,
+                    adapter_dropout=0.1,
+                    apply_lora_to_mlp=True,
+                    apply_lora_to_output=True,
+                    alpha=16,
+                    rank=16,
+                    lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+                ),
                 training_config=training_config,
                 logger_config={"test": "value"},  # Unsupported parameter
                 hyperparam_search_config={"test": "value"},  # Unsupported parameter

@@ -102,7 +102,16 @@ class TestNvidiaPostTraining(unittest.TestCase):
             "ownership": {"created_by": "me", "access_policies": {}},
         }
 
-        algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16, adapter_dropout=0.1)
+        algorithm_config = LoraFinetuningConfig(
+            type="LoRA",
+            adapter_dim=16,
+            adapter_dropout=0.1,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        )
 
         data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)
@@ -147,7 +156,7 @@ class TestNvidiaPostTraining(unittest.TestCase):
                 "epochs": 2,
                 "batch_size": 16,
                 "learning_rate": 0.0001,
-                "lora": {"adapter_dim": 16, "adapter_dropout": 0.1},
+                "lora": {"alpha": 16, "adapter_dim": 16, "adapter_dropout": 0.1},
             },
         },
     )
@@ -277,7 +286,15 @@ class TestNvidiaPostTraining(unittest.TestCase):
             "output_model": "default/job-1234",
         }
 
-        algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16, adapter_dropout=0.1)
+        algorithm_config = LoraFinetuningConfig(
+            alpha=16,
+            rank=16,
+            type="LoRA",
+            adapter_dim=16,
+            adapter_dropout=0.1,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+        )
 
         data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)