fix: Handle case when Customizer Job status is unknown (#1965)

# What does this PR do?
This PR handles the case where a Customization Job's status is
`unknown`. Previously, `unknown` was not mapped to a valid `JobStatus`,
so the PostTraining provider threw an exception when fetching or listing
such a job. It is now mapped to `JobStatus.scheduled`.

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
Running `./scripts/unit-tests.sh tests/unit/providers/nvidia/test_supervised_fine_tuning.py` succeeds.

[//]: # (## Documentation)

Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
This commit is contained in:
Jash Gulabrai 2025-04-17 04:27:07 -04:00 committed by GitHub
parent 6f97f9a593
commit 45e08ff417
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 44 additions and 30 deletions

View file

@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES
# Map API status to JobStatus enum # Map API status to JobStatus enum
STATUS_MAPPING = { STATUS_MAPPING = {
"running": "in_progress", "running": JobStatus.in_progress.value,
"completed": "completed", "completed": JobStatus.completed.value,
"failed": "failed", "failed": JobStatus.failed.value,
"cancelled": "cancelled", "cancelled": JobStatus.cancelled.value,
"pending": "scheduled", "pending": JobStatus.scheduled.value,
"unknown": JobStatus.scheduled.value,
} }

View file

@ -200,35 +200,48 @@ class TestNvidiaPostTraining(unittest.TestCase):
) )
def test_get_training_job_status(self): def test_get_training_job_status(self):
self.mock_make_request.return_value = { customizer_status_to_job_status = [
"created_at": "2024-12-09T04:06:28.580220", ("running", "in_progress"),
"updated_at": "2024-12-09T04:21:19.852832", ("completed", "completed"),
"status": "completed", ("failed", "failed"),
"steps_completed": 1210, ("cancelled", "cancelled"),
"epochs_completed": 2, ("pending", "scheduled"),
"percentage_done": 100.0, ("unknown", "scheduled"),
"best_epoch": 2, ]
"train_loss": 1.718016266822815,
"val_loss": 1.8661999702453613,
}
job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2" for customizer_status, expected_status in customizer_status_to_job_status:
with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
self.mock_make_request.return_value = {
"created_at": "2024-12-09T04:06:28.580220",
"updated_at": "2024-12-09T04:21:19.852832",
"status": customizer_status,
"steps_completed": 1210,
"epochs_completed": 2,
"percentage_done": 100.0,
"best_epoch": 2,
"train_loss": 1.718016266822815,
"val_loss": 1.8661999702453613,
}
status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id)) job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
assert isinstance(status, NvidiaPostTrainingJobStatusResponse) status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
assert status.status.value == "completed"
assert status.steps_completed == 1210
assert status.epochs_completed == 2
assert status.percentage_done == 100.0
assert status.best_epoch == 2
assert status.train_loss == 1.718016266822815
assert status.val_loss == 1.8661999702453613
self.mock_make_request.assert_called_once() assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
self._assert_request( assert status.status.value == expected_status
self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id} assert status.steps_completed == 1210
) assert status.epochs_completed == 2
assert status.percentage_done == 100.0
assert status.best_epoch == 2
assert status.train_loss == 1.718016266822815
assert status.val_loss == 1.8661999702453613
self._assert_request(
self.mock_make_request,
"GET",
f"/v1/customization/jobs/{job_id}/status",
expected_params={"job_id": job_id},
)
def test_get_training_jobs(self): def test_get_training_jobs(self):
job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2" job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"