fix: Correctly parse algorithm_config when launching NVIDIA customization job; fix internal request handler (#2025)

# What does this PR do?
This addresses 2 bugs I ran into when launching a fine-tuning job with
the NVIDIA Adapter:
1. Session handling in `_make_request` helper function returns an error.
```
INFO:     127.0.0.1:55831 - "POST /v1/post-training/supervised-fine-tune HTTP/1.1" 500 Internal Server Error
16:11:45.643 [END] /v1/post-training/supervised-fine-tune [StatusCode.OK] (270.44ms)
 16:11:45.643 [ERROR] Error executing endpoint route='/v1/post-training/supervised-fine-tune' method='post'
Traceback (most recent call last):
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 201, in endpoint
    return await maybe_await(value)
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 161, in maybe_await
    return await value
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 408, in supervised_fine_tune
    response = await self._make_request(
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 98, in _make_request
    async with self.session.request(method, url, params=params, json=json, **kwargs) as response:
  File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/client.py", line 1425, in __aenter__
    self._resp: _RetType = await self._coro
  File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/client.py", line 579, in _request
    handle = tm.start()
  File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/helpers.py", line 587, in start
    return self._loop.call_at(when, self.__call__)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 724, in call_at
    self._check_closed()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 510, in _check_closed
    raise RuntimeError('Event loop is closed')
RuntimeError: Event loop is closed
```
Note: This only occurred when initializing the client like so:
```
client = LlamaStackClient(
    base_url="http://0.0.0.0:8321"
)
response = client.post_training.supervised_fine_tune(...) # Returns error
```
I didn't run into this issue when using the library client:
```
client =  LlamaStackAsLibraryClient("nvidia")
client.initialize()
response = client.post_training.supervised_fine_tune(...) # Works fine
```

2. The `algorithm_config` param in `supervised_fine_tune` is parsed as a
`dict` when run from unit tests, but a Pydantic model when invoked using
the Llama Stack client. So, the call fails outside of unit tests:
```
INFO:     127.0.0.1:54024 - "POST /v1/post-training/supervised-fine-tune HTTP/1.1" 500 Internal Server Error
21:14:02.315 [END] /v1/post-training/supervised-fine-tune [StatusCode.OK] (71.18ms)
 21:14:02.314 [ERROR] Error executing endpoint route='/v1/post-training/supervised-fine-tune' method='post'
Traceback (most recent call last):
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 205, in endpoint
    return await maybe_await(value)
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 164, in maybe_await
    return await value
  File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 407, in supervised_fine_tune
    "adapter_dim": algorithm_config.get("adapter_dim"),
  File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/pydantic/main.py", line 891, in __getattr__
    raise AttributeError(f'{type(self).__name__!r} object has no attribute {item!r}')
AttributeError: 'LoraFinetuningConfig' object has no attribute 'get'
```
The code assumes `algorithm_config` should be `dict`, so I just handle
both cases.

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
1. I ran a local Llama Stack server with the necessary env vars:
```
lama stack run llama_stack/templates/nvidia/run.yaml --port 8321 --env ...
```
And invoked `supervised_fine_tune` to confirm neither of the errors
above occur.
```
client = LlamaStackClient(
    base_url="http://0.0.0.0:8321"
)
response = client.post_training.supervised_fine_tune(...)
```
2. I confirmed the unit tests still pass: `./scripts/unit-tests.sh
tests/unit/providers/nvidia/test_supervised_fine_tuning.py`

[//]: # (## Documentation)

---------

Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
This commit is contained in:
Jash Gulabrai 2025-04-25 16:21:50 -04:00 committed by GitHub
parent b5d8e44e81
commit 8713d67ce3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 83 additions and 66 deletions

View file

@ -10,14 +10,17 @@ import warnings
from unittest.mock import patch
import pytest
from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
from llama_stack_client.types.post_training_supervised_fine_tune_params import (
TrainingConfig,
TrainingConfigDataConfig,
TrainingConfigEfficiencyConfig,
TrainingConfigOptimizerConfig,
)
from llama_stack.apis.post_training.post_training import (
DataConfig,
DatasetFormat,
EfficiencyConfig,
LoraFinetuningConfig,
OptimizerConfig,
OptimizerType,
TrainingConfig,
)
from llama_stack.distribution.library_client import convert_pydantic_to_json_value
from llama_stack.providers.remote.post_training.nvidia.post_training import (
NvidiaPostTrainingAdapter,
NvidiaPostTrainingConfig,
@ -66,11 +69,8 @@ class TestNvidiaParameters(unittest.TestCase):
def test_customizer_parameters_passed(self):
"""Test scenario 1: When an optional parameter is passed and value is correctly set."""
custom_adapter_dim = 32 # Different from default of 8
algorithm_config = LoraFinetuningConfig(
type="LoRA",
adapter_dim=custom_adapter_dim,
adapter_dropout=0.2,
apply_lora_to_mlp=True,
apply_lora_to_output=True,
alpha=16,
@ -78,8 +78,15 @@ class TestNvidiaParameters(unittest.TestCase):
lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
data_config = TrainingConfigDataConfig(dataset_id="test-dataset", batch_size=16)
optimizer_config = TrainingConfigOptimizerConfig(lr=0.0002)
data_config = DataConfig(
dataset_id="test-dataset", batch_size=16, shuffle=False, data_format=DatasetFormat.instruct
)
optimizer_config = OptimizerConfig(
optimizer_type=OptimizerType.adam,
lr=0.0002,
weight_decay=0.01,
num_warmup_steps=100,
)
training_config = TrainingConfig(
n_epochs=3,
data_config=data_config,
@ -95,7 +102,7 @@ class TestNvidiaParameters(unittest.TestCase):
model="meta-llama/Llama-3.1-8B-Instruct",
checkpoint_dir="",
algorithm_config=algorithm_config,
training_config=training_config,
training_config=convert_pydantic_to_json_value(training_config),
logger_config={},
hyperparam_search_config={},
)
@ -114,7 +121,7 @@ class TestNvidiaParameters(unittest.TestCase):
self._assert_request_params(
{
"hyperparameters": {
"lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2, "alpha": 16},
"lora": {"alpha": 16},
"epochs": 3,
"learning_rate": 0.0002,
"batch_size": 16,
@ -130,8 +137,6 @@ class TestNvidiaParameters(unittest.TestCase):
algorithm_config = LoraFinetuningConfig(
type="LoRA",
adapter_dim=16,
adapter_dropout=0.1,
apply_lora_to_mlp=True,
apply_lora_to_output=True,
alpha=16,
@ -139,12 +144,16 @@ class TestNvidiaParameters(unittest.TestCase):
lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
data_config = TrainingConfigDataConfig(
dataset_id=required_dataset_id, # Required parameter
batch_size=8,
data_config = DataConfig(
dataset_id=required_dataset_id, batch_size=8, shuffle=False, data_format=DatasetFormat.instruct
)
optimizer_config = TrainingConfigOptimizerConfig(lr=0.0001)
optimizer_config = OptimizerConfig(
optimizer_type=OptimizerType.adam,
lr=0.0001,
weight_decay=0.01,
num_warmup_steps=100,
)
training_config = TrainingConfig(
n_epochs=1,
@ -161,7 +170,7 @@ class TestNvidiaParameters(unittest.TestCase):
model=required_model, # Required parameter
checkpoint_dir="",
algorithm_config=algorithm_config,
training_config=training_config,
training_config=convert_pydantic_to_json_value(training_config),
logger_config={},
hyperparam_search_config={},
)
@ -186,24 +195,24 @@ class TestNvidiaParameters(unittest.TestCase):
def test_unsupported_parameters_warning(self):
"""Test that warnings are raised for unsupported parameters."""
data_config = TrainingConfigDataConfig(
data_config = DataConfig(
dataset_id="test-dataset",
batch_size=8,
# Unsupported parameters
shuffle=True,
data_format="instruct",
data_format=DatasetFormat.instruct,
validation_dataset_id="val-dataset",
)
optimizer_config = TrainingConfigOptimizerConfig(
optimizer_config = OptimizerConfig(
lr=0.0001,
weight_decay=0.01,
# Unsupported parameters
optimizer_type="adam",
optimizer_type=OptimizerType.adam,
num_warmup_steps=100,
)
efficiency_config = TrainingConfigEfficiencyConfig(
efficiency_config = EfficiencyConfig(
enable_activation_checkpointing=True # Unsupported parameter
)
@ -230,15 +239,13 @@ class TestNvidiaParameters(unittest.TestCase):
checkpoint_dir="test-dir", # Unsupported parameter
algorithm_config=LoraFinetuningConfig(
type="LoRA",
adapter_dim=16,
adapter_dropout=0.1,
apply_lora_to_mlp=True,
apply_lora_to_output=True,
alpha=16,
rank=16,
lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
),
training_config=training_config,
training_config=convert_pydantic_to_json_value(training_config),
logger_config={"test": "value"}, # Unsupported parameter
hyperparam_search_config={"test": "value"}, # Unsupported parameter
)

View file

@ -10,14 +10,18 @@ import warnings
from unittest.mock import patch
import pytest
from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig, QatFinetuningConfig
from llama_stack_client.types.post_training_supervised_fine_tune_params import (
TrainingConfig,
TrainingConfigDataConfig,
TrainingConfigOptimizerConfig,
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.post_training.post_training import (
DataConfig,
DatasetFormat,
LoraFinetuningConfig,
OptimizerConfig,
OptimizerType,
QATFinetuningConfig,
TrainingConfig,
)
from llama_stack.distribution.library_client import convert_pydantic_to_json_value
from llama_stack.providers.remote.inference.nvidia.nvidia import NVIDIAConfig, NVIDIAInferenceAdapter
from llama_stack.providers.remote.post_training.nvidia.post_training import (
ListNvidiaPostTrainingJobs,
@ -121,7 +125,7 @@ class TestNvidiaPostTraining(unittest.TestCase):
"batch_size": 16,
"epochs": 2,
"learning_rate": 0.0001,
"lora": {"adapter_dim": 16, "adapter_dropout": 0.1},
"lora": {"alpha": 16},
},
"output_model": "default/job-1234",
"status": "created",
@ -132,8 +136,6 @@ class TestNvidiaPostTraining(unittest.TestCase):
algorithm_config = LoraFinetuningConfig(
type="LoRA",
adapter_dim=16,
adapter_dropout=0.1,
apply_lora_to_mlp=True,
apply_lora_to_output=True,
alpha=16,
@ -141,10 +143,15 @@ class TestNvidiaPostTraining(unittest.TestCase):
lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)
data_config = DataConfig(
dataset_id="sample-basic-test", batch_size=16, shuffle=False, data_format=DatasetFormat.instruct
)
optimizer_config = TrainingConfigOptimizerConfig(
optimizer_config = OptimizerConfig(
optimizer_type=OptimizerType.adam,
lr=0.0001,
weight_decay=0.01,
num_warmup_steps=100,
)
training_config = TrainingConfig(
@ -161,7 +168,7 @@ class TestNvidiaPostTraining(unittest.TestCase):
model="meta-llama/Llama-3.1-8B-Instruct",
checkpoint_dir="",
algorithm_config=algorithm_config,
training_config=training_config,
training_config=convert_pydantic_to_json_value(training_config),
logger_config={},
hyperparam_search_config={},
)
@ -185,16 +192,22 @@ class TestNvidiaPostTraining(unittest.TestCase):
"epochs": 2,
"batch_size": 16,
"learning_rate": 0.0001,
"lora": {"alpha": 16, "adapter_dim": 16, "adapter_dropout": 0.1},
"weight_decay": 0.01,
"lora": {"alpha": 16},
},
},
)
def test_supervised_fine_tune_with_qat(self):
algorithm_config = QatFinetuningConfig(type="QAT", quantizer_name="quantizer_name", group_size=1)
data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)
optimizer_config = TrainingConfigOptimizerConfig(
algorithm_config = QATFinetuningConfig(type="QAT", quantizer_name="quantizer_name", group_size=1)
data_config = DataConfig(
dataset_id="sample-basic-test", batch_size=16, shuffle=False, data_format=DatasetFormat.instruct
)
optimizer_config = OptimizerConfig(
optimizer_type=OptimizerType.adam,
lr=0.0001,
weight_decay=0.01,
num_warmup_steps=100,
)
training_config = TrainingConfig(
n_epochs=2,
@ -209,7 +222,7 @@ class TestNvidiaPostTraining(unittest.TestCase):
model="meta-llama/Llama-3.1-8B-Instruct",
checkpoint_dir="",
algorithm_config=algorithm_config,
training_config=training_config,
training_config=convert_pydantic_to_json_value(training_config),
logger_config={},
hyperparam_search_config={},
)