diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index fe52a0abb..68ee837bf 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -439,8 +439,8 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
 
         # infer provider from source
         if metadata:
-            if metadata.get("provider"):
-                provider_id = metadata.get("provider")  # pass through from nvidia datasetio
+            if metadata.get("provider_id"):
+                provider_id = metadata.get("provider_id")  # pass through from nvidia datasetio
         elif source.type == DatasetType.rows.value:
             provider_id = "localfs"
         elif source.type == DatasetType.uri.value:
diff --git a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
index 9eb17be71..8861436cb 100644
--- a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
+++ b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
@@ -44,8 +44,7 @@ class NvidiaDatasetIOAdapter:
             request_headers.update(headers)
 
         async with aiohttp.ClientSession(headers=request_headers) as session:
-            # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
-            async with session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response:
+            async with session.request(method, url, params=params, json=json, **kwargs) as response:
                 if response.status != 200:
                     error_data = await response.json()
                     raise Exception(f"API request failed: {error_data}")
diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py
index 47a0c04fa..4eccfb25c 100644
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
+from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
 from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
@@ -38,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIASafetyConfig.sample_run_config(),
     )
+    datasetio_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NvidiaDatasetIOConfig.sample_run_config(),
+    )
     eval_provider = Provider(
         provider_id="nvidia",
         provider_type="remote::nvidia",
@@ -75,6 +81,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "datasetio": [datasetio_provider],
                     "eval": [eval_provider],
                 },
                 default_models=default_models,
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index bc527d4cb..210f17b3e 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -62,13 +62,6 @@ providers:
       project_id: ${env.NVIDIA_PROJECT_ID:test-project}
       customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:http://nemo.test}
   datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db
   - provider_id: nvidia
     provider_type: remote::nvidia
    config:
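With the routing-table fix above, clients select the NVIDIA datasetio provider by setting `metadata["provider_id"]` at registration time. A minimal sketch of that flow, assuming a running `nvidia` distribution (the base URL and port here are illustrative, not part of this patch):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # illustrative endpoint

# The router now reads metadata["provider_id"] (it previously looked for
# metadata["provider"]), so this dataset is routed to the remote::nvidia
# datasetio adapter rather than the source-type fallback (e.g. rows -> localfs).
dataset = client.datasets.register(
    dataset_id="my-dataset",
    purpose="eval/messages-answer",
    source={"type": "uri", "uri": "hf://datasets/llamastack/simpleqa?split=train"},
    metadata={"provider_id": "nvidia", "format": "json"},
)
assert dataset.provider_id == "nvidia"
```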
diff --git a/tests/integration/providers/nvidia/__init__.py b/tests/integration/providers/nvidia/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/integration/providers/nvidia/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/integration/providers/nvidia/conftest.py b/tests/integration/providers/nvidia/conftest.py
new file mode 100644
index 000000000..8beb113b0
--- /dev/null
+++ b/tests/integration/providers/nvidia/conftest.py
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+
+import pytest
+
+# Skip all tests in this directory when running in GitHub Actions
+in_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
+if in_github_actions:
+    pytest.skip("Skipping NVIDIA tests in GitHub Actions environment", allow_module_level=True)
diff --git a/tests/integration/providers/nvidia/test_datastore.py b/tests/integration/providers/nvidia/test_datastore.py
new file mode 100644
index 000000000..5f96dee9f
--- /dev/null
+++ b/tests/integration/providers/nvidia/test_datastore.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+
+# How to run this test:
+#
+# LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py
+
+
+# nvidia provider only
+@pytest.mark.parametrize(
+    "provider_id",
+    [
+        "nvidia",
+    ],
+)
+def test_register_and_unregister(llama_stack_client, provider_id):
+    purpose = "eval/messages-answer"
+    source = {
+        "type": "uri",
+        "uri": "hf://datasets/llamastack/simpleqa?split=train",
+    }
+    dataset_id = f"test-dataset-{provider_id}"
+    dataset = llama_stack_client.datasets.register(
+        dataset_id=dataset_id,
+        purpose=purpose,
+        source=source,
+        metadata={"provider_id": provider_id, "format": "json", "description": "Test dataset description"},
+    )
+    assert dataset.identifier is not None
+    assert dataset.provider_id == provider_id
+    assert dataset.identifier == dataset_id
+
+    dataset_list = llama_stack_client.datasets.list()
+    provider_datasets = [d for d in dataset_list if d.provider_id == provider_id]
+    assert any(provider_datasets)
+    assert any(d.identifier == dataset_id for d in provider_datasets)
+
+    llama_stack_client.datasets.unregister(dataset.identifier)
+    dataset_list = llama_stack_client.datasets.list()
+    provider_datasets = [d for d in dataset_list if d.identifier == dataset.identifier]
+    assert not any(provider_datasets)
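The unit tests in the next file pull in a `run_async` fixture through `inject_fixtures`; that fixture is defined in the unit tests' conftest and is not part of this diff. For readers trying these tests in isolation, a hypothetical stand-in could look like this (names and placement are assumptions, not code from this patch):

```python
# Hypothetical sketch of the `run_async` fixture the unit tests below assume.
# The real fixture ships with the unit-test conftest, outside this diff.
import asyncio

import pytest


@pytest.fixture
def run_async():
    def _run(coro):
        # Drive an async adapter coroutine to completion from sync unittest code.
        return asyncio.run(coro)

    return _run
```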
diff --git a/tests/unit/providers/nvidia/test_datastore.py b/tests/unit/providers/nvidia/test_datastore.py
new file mode 100644
index 000000000..a17e51a9c
--- /dev/null
+++ b/tests/unit/providers/nvidia/test_datastore.py
@@ -0,0 +1,138 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import unittest
+from unittest.mock import patch
+
+import pytest
+
+from llama_stack.apis.datasets import Dataset, DatasetPurpose, URIDataSource
+from llama_stack.providers.remote.datasetio.nvidia.config import NvidiaDatasetIOConfig
+from llama_stack.providers.remote.datasetio.nvidia.datasetio import NvidiaDatasetIOAdapter
+
+
+class TestNvidiaDatastore(unittest.TestCase):
+    def setUp(self):
+        os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test/datasets"
+
+        config = NvidiaDatasetIOConfig(
+            datasets_url=os.environ["NVIDIA_DATASETS_URL"], dataset_namespace="default", project_id="default"
+        )
+        self.adapter = NvidiaDatasetIOAdapter(config)
+        self.make_request_patcher = patch(
+            "llama_stack.providers.remote.datasetio.nvidia.datasetio.NvidiaDatasetIOAdapter._make_request"
+        )
+        self.mock_make_request = self.make_request_patcher.start()
+
+    def tearDown(self):
+        self.make_request_patcher.stop()
+
+    @pytest.fixture(autouse=True)
+    def inject_fixtures(self, run_async):
+        self.run_async = run_async
+
+    def _assert_request(self, mock_call, expected_method, expected_path, expected_json=None):
+        """Helper method to verify request details in mock calls."""
+        call_args = mock_call.call_args
+
+        assert call_args[0][0] == expected_method
+        assert call_args[0][1] == expected_path
+
+        if expected_json:
+            for key, value in expected_json.items():
+                assert call_args[1]["json"][key] == value
+
+    def test_register_dataset(self):
+        self.mock_make_request.return_value = {
+            "id": "dataset-123456",
+            "name": "test-dataset",
+            "namespace": "default",
+        }
+
+        dataset_def = Dataset(
+            identifier="test-dataset",
+            type="dataset",
+            provider_resource_id="",
+            provider_id="",
+            purpose=DatasetPurpose.post_training_messages,
+            source=URIDataSource(uri="https://example.com/data.jsonl"),
+            metadata={"provider_id": "nvidia", "format": "jsonl", "description": "Test dataset description"},
+        )
+
+        self.run_async(self.adapter.register_dataset(dataset_def))
+
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request,
+            "POST",
+            "/v1/datasets",
+            expected_json={
+                "name": "test-dataset",
+                "namespace": "default",
+                "files_url": "https://example.com/data.jsonl",
+                "project": "default",
+                "format": "jsonl",
+                "description": "Test dataset description",
+            },
+        )
+
+    def test_unregister_dataset(self):
+        self.mock_make_request.return_value = {
+            "message": "Resource deleted successfully.",
+            "id": "dataset-81RSQp7FKX3rdBtKvF9Skn",
+            "deleted_at": None,
+        }
+        dataset_id = "test-dataset"
+
+        self.run_async(self.adapter.unregister_dataset(dataset_id))
+
+        self.mock_make_request.assert_called_once()
+        self._assert_request(self.mock_make_request, "DELETE", "/v1/datasets/default/test-dataset")
+
+    def test_register_dataset_with_custom_namespace_project(self):
+        custom_config = NvidiaDatasetIOConfig(
+            datasets_url=os.environ["NVIDIA_DATASETS_URL"],
+            dataset_namespace="custom-namespace",
+            project_id="custom-project",
+        )
+        custom_adapter = NvidiaDatasetIOAdapter(custom_config)
+
+        self.mock_make_request.return_value = {
+            "id": "dataset-123456",
+            "name": "test-dataset",
+            "namespace": "custom-namespace",
+        }
+
+        dataset_def = Dataset(
+            identifier="test-dataset",
+            type="dataset",
+            provider_resource_id="",
+            provider_id="",
+            purpose=DatasetPurpose.post_training_messages,
+            source=URIDataSource(uri="https://example.com/data.jsonl"),
+            metadata={"format": "jsonl"},
+        )
+
+        self.run_async(custom_adapter.register_dataset(dataset_def))
+
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request,
+            "POST",
+            "/v1/datasets",
+            expected_json={
+                "name": "test-dataset",
+                "namespace": "custom-namespace",
+                "files_url": "https://example.com/data.jsonl",
+                "project": "custom-project",
+                "format": "jsonl",
+            },
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
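A note on the `_assert_request` helper above: it relies on `unittest.mock`'s `call_args`, where index `[0]` is the positional-args tuple and `[1]` is the kwargs dict. A self-contained, standard-library-only illustration of that pattern:

```python
from unittest.mock import MagicMock

mock = MagicMock()
mock("POST", "/v1/datasets", json={"name": "test-dataset"})

call_args = mock.call_args
assert call_args[0][0] == "POST"          # positional args: HTTP method
assert call_args[0][1] == "/v1/datasets"  # positional args: request path
assert call_args[1]["json"]["name"] == "test-dataset"  # kwargs: JSON body
```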