Merge branch 'main' into nvidia-e2e-notebook

This commit is contained in:
Jash Gulabrai 2025-04-28 12:52:46 -04:00
commit c3d8940c95
8 changed files with 214 additions and 11 deletions

View file

@ -439,8 +439,8 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
# infer provider from source
if metadata:
if metadata.get("provider"):
provider_id = metadata.get("provider") # pass through from nvidia datasetio
if metadata.get("provider_id"):
provider_id = metadata.get("provider_id") # pass through from nvidia datasetio
elif source.type == DatasetType.rows.value:
provider_id = "localfs"
elif source.type == DatasetType.uri.value:

View file

@ -44,8 +44,7 @@ class NvidiaDatasetIOAdapter:
request_headers.update(headers)
async with aiohttp.ClientSession(headers=request_headers) as session:
# TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
async with session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response:
async with session.request(method, url, params=params, json=json, **kwargs) as response:
if response.status != 200:
error_data = await response.json()
raise Exception(f"API request failed: {error_data}")

View file

@ -7,6 +7,7 @@
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
@ -38,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
provider_type="remote::nvidia",
config=NVIDIASafetyConfig.sample_run_config(),
)
datasetio_provider = Provider(
provider_id="nvidia",
provider_type="remote::nvidia",
config=NvidiaDatasetIOConfig.sample_run_config(),
)
eval_provider = Provider(
provider_id="nvidia",
provider_type="remote::nvidia",
@ -75,6 +81,7 @@ def get_distribution_template() -> DistributionTemplate:
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
"datasetio": [datasetio_provider],
"eval": [eval_provider],
},
default_models=default_models,

View file

@ -62,13 +62,6 @@ providers:
project_id: ${env.NVIDIA_PROJECT_ID:test-project}
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:http://nemo.test}
datasetio:
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db
- provider_id: nvidia
provider_type: remote::nvidia
config:

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import pytest
# Skip all tests in this directory when running in GitHub Actions
in_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
if in_github_actions:
pytest.skip("Skipping NVIDIA tests in GitHub Actions environment", allow_module_level=True)

View file

@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
# How to run this test:
#
# LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py
# nvidia provider only
@pytest.mark.parametrize(
"provider_id",
[
"nvidia",
],
)
def test_register_and_unregister(llama_stack_client, provider_id):
purpose = "eval/messages-answer"
source = {
"type": "uri",
"uri": "hf://datasets/llamastack/simpleqa?split=train",
}
dataset_id = f"test-dataset-{provider_id}"
dataset = llama_stack_client.datasets.register(
dataset_id=dataset_id,
purpose=purpose,
source=source,
metadata={"provider_id": provider_id, "format": "json", "description": "Test dataset description"},
)
assert dataset.identifier is not None
assert dataset.provider_id == provider_id
assert dataset.identifier == dataset_id
dataset_list = llama_stack_client.datasets.list()
provider_datasets = [d for d in dataset_list if d.provider_id == provider_id]
assert any(provider_datasets)
assert any(d.identifier == dataset_id for d in provider_datasets)
llama_stack_client.datasets.unregister(dataset.identifier)
dataset_list = llama_stack_client.datasets.list()
provider_datasets = [d for d in dataset_list if d.identifier == dataset.identifier]
assert not any(provider_datasets)

View file

@ -0,0 +1,138 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import unittest
from unittest.mock import patch
import pytest
from llama_stack.apis.datasets import Dataset, DatasetPurpose, URIDataSource
from llama_stack.providers.remote.datasetio.nvidia.config import NvidiaDatasetIOConfig
from llama_stack.providers.remote.datasetio.nvidia.datasetio import NvidiaDatasetIOAdapter
class TestNvidiaDatastore(unittest.TestCase):
def setUp(self):
os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test/datasets"
config = NvidiaDatasetIOConfig(
datasets_url=os.environ["NVIDIA_DATASETS_URL"], dataset_namespace="default", project_id="default"
)
self.adapter = NvidiaDatasetIOAdapter(config)
self.make_request_patcher = patch(
"llama_stack.providers.remote.datasetio.nvidia.datasetio.NvidiaDatasetIOAdapter._make_request"
)
self.mock_make_request = self.make_request_patcher.start()
def tearDown(self):
self.make_request_patcher.stop()
@pytest.fixture(autouse=True)
def inject_fixtures(self, run_async):
self.run_async = run_async
def _assert_request(self, mock_call, expected_method, expected_path, expected_json=None):
"""Helper method to verify request details in mock calls."""
call_args = mock_call.call_args
assert call_args[0][0] == expected_method
assert call_args[0][1] == expected_path
if expected_json:
for key, value in expected_json.items():
assert call_args[1]["json"][key] == value
def test_register_dataset(self):
self.mock_make_request.return_value = {
"id": "dataset-123456",
"name": "test-dataset",
"namespace": "default",
}
dataset_def = Dataset(
identifier="test-dataset",
type="dataset",
provider_resource_id="",
provider_id="",
purpose=DatasetPurpose.post_training_messages,
source=URIDataSource(uri="https://example.com/data.jsonl"),
metadata={"provider_id": "nvidia", "format": "jsonl", "description": "Test dataset description"},
)
self.run_async(self.adapter.register_dataset(dataset_def))
self.mock_make_request.assert_called_once()
self._assert_request(
self.mock_make_request,
"POST",
"/v1/datasets",
expected_json={
"name": "test-dataset",
"namespace": "default",
"files_url": "https://example.com/data.jsonl",
"project": "default",
"format": "jsonl",
"description": "Test dataset description",
},
)
def test_unregister_dataset(self):
self.mock_make_request.return_value = {
"message": "Resource deleted successfully.",
"id": "dataset-81RSQp7FKX3rdBtKvF9Skn",
"deleted_at": None,
}
dataset_id = "test-dataset"
self.run_async(self.adapter.unregister_dataset(dataset_id))
self.mock_make_request.assert_called_once()
self._assert_request(self.mock_make_request, "DELETE", "/v1/datasets/default/test-dataset")
def test_register_dataset_with_custom_namespace_project(self):
custom_config = NvidiaDatasetIOConfig(
datasets_url=os.environ["NVIDIA_DATASETS_URL"],
dataset_namespace="custom-namespace",
project_id="custom-project",
)
custom_adapter = NvidiaDatasetIOAdapter(custom_config)
self.mock_make_request.return_value = {
"id": "dataset-123456",
"name": "test-dataset",
"namespace": "custom-namespace",
}
dataset_def = Dataset(
identifier="test-dataset",
type="dataset",
provider_resource_id="",
provider_id="",
purpose=DatasetPurpose.post_training_messages,
source=URIDataSource(uri="https://example.com/data.jsonl"),
metadata={"format": "jsonl"},
)
self.run_async(custom_adapter.register_dataset(dataset_def))
self.mock_make_request.assert_called_once()
self._assert_request(
self.mock_make_request,
"POST",
"/v1/datasets",
expected_json={
"name": "test-dataset",
"namespace": "custom-namespace",
"files_url": "https://example.com/data.jsonl",
"project": "custom-project",
"format": "jsonl",
},
)
if __name__ == "__main__":
unittest.main()