From 1e77873a027fac016faf8cfe804c774d12c35d50 Mon Sep 17 00:00:00 2001 From: raspawar Date: Wed, 26 Mar 2025 15:21:38 +0530 Subject: [PATCH] add datasetio to distribution --- .../remote/datasetio/nvidia/config.py | 10 ------ .../remote/datasetio/nvidia/datasetio.py | 31 ++++++++----------- llama_stack/templates/dependencies.json | 2 ++ llama_stack/templates/nvidia/build.yaml | 1 + llama_stack/templates/nvidia/nvidia.py | 9 +++++- .../templates/nvidia/run-with-safety.yaml | 7 +++++ llama_stack/templates/nvidia/run.yaml | 12 +++---- 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/llama_stack/providers/remote/datasetio/nvidia/config.py b/llama_stack/providers/remote/datasetio/nvidia/config.py index 46aa68e5f..f80c6bb20 100644 --- a/llama_stack/providers/remote/datasetio/nvidia/config.py +++ b/llama_stack/providers/remote/datasetio/nvidia/config.py @@ -24,11 +24,6 @@ class NvidiaDatasetIOConfig(BaseModel): description="The NVIDIA dataset namespace.", ) - access_policies: Optional[dict] = Field( - default_factory=lambda: os.getenv("NVIDIA_ACCESS_POLICIES", {}), - description="The NVIDIA access policies.", - ) - project_id: Optional[str] = Field( default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"), description="The NVIDIA project ID.", @@ -46,8 +41,6 @@ class NvidiaDatasetIOConfig(BaseModel): default_values.append("project_id='test-project'") if os.getenv("NVIDIA_DATASET_NAMESPACE") is None: default_values.append("dataset_namespace='default'") - if os.getenv("NVIDIA_ACCESS_POLICIES") is None: - default_values.append("access_policies='{}'") if os.getenv("NVIDIA_DATASETS_URL") is None: default_values.append("datasets_url='http://nemo.test'") @@ -64,8 +57,5 @@ class NvidiaDatasetIOConfig(BaseModel): "api_key": "${env.NVIDIA_API_KEY:}", "user_id": "${env.NVIDIA_USER_ID:llama-stack-user}", "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}", - "access_policies": "${env.NVIDIA_ACCESS_POLICIES:}", "project_id": "${env.NVIDIA_PROJECT_ID:test-project}", - "customizer_url": "${env.NVIDIA_CUSTOMIZER_URL:}", - "output_model_dir": "${env.NVIDIA_OUTPUT_MODEL_DIR:test-example-model@v1}", } diff --git a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py index 9a5c8e46b..95bd155a8 100644 --- a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py +++ b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py @@ -4,14 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any, Dict, Literal, Optional +from typing import Any, Dict, List, Optional import aiohttp from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType -from llama_stack.apis.datasets.datasets import Dataset, ListDatasetsResponse -from llama_stack.apis.resource import ResourceType +from llama_stack.apis.datasetio import IterrowsResponse from llama_stack.schema_utils import webmethod from .config import NvidiaDatasetIOConfig @@ -20,8 +19,6 @@ from .config import NvidiaDatasetIOConfig class NvidiaDatasetIOAdapter: """Nvidia NeMo DatasetIO API.""" - type: Literal[ResourceType.dataset.value] = ResourceType.dataset.value - def __init__(self, config: NvidiaDatasetIOConfig): self.config = config self.headers = {} @@ -80,19 +77,6 @@ class NvidiaDatasetIOAdapter: """ ... - @webmethod(route="/datasets/{dataset_id:namespace}", method="GET") - async def get_dataset( - self, - dataset_id: str, - ) -> Optional[Dataset]: - raise NotImplementedError("Not implemented") - - @webmethod(route="/datasets", method="GET") - async def list_datasets( - self, - ) -> ListDatasetsResponse: - raise NotImplementedError("Not implemented") - @webmethod(route="/datasets/{dataset_id:path}", method="POST") async def update_dataset( self, @@ -112,3 +96,14 @@ class NvidiaDatasetIOAdapter: namespace: Optional[str] = "default", ) -> None: raise NotImplementedError("Not implemented") + + async def iterrows( + self, + dataset_id: str, + start_index: Optional[int] = None, + limit: Optional[int] = None, + ) -> IterrowsResponse: + raise NotImplementedError("Not implemented") + + async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: + raise NotImplementedError("Not implemented") diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json index 4c16411f0..5f2934ebe 100644 --- a/llama_stack/templates/dependencies.json +++ b/llama_stack/templates/dependencies.json @@ -394,6 +394,8 @@ "aiosqlite", "blobfile", "chardet", + "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index a33fa3737..a05cf97ad 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -18,6 +18,7 @@ distribution_spec: - remote::nvidia datasetio: - inline::localfs + - remote::nvidia scoring: - inline::basic tool_runtime: diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index 463c13879..ae0ab32b9 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput +from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES @@ -23,7 +24,7 @@ def get_distribution_template() -> DistributionTemplate: "telemetry": ["inline::meta-reference"], "eval": ["remote::nvidia"], "post_training": ["remote::nvidia"], - "datasetio": ["inline::localfs"], + "datasetio": ["inline::localfs", "remote::nvidia"], "scoring": ["inline::basic"], "tool_runtime": ["inline::rag-runtime"], } @@ -51,6 +52,11 @@ def get_distribution_template() -> DistributionTemplate: model_id="${env.SAFETY_MODEL}", provider_id="nvidia", ) + datasetio_provider = Provider( + provider_id="nvidia", + provider_type="remote::nvidia", + config=NvidiaDatasetIOConfig.sample_run_config(), + ) available_models = { "nvidia": MODEL_ENTRIES, @@ -75,6 +81,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "datasetio": [datasetio_provider], "eval": [eval_provider], }, default_models=default_models, diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml index a3e5fefa4..31a454fa1 100644 --- a/llama_stack/templates/nvidia/run-with-safety.yaml +++ b/llama_stack/templates/nvidia/run-with-safety.yaml @@ -74,6 +74,13 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db + - provider_id: nvidia + provider_type: remote::nvidia + config: + api_key: ${env.NVIDIA_API_KEY:} + user_id: ${env.NVIDIA_USER_ID:llama-stack-user} + dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default} + project_id: ${env.NVIDIA_PROJECT_ID:test-project} scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 271ce1a16..3b421c0b4 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -62,13 +62,13 @@ providers: project_id: ${env.NVIDIA_PROJECT_ID:test-project} customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:http://nemo.test} datasetio: - - provider_id: localfs - provider_type: inline::localfs + - provider_id: nvidia + provider_type: remote::nvidia config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db + api_key: ${env.NVIDIA_API_KEY:} + user_id: ${env.NVIDIA_USER_ID:llama-stack-user} + dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default} + project_id: ${env.NVIDIA_PROJECT_ID:test-project} scoring: - provider_id: basic provider_type: inline::basic