diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index ddb727663..d6af2a956 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -35,10 +35,10 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: routing_table_api=Api.vector_dbs, router_api=Api.vector_io, ), - AutoRoutedApiInfo( - routing_table_api=Api.datasets, - router_api=Api.datasetio, - ), + # AutoRoutedApiInfo( + # routing_table_api=Api.datasets, + # router_api=Api.datasetio, + # ), AutoRoutedApiInfo( routing_table_api=Api.scoring_functions, router_api=Api.scoring, diff --git a/llama_stack/providers/registry/datasets.py b/llama_stack/providers/registry/datasets.py new file mode 100644 index 000000000..01d448627 --- /dev/null +++ b/llama_stack/providers/registry/datasets.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List + +from llama_stack.providers.datatypes import ( + AdapterSpec, + Api, + ProviderSpec, + remote_provider_spec, +) + + +def available_providers() -> List[ProviderSpec]: + return [ + remote_provider_spec( + api=Api.datasets, + adapter=AdapterSpec( + adapter_type="nvidia", + pip_packages=[ + "datasets", + ], + module="llama_stack.providers.remote.datasets.nvidia", + config_class="llama_stack.providers.remote.datasets.nvidia.NvidiaDatasetConfig", + ), + ), + ] diff --git a/llama_stack/providers/remote/datasets/nvidia/README.md b/llama_stack/providers/remote/datasets/nvidia/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/llama_stack/providers/remote/datasets/nvidia/__init__.py b/llama_stack/providers/remote/datasets/nvidia/__init__.py new file mode 100644 index 000000000..649bf6ec6 --- /dev/null +++ b/llama_stack/providers/remote/datasets/nvidia/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import NvidiaDatasetConfig + + +async def get_adapter_impl( + config: NvidiaDatasetConfig, + _deps, +): + from .datasets import NvidiaDatasetAdapter + + if not isinstance(config, NvidiaDatasetConfig): + raise RuntimeError(f"Unexpected config type: {type(config)}") + + impl = NvidiaDatasetAdapter(config) + return impl + + +__all__ = ["get_adapter_impl", "NvidiaDatasetAdapter"] diff --git a/llama_stack/providers/remote/datasets/nvidia/config.py b/llama_stack/providers/remote/datasets/nvidia/config.py new file mode 100644 index 000000000..77ddd6e6c --- /dev/null +++ b/llama_stack/providers/remote/datasets/nvidia/config.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import os +import warnings +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + + +class NvidiaDatasetConfig(BaseModel): + """Configuration for NVIDIA Dataset implementation.""" + + api_key: Optional[str] = Field( + default_factory=lambda: os.getenv("NVIDIA_API_KEY"), + description="The NVIDIA API key.", + ) + + dataset_namespace: Optional[str] = Field( + default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"), + description="The NVIDIA dataset namespace.", + ) + + access_policies: Optional[dict] = Field( + default_factory=lambda: os.getenv("NVIDIA_ACCESS_POLICIES", {}), + description="The NVIDIA access policies.", + ) + + project_id: Optional[str] = Field( + default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"), + description="The NVIDIA project ID.", + ) + + datasets_url: str = Field( + default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"), + description="Base URL for the NeMo Dataset API", + ) + + # warning for default values + def __post_init__(self): + default_values = [] + if os.getenv("NVIDIA_PROJECT_ID") is None: + default_values.append("project_id='test-project'") + if os.getenv("NVIDIA_DATASET_NAMESPACE") is None: + default_values.append("dataset_namespace='default'") + if os.getenv("NVIDIA_ACCESS_POLICIES") is None: + default_values.append("access_policies='{}'") + if os.getenv("NVIDIA_DATASETS_URL") is None: + default_values.append("datasets_url='http://nemo.test'") + + if default_values: + warnings.warn( + f"Using default values: {', '.join(default_values)}. \ + Please set the environment variables to avoid this default behavior.", + stacklevel=2, + ) + + @classmethod + def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + return { + "api_key": "${env.NVIDIA_API_KEY:}", + "user_id": "${env.NVIDIA_USER_ID:llama-stack-user}", + "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}", + "access_policies": "${env.NVIDIA_ACCESS_POLICIES:}", + "project_id": "${env.NVIDIA_PROJECT_ID:test-project}", + "customizer_url": "${env.NVIDIA_CUSTOMIZER_URL:}", + "output_model_dir": "${env.NVIDIA_OUTPUT_MODEL_DIR:test-example-model@v1}", + } diff --git a/llama_stack/providers/remote/datasets/nvidia/datasets.py b/llama_stack/providers/remote/datasets/nvidia/datasets.py new file mode 100644 index 000000000..a82fafb53 --- /dev/null +++ b/llama_stack/providers/remote/datasets/nvidia/datasets.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from datetime import datetime +from typing import Any, Dict, Literal, Optional + +import aiohttp + +from llama_stack.apis.common.content_types import URL +from llama_stack.apis.common.type_system import ParamType +from llama_stack.apis.datasets.datasets import Dataset, ListDatasetsResponse +from llama_stack.apis.resource import ResourceType +from llama_stack.schema_utils import webmethod + +from .config import NvidiaDatasetConfig + + +class NvidiaDatasetAdapter: + """Nvidia NeMo Dataset API.""" + + type: Literal[ResourceType.dataset.value] = ResourceType.dataset.value + + def __init__(self, config: NvidiaDatasetConfig): + self.config = config + self.headers = {} + if config.api_key: + self.headers["Authorization"] = f"Bearer {config.api_key}" + + async def _make_request( + self, + method: str, + path: str, + headers: Optional[Dict[str, Any]] = None, + params: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Dict[str, Any]: + """Helper method to make HTTP requests to the Customizer API.""" + url = f"{self.config.datasets_url}{path}" + request_headers = self.headers.copy() # Create a copy to avoid modifying the original + + if headers: + request_headers.update(headers) + + # Add content-type header for JSON requests + if json and "Content-Type" not in request_headers: + request_headers["Content-Type"] = "application/json" + + async with aiohttp.ClientSession(headers=request_headers) as session: + async with session.request(method, url, params=params, json=json, **kwargs) as response: + if response.status >= 400: + error_data = await response.json() + raise Exception(f"API request failed: {error_data}") + return await response.json() + + @webmethod(route="/datasets", method="POST") + async def register_dataset( + self, + dataset_id: str, + dataset_schema: Dict[str, ParamType], + url: URL, + provider_dataset_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + """Register a new dataset. + + Args: + dataset_id: The ID of the dataset. + dataset_schema: The schema of the dataset. + url: The URL of the dataset. + provider_dataset_id: The ID of the provider dataset. + provider_id: The ID of the provider. + metadata: The metadata of the dataset. + + Returns: + None + """ + ... + + @webmethod(route="/datasets/{dataset_id:namespace}", method="GET") + async def get_dataset( + self, + dataset_id: str, + ) -> Optional[Dataset]: + dataset_id, namespace = dataset_id.split(":") + dataset = await self._make_request( + method="GET", + path=f"/v1/datasets/{namespace}/{dataset_id}", + ) + created_at = datetime.fromisoformat(dataset.pop("created_at")) if "created_at" in dataset else datetime.now() + identifier = dataset.pop("name") + url = URL(uri=dataset.pop("files_url")) + return Dataset( + identifier=identifier, + provider_id="nvidia", # confirm this + url=url, + dataset_schema={}, # ToDo: get schema from the dataset + created_at=created_at, + metadata=dataset, + ) + + @webmethod(route="/datasets", method="GET") + async def list_datasets( + self, + ) -> ListDatasetsResponse: + ## ToDo: add pagination + response = await self._make_request(method="GET", path="/v1/datasets") + datasets = [] + for dataset in response["data"]: + created_at = ( + datetime.fromisoformat(dataset.pop("created_at")) if "created_at" in dataset else datetime.now() + ) + identifier = dataset.pop("name") + url = URL(uri=dataset.pop("files_url")) + datasets.append( + Dataset( + identifier=identifier, + provider_id="nvidia", # confirm this + url=url, + dataset_schema={}, + created_at=created_at, + metadata=dataset, + ) + ) # add remaining fields as metadata + + return ListDatasetsResponse(data=datasets) + + @webmethod(route="/datasets/{dataset_id:path}", method="POST") + async def update_dataset( + self, + dataset_id: str, + dataset_schema: Dict[str, ParamType], + url: URL, + provider_dataset_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... + + @webmethod(route="/datasets/{dataset_id:path}", method="DELETE") + async def unregister_dataset( + self, + dataset_id: str, + namespace: Optional[str] = "default", + ) -> None: ... diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index 24749ffcd..e289f4874 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -16,6 +16,8 @@ distribution_spec: - inline::meta-reference datasetio: - inline::localfs + datasets: + - remote::nvidia scoring: - inline::basic - inline::llm-as-judge diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index a28de0dc2..d3e00c2d5 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput +from llama_stack.providers.remote.datasets.nvidia import NvidiaDatasetConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES from llama_stack.providers.remote.post_training.nvidia import NvidiaPostTrainingConfig @@ -24,6 +25,7 @@ def get_distribution_template() -> DistributionTemplate: "telemetry": ["inline::meta-reference"], "eval": ["inline::meta-reference"], "datasetio": ["inline::localfs"], + "datasets": ["remote::nvidia"], "scoring": ["inline::basic"], "tool_runtime": ["inline::rag-runtime"], } @@ -39,6 +41,12 @@ def get_distribution_template() -> DistributionTemplate: provider_type="remote::nvidia", config=NvidiaPostTrainingConfig.sample_run_config(), ) + + datasets_provider = Provider( + provider_id="nvidia", + provider_type="remote::nvidia", + config=NvidiaDatasetConfig.sample_run_config(), + ) safety_provider = Provider( provider_id="nvidia", provider_type="remote::nvidia", @@ -76,6 +84,8 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "post_training": [post_training_provider], + "datasets": [datasets_provider], }, default_models=default_models, default_tool_groups=default_tool_groups, diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 7559e518d..9f84e2925 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -58,11 +58,11 @@ providers: datasetio: - provider_id: localfs provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db + config: {} + datasets: + - provider_id: nvidia + provider_type: remote::nvidia + config: {} scoring: - provider_id: basic provider_type: inline::basic