chore(package): migrate to src/ layout (#3920)

Migrates package structure to src/ layout following Python packaging best practices. All code moved from `llama_stack/` to `src/llama_stack/`. Public API unchanged - imports remain `import llama_stack.*`. Updated build configs, pre-commit hooks, scripts, and GitHub workflows accordingly. All hooks pass, package builds cleanly. **Developer note**: Reinstall after pulling: `pip install -e .`
2025-12-05 18:27:22 +00:00 · 2025-10-27 12:02:21 -07:00 · 2025-10-27 12:02:21 -07:00 · 471b1b248b
commit 471b1b248b
parent 98a5047f9d
791 changed files with 2983 additions and 456 deletions
--- a/src/llama_stack/providers/remote/datasetio/init.py
+++ b/src/llama_stack/providers/remote/datasetio/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/src/llama_stack/providers/remote/datasetio/huggingface/init.py
+++ b/src/llama_stack/providers/remote/datasetio/huggingface/init.py
@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import HuggingfaceDatasetIOConfig
+
+
+async def get_adapter_impl(
+    config: HuggingfaceDatasetIOConfig,
+    _deps,
+):
+    from .huggingface import HuggingfaceDatasetIOImpl
+
+    impl = HuggingfaceDatasetIOImpl(config)
+    await impl.initialize()
+    return impl
--- a/src/llama_stack/providers/remote/datasetio/huggingface/config.py
+++ b/src/llama_stack/providers/remote/datasetio/huggingface/config.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any
+
+from pydantic import BaseModel
+
+from llama_stack.core.storage.datatypes import KVStoreReference
+
+
+class HuggingfaceDatasetIOConfig(BaseModel):
+    kvstore: KVStoreReference
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
+        return {
+            "kvstore": KVStoreReference(
+                backend="kv_default",
+                namespace="datasetio::huggingface",
+            ).model_dump(exclude_none=True)
+        }
--- a/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any
+from urllib.parse import parse_qs, urlparse
+
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Dataset
+from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
+
+from .config import HuggingfaceDatasetIOConfig
+
+DATASETS_PREFIX = "datasets:"
+
+
+def parse_hf_params(dataset_def: Dataset):
+    uri = dataset_def.source.uri
+    parsed_uri = urlparse(uri)
+    params = parse_qs(parsed_uri.query)
+    params = {k: v[0] for k, v in params.items()}
+    path = parsed_uri.path.lstrip("/")
+
+    return path, params
+
+
+class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
+    def __init__(self, config: HuggingfaceDatasetIOConfig) -> None:
+        self.config = config
+        # local registry for keeping track of datasets within the provider
+        self.dataset_infos = {}
+        self.kvstore = None
+
+    async def initialize(self) -> None:
+        self.kvstore = await kvstore_impl(self.config.kvstore)
+        # Load existing datasets from kvstore
+        start_key = DATASETS_PREFIX
+        end_key = f"{DATASETS_PREFIX}\xff"
+        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
+
+        for dataset in stored_datasets:
+            dataset = Dataset.model_validate_json(dataset)
+            self.dataset_infos[dataset.identifier] = dataset
+
+    async def shutdown(self) -> None: ...
+
+    async def register_dataset(
+        self,
+        dataset_def: Dataset,
+    ) -> None:
+        # Store in kvstore
+        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
+        await self.kvstore.set(
+            key=key,
+            value=dataset_def.model_dump_json(),
+        )
+        self.dataset_infos[dataset_def.identifier] = dataset_def
+
+    async def unregister_dataset(self, dataset_id: str) -> None:
+        key = f"{DATASETS_PREFIX}{dataset_id}"
+        await self.kvstore.delete(key=key)
+        del self.dataset_infos[dataset_id]
+
+    async def iterrows(
+        self,
+        dataset_id: str,
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
+        import datasets as hf_datasets
+
+        dataset_def = self.dataset_infos[dataset_id]
+        path, params = parse_hf_params(dataset_def)
+        loaded_dataset = hf_datasets.load_dataset(path, **params)
+
+        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
+        return paginate_records(records, start_index, limit)
+
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        import datasets as hf_datasets
+
+        dataset_def = self.dataset_infos[dataset_id]
+        path, params = parse_hf_params(dataset_def)
+        loaded_dataset = hf_datasets.load_dataset(path, **params)
+
+        # Convert rows to HF Dataset format
+        new_dataset = hf_datasets.Dataset.from_list(rows)
+
+        # Concatenate the new rows with existing dataset
+        updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
+
+        if dataset_def.metadata.get("path", None):
+            updated_dataset.push_to_hub(dataset_def.metadata["path"])
+        else:
+            raise NotImplementedError("Uploading to URL-based datasets is not supported yet")
--- a/src/llama_stack/providers/remote/datasetio/nvidia/README.md
+++ b/src/llama_stack/providers/remote/datasetio/nvidia/README.md
@ -0,0 +1,73 @@
+# NVIDIA DatasetIO Provider for LlamaStack
+
+This provider enables dataset management using NVIDIA's NeMo Customizer service.
+
+## Features
+
+- Register datasets for fine-tuning LLMs
+- Unregister datasets
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to Hosted NVIDIA NeMo Microservice
+- API key for authentication with the NVIDIA service
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+uv run llama stack list-deps nvidia | xargs -L1 uv pip install
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = "your-api-key"
+os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
+os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
+os.environ["NVIDIA_PROJECT_ID"] = "test-project"
+from llama_stack.core.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+#### Register a dataset
+
+```python
+client.datasets.register(
+    purpose="post-training/messages",
+    dataset_id="my-training-dataset",
+    source={"type": "uri", "uri": "hf://datasets/default/sample-dataset"},
+    metadata={
+        "format": "json",
+        "description": "Dataset for LLM fine-tuning",
+        "provider": "nvidia",
+    },
+)
+```
+
+#### Get a list of all registered datasets
+
+```python
+datasets = client.datasets.list()
+for dataset in datasets:
+    print(f"Dataset ID: {dataset.identifier}")
+    print(f"Description: {dataset.metadata.get('description', '')}")
+    print(f"Source: {dataset.source.uri}")
+    print("---")
+```
+
+#### Unregister a dataset
+
+```python
+client.datasets.unregister(dataset_id="my-training-dataset")
+```
--- a/src/llama_stack/providers/remote/datasetio/nvidia/init.py
+++ b/src/llama_stack/providers/remote/datasetio/nvidia/init.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import NvidiaDatasetIOConfig
+
+
+async def get_adapter_impl(
+    config: NvidiaDatasetIOConfig,
+    _deps,
+):
+    from .datasetio import NvidiaDatasetIOAdapter
+
+    if not isinstance(config, NvidiaDatasetIOConfig):
+        raise RuntimeError(f"Unexpected config type: {type(config)}")
+
+    impl = NvidiaDatasetIOAdapter(config)
+    return impl
+
+
+__all__ = ["get_adapter_impl", "NvidiaDatasetIOAdapter"]
--- a/src/llama_stack/providers/remote/datasetio/nvidia/config.py
+++ b/src/llama_stack/providers/remote/datasetio/nvidia/config.py
@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import warnings
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class NvidiaDatasetIOConfig(BaseModel):
+    """Configuration for NVIDIA DatasetIO implementation."""
+
+    api_key: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
+        description="The NVIDIA API key.",
+    )
+
+    dataset_namespace: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
+        description="The NVIDIA dataset namespace.",
+    )
+
+    project_id: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
+        description="The NVIDIA project ID.",
+    )
+
+    datasets_url: str = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"),
+        description="Base URL for the NeMo Dataset API",
+    )
+
+    # warning for default values
+    def __post_init__(self):
+        default_values = []
+        if os.getenv("NVIDIA_PROJECT_ID") is None:
+            default_values.append("project_id='test-project'")
+        if os.getenv("NVIDIA_DATASET_NAMESPACE") is None:
+            default_values.append("dataset_namespace='default'")
+        if os.getenv("NVIDIA_DATASETS_URL") is None:
+            default_values.append("datasets_url='http://nemo.test'")
+
+        if default_values:
+            warnings.warn(
+                f"Using default values: {', '.join(default_values)}. \
+                          Please set the environment variables to avoid this default behavior.",
+                stacklevel=2,
+            )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+        return {
+            "api_key": "${env.NVIDIA_API_KEY:=}",
+            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:=default}",
+            "project_id": "${env.NVIDIA_PROJECT_ID:=test-project}",
+            "datasets_url": "${env.NVIDIA_DATASETS_URL:=http://nemo.test}",
+        }
--- a/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
+++ b/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
@ -0,0 +1,116 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+import aiohttp
+
+from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.common.type_system import ParamType
+from llama_stack.apis.datasets import Dataset
+
+from .config import NvidiaDatasetIOConfig
+
+
+class NvidiaDatasetIOAdapter:
+    """Nvidia NeMo DatasetIO API."""
+
+    def __init__(self, config: NvidiaDatasetIOConfig):
+        self.config = config
+        self.headers = {}
+
+    async def _make_request(
+        self,
+        method: str,
+        path: str,
+        headers: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Helper method to make HTTP requests to the Customizer API."""
+        url = f"{self.config.datasets_url}{path}"
+        request_headers = self.headers.copy()
+
+        # Set default Content-Type for JSON requests
+        if json is not None:
+            request_headers["Content-Type"] = "application/json"
+
+        if headers:
+            request_headers.update(headers)
+
+        async with aiohttp.ClientSession(headers=request_headers) as session:
+            async with session.request(method, url, params=params, json=json, **kwargs) as response:
+                if response.status != 200:
+                    error_data = await response.json()
+                    raise Exception(f"API request failed: {error_data}")
+                return await response.json()
+
+    async def register_dataset(
+        self,
+        dataset_def: Dataset,
+    ) -> Dataset:
+        """Register a new dataset.
+
+        Args:
+            dataset_def [Dataset]: The dataset definition.
+                dataset_id [str]: The ID of the dataset.
+                source [DataSource]: The source of the dataset.
+                metadata [Dict[str, Any]]: The metadata of the dataset.
+                    format [str]: The format of the dataset.
+                    description [str]: The description of the dataset.
+        Returns:
+            Dataset
+        """
+        # add warnings for unsupported params
+        request_body = {
+            "name": dataset_def.identifier,
+            "namespace": self.config.dataset_namespace,
+            "files_url": dataset_def.source.uri,
+            "project": self.config.project_id,
+        }
+        if dataset_def.metadata:
+            request_body["format"] = dataset_def.metadata.get("format")
+            request_body["description"] = dataset_def.metadata.get("description")
+        await self._make_request(
+            "POST",
+            "/v1/datasets",
+            json=request_body,
+        )
+        return dataset_def
+
+    async def update_dataset(
+        self,
+        dataset_id: str,
+        dataset_schema: dict[str, ParamType],
+        url: URL,
+        provider_dataset_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        raise NotImplementedError("Not implemented")
+
+    async def unregister_dataset(
+        self,
+        dataset_id: str,
+    ) -> None:
+        await self._make_request(
+            "DELETE",
+            f"/v1/datasets/{self.config.dataset_namespace}/{dataset_id}",
+            headers={"Accept": "application/json", "Content-Type": "application/json"},
+        )
+
+    async def iterrows(
+        self,
+        dataset_id: str,
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
+        raise NotImplementedError("Not implemented")
+
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        raise NotImplementedError("Not implemented")