add datastore initial code

Ubuntu 2025-03-14 14:45:00 +00:00 committed by raspawar
parent d667a7109f
commit 0a2af0e2f8
9 changed files with 293 additions and 9 deletions

View file: llama_stack/providers/registry/datasets.py

@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List

from llama_stack.providers.datatypes import (
    AdapterSpec,
    Api,
    ProviderSpec,
    remote_provider_spec,
)


def available_providers() -> List[ProviderSpec]:
    return [
        remote_provider_spec(
            api=Api.datasets,
            adapter=AdapterSpec(
                adapter_type="nvidia",
                pip_packages=[
                    "datasets",
                ],
                module="llama_stack.providers.remote.datasets.nvidia",
                config_class="llama_stack.providers.remote.datasets.nvidia.NvidiaDatasetConfig",
            ),
        ),
    ]
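
A quick sanity check of the registry entry (a sketch: the import path llama_stack.providers.registry.datasets and the remote::<adapter_type> naming for provider_type are assumptions based on llama_stack's registry conventions, not shown in this diff):

import asyncio  # not needed here; the registry is plain sync code

from llama_stack.providers.datatypes import Api
from llama_stack.providers.registry.datasets import available_providers  # assumed location of the file above

# remote_provider_spec wires Api.datasets to the remote "nvidia" adapter.
spec = available_providers()[0]
assert spec.api == Api.datasets
print(spec.provider_type)  # expected: remote::nvidia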

View file: llama_stack/providers/remote/datasets/nvidia/__init__.py

@@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import NvidiaDatasetConfig


async def get_adapter_impl(
    config: NvidiaDatasetConfig,
    _deps,
):
    from .datasets import NvidiaDatasetAdapter

    if not isinstance(config, NvidiaDatasetConfig):
        raise RuntimeError(f"Unexpected config type: {type(config)}")

    impl = NvidiaDatasetAdapter(config)
    return impl


__all__ = ["get_adapter_impl", "NvidiaDatasetAdapter"]
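
For illustration, the entry point might be exercised like this (a minimal sketch, assuming the package is importable as llama_stack.providers.remote.datasets.nvidia per the module string in the registry; the empty dict stands in for the unused _deps argument):

import asyncio

from llama_stack.providers.remote.datasets.nvidia import NvidiaDatasetConfig, get_adapter_impl


async def main():
    config = NvidiaDatasetConfig()  # falls back to env-driven defaults, with a warning
    adapter = await get_adapter_impl(config, {})
    print(type(adapter).__name__)  # NvidiaDatasetAdapter


asyncio.run(main())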

View file: llama_stack/providers/remote/datasets/nvidia/config.py

@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
import os
import warnings
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field


class NvidiaDatasetConfig(BaseModel):
    """Configuration for NVIDIA Dataset implementation."""

    api_key: Optional[str] = Field(
        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
        description="The NVIDIA API key.",
    )
    dataset_namespace: Optional[str] = Field(
        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
        description="The NVIDIA dataset namespace.",
    )
    access_policies: Optional[dict] = Field(
        # NVIDIA_ACCESS_POLICIES holds a JSON object; parse it so the field is a dict.
        default_factory=lambda: json.loads(os.getenv("NVIDIA_ACCESS_POLICIES", "{}")),
        description="The NVIDIA access policies.",
    )
    project_id: Optional[str] = Field(
        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
        description="The NVIDIA project ID.",
    )
    datasets_url: str = Field(
        default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"),
        description="Base URL for the NeMo Dataset API",
    )

    # Warn when fields fall back to their defaults. Note: pydantic models never
    # call __post_init__ (that is a dataclasses hook); model_post_init is the
    # pydantic v2 equivalent, invoked after validation.
    def model_post_init(self, __context: Any) -> None:
        default_values = []
        if os.getenv("NVIDIA_PROJECT_ID") is None:
            default_values.append("project_id='test-project'")
        if os.getenv("NVIDIA_DATASET_NAMESPACE") is None:
            default_values.append("dataset_namespace='default'")
        if os.getenv("NVIDIA_ACCESS_POLICIES") is None:
            default_values.append("access_policies='{}'")
        if os.getenv("NVIDIA_DATASETS_URL") is None:
            default_values.append("datasets_url='http://nemo.test'")
        if default_values:
            warnings.warn(
                f"Using default values: {', '.join(default_values)}. "
                "Please set the environment variables to avoid this default behavior.",
                stacklevel=2,
            )

    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
            "api_key": "${env.NVIDIA_API_KEY:}",
            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
            "access_policies": "${env.NVIDIA_ACCESS_POLICIES:}",
            "project_id": "${env.NVIDIA_PROJECT_ID:test-project}",
            "datasets_url": "${env.NVIDIA_DATASETS_URL:http://nemo.test}",
        }
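
A short sketch of the resulting behavior: constructing the config with no NVIDIA_* variables set fires the default-value warning, while explicit keyword arguments skip the env-based default factories (though the warning keys off the environment, so it can still fire):

import warnings

from llama_stack.providers.remote.datasets.nvidia.config import NvidiaDatasetConfig

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    config = NvidiaDatasetConfig()  # no env vars set -> all defaults
print(config.datasets_url)  # http://nemo.test
print(caught[0].message)    # Using default values: project_id='test-project', ...

# Explicit values bypass the env lookups for the fields themselves.
config = NvidiaDatasetConfig(datasets_url="https://nemo.example.com", project_id="my-project")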

View file: llama_stack/providers/remote/datasets/nvidia/datasets.py

@@ -0,0 +1,148 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from datetime import datetime
from typing import Any, Dict, Literal, Optional

import aiohttp

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets.datasets import Dataset, ListDatasetsResponse
from llama_stack.apis.resource import ResourceType
from llama_stack.schema_utils import webmethod

from .config import NvidiaDatasetConfig


class NvidiaDatasetAdapter:
    """Nvidia NeMo Dataset API."""

    type: Literal[ResourceType.dataset.value] = ResourceType.dataset.value

    def __init__(self, config: NvidiaDatasetConfig):
        self.config = config
        self.headers = {}
        if config.api_key:
            self.headers["Authorization"] = f"Bearer {config.api_key}"

    async def _make_request(
        self,
        method: str,
        path: str,
        headers: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Helper method to make HTTP requests to the NeMo Dataset API."""
        url = f"{self.config.datasets_url}{path}"
        request_headers = self.headers.copy()  # Create a copy to avoid modifying the original
        if headers:
            request_headers.update(headers)

        # Add content-type header for JSON requests
        if json and "Content-Type" not in request_headers:
            request_headers["Content-Type"] = "application/json"

        async with aiohttp.ClientSession(headers=request_headers) as session:
            async with session.request(method, url, params=params, json=json, **kwargs) as response:
                if response.status >= 400:
                    error_data = await response.json()
                    raise Exception(f"API request failed: {error_data}")
                return await response.json()

    @webmethod(route="/datasets", method="POST")
    async def register_dataset(
        self,
        dataset_id: str,
        dataset_schema: Dict[str, ParamType],
        url: URL,
        provider_dataset_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a new dataset.

        Args:
            dataset_id: The ID of the dataset.
            dataset_schema: The schema of the dataset.
            url: The URL of the dataset.
            provider_dataset_id: The ID of the provider dataset.
            provider_id: The ID of the provider.
            metadata: The metadata of the dataset.

        Returns:
            None
        """
        ...

    @webmethod(route="/datasets/{dataset_id:path}", method="GET")
    async def get_dataset(
        self,
        dataset_id: str,
    ) -> Optional[Dataset]:
        # dataset_id is expected in "<dataset_id>:<namespace>" form
        dataset_id, namespace = dataset_id.split(":")
        dataset = await self._make_request(
            method="GET",
            path=f"/v1/datasets/{namespace}/{dataset_id}",
        )
        created_at = datetime.fromisoformat(dataset.pop("created_at")) if "created_at" in dataset else datetime.now()
        identifier = dataset.pop("name")
        url = URL(uri=dataset.pop("files_url"))

        return Dataset(
            identifier=identifier,
            provider_id="nvidia",  # confirm this
            url=url,
            dataset_schema={},  # ToDo: get schema from the dataset
            created_at=created_at,
            metadata=dataset,  # remaining fields are preserved as metadata
        )

    @webmethod(route="/datasets", method="GET")
    async def list_datasets(
        self,
    ) -> ListDatasetsResponse:
        # ToDo: add pagination
        response = await self._make_request(method="GET", path="/v1/datasets")
        datasets = []
        for dataset in response["data"]:
            created_at = (
                datetime.fromisoformat(dataset.pop("created_at")) if "created_at" in dataset else datetime.now()
            )
            identifier = dataset.pop("name")
            url = URL(uri=dataset.pop("files_url"))
            datasets.append(
                Dataset(
                    identifier=identifier,
                    provider_id="nvidia",  # confirm this
                    url=url,
                    dataset_schema={},
                    created_at=created_at,
                    metadata=dataset,  # add remaining fields as metadata
                )
            )
        return ListDatasetsResponse(data=datasets)

    @webmethod(route="/datasets/{dataset_id:path}", method="POST")
    async def update_dataset(
        self,
        dataset_id: str,
        dataset_schema: Dict[str, ParamType],
        url: URL,
        provider_dataset_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...

    @webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
    async def unregister_dataset(
        self,
        dataset_id: str,
        namespace: Optional[str] = "default",
    ) -> None: ...
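
Finally, an end-to-end sketch of the adapter against a reachable NeMo Data Store (the endpoint URL, the dataset name, and the "<dataset_id>:<namespace>" id convention are assumptions drawn from the code above, not a confirmed deployment):

import asyncio

from llama_stack.providers.remote.datasets.nvidia.config import NvidiaDatasetConfig
from llama_stack.providers.remote.datasets.nvidia.datasets import NvidiaDatasetAdapter


async def main():
    config = NvidiaDatasetConfig(datasets_url="http://nemo.test")  # hypothetical endpoint
    adapter = NvidiaDatasetAdapter(config)

    # Enumerate every dataset the service reports.
    listing = await adapter.list_datasets()
    for ds in listing.data:
        print(ds.identifier, ds.url.uri)

    # Fetch a single dataset; the id encodes "<dataset_id>:<namespace>".
    dataset = await adapter.get_dataset("my-dataset:default")
    print(dataset.metadata)


asyncio.run(main())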