Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-29 07:14:20 +00:00)

Commit f046899a1c ("datasets api"), parent 18fe966e96
15 changed files with 281 additions and 80 deletions

@@ -4,4 +4,4 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from .dataset import *  # noqa: F401 F403
+from .datasets import *  # noqa: F401 F403

llama_stack/apis/datasets/client.py (new file, 92 lines)
@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
import json

import fire
import httpx

from .datasets import *  # noqa: F403


class DatasetClient(Datasets):
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def create_dataset(
        self,
        dataset_def: DatasetDef,
    ) -> None:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/datasets/create",
                json={
                    "dataset_def": json.loads(dataset_def.json()),
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return None

    async def get_dataset(
        self,
        dataset_identifier: str,
    ) -> DatasetDef:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/datasets/get",
                json={
                    "dataset_identifier": dataset_identifier,
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return DatasetDef(**response.json())

    async def delete_dataset(
        self,
        dataset_identifier: str,
    ) -> None:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/datasets/delete",
                json={
                    "dataset_identifier": dataset_identifier,
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return None


async def run_main(host: str, port: int):
    client = DatasetClient(f"http://{host}:{port}")

    # Create a custom dataset
    response = await client.create_dataset(
        dataset_def=CustomDatasetDef(
            identifier="test-dataset",
            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
        ),
    )


def main(host: str, port: int):
    asyncio.run(run_main(host, port))


if __name__ == "__main__":
    fire.Fire(main)
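
For orientation, a minimal sketch of driving the new client from Python rather than through the fire CLI (fire.Fire(main) also makes the file runnable as "python client.py <host> <port>"). It assumes a Llama Stack server listening on localhost:5000 and uses only names introduced in this commit:

import asyncio

from llama_stack.apis.datasets import CustomDatasetDef
from llama_stack.apis.datasets.client import DatasetClient


async def demo() -> None:
    client = DatasetClient("http://localhost:5000")
    # Register a CSV-backed dataset by identifier and URL.
    await client.create_dataset(
        dataset_def=CustomDatasetDef(
            identifier="test-dataset",
            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
        ),
    )
    # Round-trip: fetch the definition back by its identifier.
    dataset_def = await client.get_dataset("test-dataset")
    print(dataset_def)


asyncio.run(demo())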

@@ -143,19 +143,19 @@ class BaseDataset(ABC, Generic[TDatasetSample]):

 class Datasets(Protocol):
     @webmethod(route="/datasets/create")
-    def create_dataset(
+    async def create_dataset(
         self,
-        dataset: DatasetDef,
+        dataset_def: DatasetDef,
     ) -> None: ...

     @webmethod(route="/datasets/get")
-    def get_dataset(
+    async def get_dataset(
         self,
         dataset_identifier: str,
     ) -> DatasetDef: ...

     @webmethod(route="/datasets/delete")
-    def delete_dataset(
+    async def delete_dataset(
         self,
-        dataset_uuid: str,
+        dataset_identifier: str,
     ) -> None: ...
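
Since the protocol methods are now async and keyed by a plain string identifier, a conforming implementation can be very small. A toy in-memory sketch (not part of this commit; it assumes DatasetDef is importable from llama_stack.apis.datasets and exposes the identifier field used elsewhere in the diff):

from typing import Dict

from llama_stack.apis.datasets import DatasetDef


class InMemoryDatasets:
    """Dictionary-backed stand-in for the Datasets protocol, e.g. for tests."""

    def __init__(self) -> None:
        self._datasets: Dict[str, DatasetDef] = {}

    async def create_dataset(self, dataset_def: DatasetDef) -> None:
        self._datasets[dataset_def.identifier] = dataset_def

    async def get_dataset(self, dataset_identifier: str) -> DatasetDef:
        return self._datasets[dataset_identifier]

    async def delete_dataset(self, dataset_identifier: str) -> None:
        self._datasets.pop(dataset_identifier, None)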

@@ -11,7 +11,7 @@ from llama_models.schema_utils import webmethod
 from pydantic import BaseModel

 from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403


 class EvaluationJob(BaseModel):

@@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec):
     pip_packages: List[str] = Field(default_factory=list)


+# Example: /datasets
+class RegistryProviderSpec(ProviderSpec):
+    provider_type: str = "registry"
+    config_class: str = ""
+    docker_image: Optional[str] = None
+
+    module: str
+    pip_packages: List[str] = Field(default_factory=list)
+
+
 class DistributionSpec(BaseModel):
     description: Optional[str] = Field(
         default="",

@@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel):
     router_api: Api


+class RegistryApiInfo(BaseModel):
+    registry_api: Api
+    # registry: Registry
+
+
+def builtin_registry_apis() -> List[RegistryApiInfo]:
+    return [
+        RegistryApiInfo(
+            registry_api=Api.datasets,
+        )
+    ]
+
+
 def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
     return [
         AutoRoutedApiInfo(

@@ -42,7 +55,12 @@ def providable_apis() -> List[Api]:
     routing_table_apis = set(
         x.routing_table_api for x in builtin_automatically_routed_apis()
     )
-    return [api for api in Api if api not in routing_table_apis and api != Api.inspect]
+    registry_apis = set(
+        x.registry_api for x in builtin_registry_apis() if x.registry_api
+    )
+    non_providable_apis = routing_table_apis | registry_apis | {Api.inspect}
+
+    return [api for api in Api if api not in non_providable_apis]


 def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]:

@@ -3,3 +3,20 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any
+
+from llama_stack.providers.datatypes import Api
+from .datasets.dataset import DatasetRegistryImpl
+
+
+async def get_registry_impl(api: Api, _deps) -> Any:
+    api_to_registry = {
+        "datasets": DatasetRegistryImpl,
+    }
+
+    if api.value not in api_to_registry:
+        raise ValueError(f"API {api.value} not found in registry map")
+
+    impl = api_to_registry[api.value]()
+    await impl.initialize()
+    return impl
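
The resolver changes later in this commit are the real call site, but the hook can be exercised in isolation. A hedged sketch (the second argument is the unused _deps parameter):

import asyncio

from llama_stack.distribution.registry import get_registry_impl
from llama_stack.providers.datatypes import Api

impl = asyncio.run(get_registry_impl(Api.datasets, None))
print(type(impl).__name__)  # DatasetRegistryImpl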

@@ -5,9 +5,9 @@
 # the root directory of this source tree.

 # TODO: make these import config based
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403
 from ..registry import Registry
-from .dataset import CustomDataset, HuggingfaceDataset
+from .dataset_wrappers import CustomDataset, HuggingfaceDataset


 class DatasetRegistry(Registry[BaseDataset]):

@@ -3,76 +3,38 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import pandas
-from datasets import Dataset, load_dataset
-
-from llama_stack.apis.dataset import *  # noqa: F403
+# from llama_stack.apis.datasets import *
+# from llama_stack.distribution.registry.datasets import DatasetRegistry  # noqa: F403
+# from ..registry import Registry
+# from .dataset_wrappers import CustomDataset, HuggingfaceDataset


-class CustomDataset(BaseDataset[DictSample]):
-    def __init__(self, config: CustomDatasetDef) -> None:
-        super().__init__()
-        self.config = config
-        self.dataset = None
-        self.index = 0
+class DatasetRegistryImpl(Datasets):
+    """API Impl to interact with underlying dataset registry"""

-    @property
-    def dataset_id(self) -> str:
-        return self.config.identifier
+    def __init__(
+        self,
+    ) -> None:
+        pass

-    def __iter__(self) -> Iterator[DictSample]:
-        if not self.dataset:
-            self.load()
-        return (DictSample(data=x) for x in self.dataset)
+    async def initialize(self) -> None:
+        pass

-    def __str__(self) -> str:
-        return f"CustomDataset({self.config})"
+    async def shutdown(self) -> None:
+        pass

-    def __len__(self) -> int:
-        if not self.dataset:
-            self.load()
-        return len(self.dataset)
+    async def create_dataset(
+        self,
+        dataset_def: DatasetDef,
+    ) -> None:
+        print(f"Creating dataset {dataset_def.identifier}")

-    def load(self, n_samples: Optional[int] = None) -> None:
-        if self.dataset:
-            return
+    async def get_dataset(
+        self,
+        dataset_identifier: str,
+    ) -> DatasetDef:
+        pass

-        # TODO: better support w/ data url
-        if self.config.url.endswith(".csv"):
-            df = pandas.read_csv(self.config.url)
-        elif self.config.url.endswith(".xlsx"):
-            df = pandas.read_excel(self.config.url)
-
-        if n_samples is not None:
-            df = df.sample(n=n_samples)
-
-        self.dataset = Dataset.from_pandas(df)
-
-
-class HuggingfaceDataset(BaseDataset[DictSample]):
-    def __init__(self, config: HuggingfaceDatasetDef):
-        super().__init__()
-        self.config = config
-        self.dataset = None
-
-    @property
-    def dataset_id(self) -> str:
-        return self.config.identifier
-
-    def __iter__(self) -> Iterator[DictSample]:
-        if not self.dataset:
-            self.load()
-        return (DictSample(data=x) for x in self.dataset)
-
-    def __str__(self):
-        return f"HuggingfaceDataset({self.config})"
-
-    def __len__(self):
-        if not self.dataset:
-            self.load()
-        return len(self.dataset)
-
-    def load(self):
-        if self.dataset:
-            return
-        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
+    async def delete_dataset(self, dataset_identifier: str) -> None:
+        pass

New file (78 lines; presumably llama_stack/distribution/registry/datasets/dataset_wrappers.py, given the .dataset_wrappers import above)
@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pandas
from datasets import Dataset, load_dataset

from llama_stack.apis.datasets import *  # noqa: F403


class CustomDataset(BaseDataset[DictSample]):
    def __init__(self, config: CustomDatasetDef) -> None:
        super().__init__()
        self.config = config
        self.dataset = None
        self.index = 0

    @property
    def dataset_id(self) -> str:
        return self.config.identifier

    def __iter__(self) -> Iterator[DictSample]:
        if not self.dataset:
            self.load()
        return (DictSample(data=x) for x in self.dataset)

    def __str__(self) -> str:
        return f"CustomDataset({self.config})"

    def __len__(self) -> int:
        if not self.dataset:
            self.load()
        return len(self.dataset)

    def load(self, n_samples: Optional[int] = None) -> None:
        if self.dataset:
            return

        # TODO: better support w/ data url
        if self.config.url.endswith(".csv"):
            df = pandas.read_csv(self.config.url)
        elif self.config.url.endswith(".xlsx"):
            df = pandas.read_excel(self.config.url)

        if n_samples is not None:
            df = df.sample(n=n_samples)

        self.dataset = Dataset.from_pandas(df)


class HuggingfaceDataset(BaseDataset[DictSample]):
    def __init__(self, config: HuggingfaceDatasetDef):
        super().__init__()
        self.config = config
        self.dataset = None

    @property
    def dataset_id(self) -> str:
        return self.config.identifier

    def __iter__(self) -> Iterator[DictSample]:
        if not self.dataset:
            self.load()
        return (DictSample(data=x) for x in self.dataset)

    def __str__(self):
        return f"HuggingfaceDataset({self.config})"

    def __len__(self):
        if not self.dataset:
            self.load()
        return len(self.dataset)

    def load(self):
        if self.dataset:
            return
        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
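
To make the wrappers' lazy loading concrete, a short usage sketch (module path assumed from the import in the registry __init__ above; requires network access to the CSV URL):

from llama_stack.apis.datasets import CustomDatasetDef
from llama_stack.distribution.registry.datasets.dataset_wrappers import CustomDataset

ds = CustomDataset(
    config=CustomDatasetDef(
        identifier="mmlu-sample",
        url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
    )
)
ds.load(n_samples=5)  # fetches the CSV once and keeps 5 random rows
print(len(ds))        # 5
for sample in ds:     # __iter__ yields DictSample objects
    print(sample.data)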

@@ -12,6 +12,7 @@ from llama_stack.providers.datatypes import *  # noqa: F403
 from llama_stack.distribution.datatypes import *  # noqa: F403

 from llama_stack.apis.agents import Agents
+from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.evals import Evals
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect

@@ -23,6 +24,7 @@ from llama_stack.apis.shields import Shields
 from llama_stack.apis.telemetry import Telemetry
 from llama_stack.distribution.distribution import (
     builtin_automatically_routed_apis,
+    builtin_registry_apis,
     get_provider_registry,
 )
 from llama_stack.distribution.utils.dynamic import instantiate_class_type

@@ -40,6 +42,7 @@ def api_protocol_map() -> Dict[Api, Any]:
         Api.shields: Shields,
         Api.telemetry: Telemetry,
         Api.evals: Evals,
+        Api.datasets: Datasets,
     }

@@ -139,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An
         )
     }

+    for info in builtin_registry_apis():
+        providers_with_specs[info.registry_api.value] = {
+            "__builtin__": ProviderWithSpec(
+                provider_id="__registry__",
+                provider_type="__registry__",
+                config={},
+                spec=RegistryProviderSpec(
+                    api=info.registry_api,
+                    module="llama_stack.distribution.registry",
+                    deps__=[],
+                ),
+            )
+        }
+
     sorted_providers = topological_sort(
         {k: v.values() for k, v in providers_with_specs.items()}
     )

@@ -259,6 +276,12 @@ async def instantiate_provider(

         config = None
         args = [provider_spec.api, inner_impls, deps]
+    elif isinstance(provider_spec, RegistryProviderSpec):
+        print("ROUTER PROVIDER SPEC")
+        method = "get_registry_impl"
+
+        config = None
+        args = [provider_spec.api, deps]
     else:
         method = "get_provider_impl"
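
In effect, this branch dynamically imports the registry package and calls the hook defined earlier. A rough equivalent of the wiring (assumed; not the literal resolver code):

import importlib

# provider_spec.module was set to "llama_stack.distribution.registry" above
module = importlib.import_module("llama_stack.distribution.registry")
get_impl = getattr(module, "get_registry_impl")
# inside the resolver's event loop:
# impl = await get_impl(provider_spec.api, deps)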

@@ -28,11 +28,13 @@ class Api(Enum):
     models = "models"
     shields = "shields"
     memory_banks = "memory_banks"
-    evals = "evals"

     # built-in API
     inspect = "inspect"

+    evals = "evals"
+    datasets = "datasets"
+

 class ModelsProtocolPrivate(Protocol):
     async def list_models(self) -> List[ModelDef]: ...

@@ -9,11 +9,9 @@ from termcolor import cprint

 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.evals import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403

 from .config import MetaReferenceEvalsImplConfig
-
-# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry
 from .tasks.run_eval_task import RunEvalTask

@@ -47,7 +45,7 @@ class MetaReferenceEvalsImpl(Evals):
         eval_task_config = EvaluateTaskConfig(
             dataset_config=EvaluateDatasetConfig(
                 dataset_name=dataset,
-                row_limit=2,
+                row_limit=3,
             ),
             generation_config=EvaluateModelGenerationConfig(
                 model=model,

@@ -6,7 +6,7 @@
 import random

 from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
-from llama_stack.apis.dataset.dataset import *  # noqa: F401 F403
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403


 class AggregateScorer(BaseScorer[ScorerInputSample]):

@@ -12,6 +12,7 @@ apis:
 - inference
 - safety
 - evals
+- datasets
 providers:
   evals:
   - provider_id: meta-reference