datasets api

Xi Yan 2024-10-14 13:16:39 -07:00
parent 18fe966e96
commit f046899a1c
15 changed files with 281 additions and 80 deletions

View file

@@ -4,4 +4,4 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .dataset import *  # noqa: F401 F403
+from .datasets import *  # noqa: F401 F403

View file

@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
import json

import fire
import httpx

from .datasets import *  # noqa: F403


class DatasetClient(Datasets):
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def create_dataset(
        self,
        dataset_def: DatasetDef,
    ) -> None:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/datasets/create",
                json={
                    "dataset_def": json.loads(dataset_def.json()),
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return None

    async def get_dataset(
        self,
        dataset_identifier: str,
    ) -> DatasetDef:
        async with httpx.AsyncClient() as client:
            # must hit /datasets/get (the webmethod route for get_dataset),
            # not /datasets/create
            response = await client.post(
                f"{self.base_url}/datasets/get",
                json={
                    "dataset_identifier": dataset_identifier,
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return DatasetDef(**response.json())

    async def delete_dataset(
        self,
        dataset_identifier: str,
    ) -> None:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/datasets/delete",
                json={
                    "dataset_identifier": dataset_identifier,
                },
                headers={"Content-Type": "application/json"},
                timeout=60,
            )
            response.raise_for_status()
            return None


async def run_main(host: str, port: int):
    client = DatasetClient(f"http://{host}:{port}")

    # Register a custom dataset
    await client.create_dataset(
        dataset_def=CustomDatasetDef(
            identifier="test-dataset",
            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
        ),
    )


def main(host: str, port: int):
    asyncio.run(run_main(host, port))


if __name__ == "__main__":
    fire.Fire(main)
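For reference, the client above can be exercised end to end along these lines. This is a sketch, not part of the commit: it assumes a stack server is already listening on localhost:5000, and the import paths for DatasetClient and CustomDatasetDef are an assumption.

import asyncio

# assumed import paths; adjust to wherever this commit places the modules
from llama_stack.apis.datasets.client import DatasetClient
from llama_stack.apis.datasets.datasets import CustomDatasetDef


async def demo() -> None:
    client = DatasetClient("http://localhost:5000")
    await client.create_dataset(
        dataset_def=CustomDatasetDef(
            identifier="test-dataset",
            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
        ),
    )
    fetched = await client.get_dataset("test-dataset")  # round-trips the DatasetDef
    print(fetched)
    await client.delete_dataset("test-dataset")


asyncio.run(demo())

Because main is wrapped in fire.Fire, the same flow also runs from a shell, e.g. python -m llama_stack.apis.datasets.client localhost 5000 (module path again assumed).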

View file

@@ -143,19 +143,19 @@ class BaseDataset(ABC, Generic[TDatasetSample]):
 class Datasets(Protocol):
     @webmethod(route="/datasets/create")
-    def create_dataset(
+    async def create_dataset(
         self,
-        dataset: DatasetDef,
+        dataset_def: DatasetDef,
     ) -> None: ...

     @webmethod(route="/datasets/get")
-    def get_dataset(
+    async def get_dataset(
         self,
         dataset_identifier: str,
     ) -> DatasetDef: ...

     @webmethod(route="/datasets/delete")
-    def delete_dataset(
+    async def delete_dataset(
         self,
-        dataset_uuid: str,
+        dataset_identifier: str,
     ) -> None: ...
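To make the contract concrete: an implementation only needs these three async methods. Below is a minimal in-memory sketch, not part of the commit, assuming DatasetDef is importable as shown and exposes an identifier field (as the client code above implies).

from typing import Dict

from llama_stack.apis.datasets import DatasetDef  # assumed import path


class InMemoryDatasets:
    # structurally satisfies the Datasets protocol, with a plain dict as storage
    def __init__(self) -> None:
        self._datasets: Dict[str, DatasetDef] = {}

    async def create_dataset(self, dataset_def: DatasetDef) -> None:
        self._datasets[dataset_def.identifier] = dataset_def

    async def get_dataset(self, dataset_identifier: str) -> DatasetDef:
        return self._datasets[dataset_identifier]  # KeyError if unknown

    async def delete_dataset(self, dataset_identifier: str) -> None:
        self._datasets.pop(dataset_identifier, None)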

View file

@@ -11,7 +11,7 @@ from llama_models.schema_utils import webmethod
 from pydantic import BaseModel

 from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403


 class EvaluationJob(BaseModel):

View file

@@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec):
     pip_packages: List[str] = Field(default_factory=list)


+# Example: /datasets
+class RegistryProviderSpec(ProviderSpec):
+    provider_type: str = "registry"
+    config_class: str = ""
+    docker_image: Optional[str] = None
+    module: str
+    pip_packages: List[str] = Field(default_factory=list)
+
+
 class DistributionSpec(BaseModel):
     description: Optional[str] = Field(
         default="",

View file

@@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel):
     router_api: Api


+class RegistryApiInfo(BaseModel):
+    registry_api: Api
+    # registry: Registry
+
+
+def builtin_registry_apis() -> List[RegistryApiInfo]:
+    return [
+        RegistryApiInfo(
+            registry_api=Api.datasets,
+        )
+    ]
+
+
 def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
     return [
         AutoRoutedApiInfo(
@@ -42,7 +55,12 @@ def providable_apis() -> List[Api]:
     routing_table_apis = set(
         x.routing_table_api for x in builtin_automatically_routed_apis()
     )
-    return [api for api in Api if api not in routing_table_apis and api != Api.inspect]
+    registry_apis = set(
+        x.registry_api for x in builtin_registry_apis() if x.registry_api
+    )
+    non_providable_apis = routing_table_apis | registry_apis | {Api.inspect}
+    return [api for api in Api if api not in non_providable_apis]


 def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]:
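The effect of the new exclusion logic is easier to see with plain strings standing in for the Api enum. A toy illustration, not from the commit:

all_apis = {"inference", "safety", "memory_banks", "evals", "datasets", "inspect"}
routing_table_apis = {"memory_banks"}  # stands in for builtin_automatically_routed_apis()
registry_apis = {"datasets"}           # stands in for builtin_registry_apis()

non_providable_apis = routing_table_apis | registry_apis | {"inspect"}
print(sorted(api for api in all_apis if api not in non_providable_apis))
# ['evals', 'inference', 'safety']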

View file

@@ -3,3 +3,20 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any
+
+from llama_stack.providers.datatypes import Api
+
+from .datasets.dataset import DatasetRegistryImpl
+
+
+async def get_registry_impl(api: Api, _deps) -> Any:
+    api_to_registry = {
+        "datasets": DatasetRegistryImpl,
+    }
+
+    if api.value not in api_to_registry:
+        raise ValueError(f"API {api.value} not found in registry map")
+
+    impl = api_to_registry[api.value]()
+    await impl.initialize()
+    return impl
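A quick way to see what the factory returns. A sketch assuming get_registry_impl is importable from llama_stack.distribution.registry as defined above:

import asyncio

from llama_stack.distribution.registry import get_registry_impl  # assumed path
from llama_stack.providers.datatypes import Api


async def demo() -> None:
    impl = await get_registry_impl(Api.datasets, {})  # second argument is the unused _deps
    print(type(impl).__name__)  # DatasetRegistryImpl, already initialize()d


asyncio.run(demo())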

View file

@@ -5,9 +5,9 @@
 # the root directory of this source tree.

 # TODO: make these import config based
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403
 from ..registry import Registry
-from .dataset import CustomDataset, HuggingfaceDataset
+from .dataset_wrappers import CustomDataset, HuggingfaceDataset


 class DatasetRegistry(Registry[BaseDataset]):

View file

@@ -3,76 +3,38 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import pandas
-from datasets import Dataset, load_dataset
-
-from llama_stack.apis.dataset import *  # noqa: F403
-
-
-class CustomDataset(BaseDataset[DictSample]):
-    def __init__(self, config: CustomDatasetDef) -> None:
-        super().__init__()
-        self.config = config
-        self.dataset = None
-        self.index = 0
-
-    @property
-    def dataset_id(self) -> str:
-        return self.config.identifier
-
-    def __iter__(self) -> Iterator[DictSample]:
-        if not self.dataset:
-            self.load()
-        return (DictSample(data=x) for x in self.dataset)
-
-    def __str__(self) -> str:
-        return f"CustomDataset({self.config})"
-
-    def __len__(self) -> int:
-        if not self.dataset:
-            self.load()
-        return len(self.dataset)
-
-    def load(self, n_samples: Optional[int] = None) -> None:
-        if self.dataset:
-            return
-
-        # TODO: better support w/ data url
-        if self.config.url.endswith(".csv"):
-            df = pandas.read_csv(self.config.url)
-        elif self.config.url.endswith(".xlsx"):
-            df = pandas.read_excel(self.config.url)
-
-        if n_samples is not None:
-            df = df.sample(n=n_samples)
-
-        self.dataset = Dataset.from_pandas(df)
-
-
-class HuggingfaceDataset(BaseDataset[DictSample]):
-    def __init__(self, config: HuggingfaceDatasetDef):
-        super().__init__()
-        self.config = config
-        self.dataset = None
-
-    @property
-    def dataset_id(self) -> str:
-        return self.config.identifier
-
-    def __iter__(self) -> Iterator[DictSample]:
-        if not self.dataset:
-            self.load()
-        return (DictSample(data=x) for x in self.dataset)
-
-    def __str__(self):
-        return f"HuggingfaceDataset({self.config})"
-
-    def __len__(self):
-        if not self.dataset:
-            self.load()
-        return len(self.dataset)
-
-    def load(self):
-        if self.dataset:
-            return
-        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
+# needed for the Datasets protocol and DatasetDef referenced below
+from llama_stack.apis.datasets import *  # noqa: F403
+
+# from llama_stack.distribution.registry.datasets import DatasetRegistry
+# from ..registry import Registry
+# from .dataset_wrappers import CustomDataset, HuggingfaceDataset
+
+
+class DatasetRegistryImpl(Datasets):
+    """API Impl to interact with underlying dataset registry"""
+
+    def __init__(
+        self,
+    ) -> None:
+        pass
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def create_dataset(
+        self,
+        dataset_def: DatasetDef,
+    ) -> None:
+        # the parameter is dataset_def, not dataset
+        print(f"Creating dataset {dataset_def.identifier}")
+
+    async def get_dataset(
+        self,
+        dataset_identifier: str,
+    ) -> DatasetDef:
+        pass
+
+    async def delete_dataset(self, dataset_identifier: str) -> None:
+        pass

View file

@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import pandas
from datasets import Dataset, load_dataset

from llama_stack.apis.datasets import *  # noqa: F403


class CustomDataset(BaseDataset[DictSample]):
    def __init__(self, config: CustomDatasetDef) -> None:
        super().__init__()
        self.config = config
        self.dataset = None
        self.index = 0

    @property
    def dataset_id(self) -> str:
        return self.config.identifier

    def __iter__(self) -> Iterator[DictSample]:
        if not self.dataset:
            self.load()
        return (DictSample(data=x) for x in self.dataset)

    def __str__(self) -> str:
        return f"CustomDataset({self.config})"

    def __len__(self) -> int:
        if not self.dataset:
            self.load()
        return len(self.dataset)

    def load(self, n_samples: Optional[int] = None) -> None:
        if self.dataset:
            return

        # TODO: better support w/ data url
        if self.config.url.endswith(".csv"):
            df = pandas.read_csv(self.config.url)
        elif self.config.url.endswith(".xlsx"):
            df = pandas.read_excel(self.config.url)

        if n_samples is not None:
            df = df.sample(n=n_samples)

        self.dataset = Dataset.from_pandas(df)


class HuggingfaceDataset(BaseDataset[DictSample]):
    def __init__(self, config: HuggingfaceDatasetDef):
        super().__init__()
        self.config = config
        self.dataset = None

    @property
    def dataset_id(self) -> str:
        return self.config.identifier

    def __iter__(self) -> Iterator[DictSample]:
        if not self.dataset:
            self.load()
        return (DictSample(data=x) for x in self.dataset)

    def __str__(self):
        return f"HuggingfaceDataset({self.config})"

    def __len__(self):
        if not self.dataset:
            self.load()
        return len(self.dataset)

    def load(self):
        if self.dataset:
            return
        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
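Sketch of how these wrappers are meant to be used, assuming the import paths below (hypothetical) and that the mmlu.csv URL used elsewhere in this commit is reachable:

# assumed import paths for the wrapper and its config type
from llama_stack.apis.datasets import CustomDatasetDef
from llama_stack.distribution.registry.datasets.dataset_wrappers import CustomDataset

dataset = CustomDataset(
    config=CustomDatasetDef(
        identifier="mmlu-sample",
        url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
    )
)
dataset.load(n_samples=10)  # downloads the CSV, keeps 10 random rows
print(len(dataset))         # 10
for sample in dataset:      # iteration yields DictSample objects
    print(sample.data)
    break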

View file

@@ -12,6 +12,7 @@ from llama_stack.providers.datatypes import *  # noqa: F403
 from llama_stack.distribution.datatypes import *  # noqa: F403

 from llama_stack.apis.agents import Agents
+from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.evals import Evals
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
@@ -23,6 +24,7 @@ from llama_stack.apis.shields import Shields
 from llama_stack.apis.telemetry import Telemetry
 from llama_stack.distribution.distribution import (
     builtin_automatically_routed_apis,
+    builtin_registry_apis,
     get_provider_registry,
 )
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
@@ -40,6 +42,7 @@ def api_protocol_map() -> Dict[Api, Any]:
         Api.shields: Shields,
         Api.telemetry: Telemetry,
         Api.evals: Evals,
+        Api.datasets: Datasets,
     }
@@ -139,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, Any]:
             )
         }

+    for info in builtin_registry_apis():
+        providers_with_specs[info.registry_api.value] = {
+            "__builtin__": ProviderWithSpec(
+                provider_id="__registry__",
+                provider_type="__registry__",
+                config={},
+                spec=RegistryProviderSpec(
+                    api=info.registry_api,
+                    module="llama_stack.distribution.registry",
+                    deps__=[],
+                ),
+            )
+        }
+
     sorted_providers = topological_sort(
         {k: v.values() for k, v in providers_with_specs.items()}
     )
@@ -259,6 +276,12 @@ async def instantiate_provider(
         config = None
         args = [provider_spec.api, inner_impls, deps]
+    elif isinstance(provider_spec, RegistryProviderSpec):
+        # registry providers are instantiated via get_registry_impl
+        print("REGISTRY PROVIDER SPEC")
+        method = "get_registry_impl"
+        config = None
+        args = [provider_spec.api, deps]
     else:
         method = "get_provider_impl"

View file

@@ -28,11 +28,13 @@ class Api(Enum):
     models = "models"
     shields = "shields"
     memory_banks = "memory_banks"
-    evals = "evals"

     # built-in API
     inspect = "inspect"
+    evals = "evals"
+    datasets = "datasets"


 class ModelsProtocolPrivate(Protocol):
     async def list_models(self) -> List[ModelDef]: ...
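The new members are plain string-valued entries, so they round-trip through config files. A tiny check, not from the commit:

from llama_stack.providers.datatypes import Api

assert Api.datasets.value == "datasets"
assert Api("datasets") is Api.datasets  # reverse lookup by the wire/config value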

View file

@@ -9,11 +9,9 @@ from termcolor import cprint
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.evals import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403

 from .config import MetaReferenceEvalsImplConfig
-
-# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry
 from .tasks.run_eval_task import RunEvalTask
@@ -47,7 +45,7 @@ class MetaReferenceEvalsImpl(Evals):
         eval_task_config = EvaluateTaskConfig(
             dataset_config=EvaluateDatasetConfig(
                 dataset_name=dataset,
-                row_limit=2,
+                row_limit=3,
             ),
             generation_config=EvaluateModelGenerationConfig(
                 model=model,

View file

@@ -6,7 +6,7 @@
 import random

 from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
-from llama_stack.apis.dataset.dataset import *  # noqa: F401 F403
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403


 class AggregateScorer(BaseScorer[ScorerInputSample]):

View file

@@ -12,6 +12,7 @@ apis:
 - inference
 - safety
 - evals
+- datasets
 providers:
   evals:
   - provider_id: meta-reference
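Since the apis list controls which APIs the server enables, a quick sanity check that a run config turns on the new datasets API could look like this sketch (the file name and the pyyaml dependency are assumptions, not part of the commit):

import yaml

with open("run.yaml") as f:  # hypothetical path to the run config above
    cfg = yaml.safe_load(f)

assert "datasets" in cfg["apis"], "datasets API not enabled in this run config"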