From e709101f5b9f2731f7083ce2cb76f8dd327c172c Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 22 Oct 2024 08:24:14 -0700
Subject: [PATCH] datasets api

---
Reviewer note (text in this section is ignored when the patch is applied):
the new llama_stack/apis/datasets/datasets.py uses `Field` and `List`, so
they are imported on the existing pydantic / typing import lines. The
number of added lines (65) is unchanged; only the `index` blob id of that
file no longer reflects the edited content.

 llama_stack/apis/dataset/dataset.py        | 63 ------------------
 llama_stack/apis/datasetio/__init__.py     |  7 ++
 llama_stack/apis/datasetio/datasetio.py    |  5 ++
 .../apis/{dataset => datasets}/__init__.py |  0
 llama_stack/apis/datasets/datasets.py      | 65 +++++++++++++++++++
 5 files changed, 77 insertions(+), 63 deletions(-)
 delete mode 100644 llama_stack/apis/dataset/dataset.py
 create mode 100644 llama_stack/apis/datasetio/__init__.py
 create mode 100644 llama_stack/apis/datasetio/datasetio.py
 rename llama_stack/apis/{dataset => datasets}/__init__.py (100%)
 create mode 100644 llama_stack/apis/datasets/datasets.py

diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py
deleted file mode 100644
index 2fa8bb4e5..000000000
--- a/llama_stack/apis/dataset/dataset.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Dict, Optional, Protocol
-
-from llama_models.llama3.api.datatypes import URL
-
-from llama_models.schema_utils import json_schema_type, webmethod
-
-from pydantic import BaseModel
-
-
-@json_schema_type
-class TrainEvalDatasetColumnType(Enum):
-    dialog = "dialog"
-    text = "text"
-    media = "media"
-    number = "number"
-    json = "json"
-
-
-@json_schema_type
-class TrainEvalDataset(BaseModel):
-    """Dataset to be used for training or evaluating language models."""
-
-    # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
-
-    columns: Dict[str, TrainEvalDatasetColumnType]
-    content_url: URL
-    metadata: Optional[Dict[str, Any]] = None
-
-
-@json_schema_type
-class CreateDatasetRequest(BaseModel):
-    """Request to create a dataset."""
-
-    uuid: str
-    dataset: TrainEvalDataset
-
-
-class Datasets(Protocol):
-    @webmethod(route="/datasets/create")
-    def create_dataset(
-        self,
-        uuid: str,
-        dataset: TrainEvalDataset,
-    ) -> None: ...
-
-    @webmethod(route="/datasets/get")
-    def get_dataset(
-        self,
-        dataset_uuid: str,
-    ) -> TrainEvalDataset: ...
-
-    @webmethod(route="/datasets/delete")
-    def delete_dataset(
-        self,
-        dataset_uuid: str,
-    ) -> None: ...
diff --git a/llama_stack/apis/datasetio/__init__.py b/llama_stack/apis/datasetio/__init__.py
new file mode 100644
index 000000000..378afbba8
--- /dev/null
+++ b/llama_stack/apis/datasetio/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .datasetio import *  # noqa: F401 F403
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py
similarity index 100%
rename from llama_stack/apis/dataset/__init__.py
rename to llama_stack/apis/datasets/__init__.py
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
new file mode 100644
index 000000000..addc59952
--- /dev/null
+++ b/llama_stack/apis/datasets/datasets.py
@@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List, Optional, Protocol
+
+from llama_models.llama3.api.datatypes import URL
+
+from llama_models.schema_utils import json_schema_type, webmethod
+
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.common.type_system import ParamType
+
+
+@json_schema_type
+class DatasetSchema(BaseModel):
+    columns: Dict[str, ParamType]
+
+
+@json_schema_type
+class DatasetDef(BaseModel):
+    identifier: str = Field(
+        description="A unique name for the dataset",
+    )
+    schema: DatasetSchema = Field(
+        description="The schema definition for this dataset",
+    )
+    url: URL
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Any additional metadata for this dataset",
+    )
+
+
+@json_schema_type
+class DatasetDefWithProvider(DatasetDef):
+    provider_id: str = Field(
+        description="ID of the provider which serves this dataset",
+    )
+
+
+class Datasets(Protocol):
+    @webmethod(route="/datasets/register", method="POST")
+    async def register_dataset(
+        self,
+        dataset_def: DatasetDefWithProvider,
+    ) -> None: ...
+
+    @webmethod(route="/datasets/get", method="GET")
+    async def get_dataset(
+        self,
+        dataset_identifier: str,
+    ) -> Optional[DatasetDefWithProvider]: ...
+
+    @webmethod(route="/datasets/delete")
+    async def delete_dataset(
+        self,
+        dataset_identifier: str,
+    ) -> None: ...
+
+    @webmethod(route="/datasets/list", method="GET")
+    async def list_datasets(self) -> List[DatasetDefWithProvider]: ...