datasets api

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
This commit is contained in:
Xi Yan 2025-03-11 14:44:49 -07:00
parent 5f90be5388
commit bc551e6459

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Protocol from typing import Any, Dict, List, Literal, Optional, Protocol
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -14,9 +15,26 @@ from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type
class Schema(Enum):
    """
    Column layout of a dataset; every member names one supported format.

    :cvar jsonl_messages: A JSONL file whose records hold a "messages" list of
        chat turns. One record, pretty-printed here for readability:
        {
            "messages": [
                {"role": "user", "content": "Hello, world!"},
                {"role": "assistant", "content": "Hello, world!"}
            ]
        }
    """

    jsonl_messages = "jsonl_messages"
    # TODO: add more schemas here
class CommonDatasetFields(BaseModel): class CommonDatasetFields(BaseModel):
dataset_schema: Dict[str, ParamType] schema: Schema
url: URL uri: str
metadata: Dict[str, Any] = Field( metadata: Dict[str, Any] = Field(
default_factory=dict, default_factory=dict,
description="Any additional metadata for this dataset", description="Any additional metadata for this dataset",
@ -50,13 +68,29 @@ class Datasets(Protocol):
@webmethod(route="/datasets", method="POST") @webmethod(route="/datasets", method="POST")
async def register_dataset( async def register_dataset(
self, self,
dataset_id: str, schema: Schema,
dataset_schema: Dict[str, ParamType], uri: str,
url: URL, uri_params: Optional[Dict[str, Any]] = None,
provider_dataset_id: Optional[str] = None,
provider_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None,
) -> None: ... dataset_id: Optional[str] = None,
) -> Dataset:
"""
Register a new dataset located by a URI (a local file or a remote source).
:param schema: The schema format of the dataset. One of
- jsonl_messages: The dataset is a JSONL file with messages in column format
:param uri: The URI of the dataset. Examples:
- file://mydata.jsonl
- s3://mybucket/myfile.jsonl
- https://mywebsite.com/myfile.jsonl
- huggingface://tatsu-lab/alpaca
:param uri_params: The parameters for the URI.
- E.g. if the URI points to a Hugging Face dataset, the parameters could be uri_params={"split": "train"}
:param metadata: The metadata for the dataset.
- E.g. {"description": "My dataset"}
:param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.
"""
...
@webmethod(route="/datasets/{dataset_id:path}", method="GET") @webmethod(route="/datasets/{dataset_id:path}", method="GET")
async def get_dataset( async def get_dataset(