From 0abedd070cb7b54be95b77bff5ba2cf7537ee0e4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 00:13:27 -0700 Subject: [PATCH] comment --- docs/_static/llama-stack-spec.html | 46 +++++++++++----------- docs/_static/llama-stack-spec.yaml | 55 ++++++++++++++------------- llama_stack/apis/datasets/datasets.py | 24 ++++++------ 3 files changed, 63 insertions(+), 62 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 1df7a63a1..5044594f3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2518,7 +2518,7 @@ "tags": [ "Datasets" ], - "description": "Register a new dataset through a file or", + "description": "Register a new dataset.", "parameters": [], "requestBody": { "content": { @@ -6838,24 +6838,24 @@ ], "title": "Benchmark" }, - "DataReference": { + "DataSource": { "oneOf": [ { - "$ref": "#/components/schemas/URIDataReference" + "$ref": "#/components/schemas/URIDataSource" }, { - "$ref": "#/components/schemas/HuggingfaceDataReference" + "$ref": "#/components/schemas/HuggingfaceDataSource" }, { - "$ref": "#/components/schemas/RowsDataReference" + "$ref": "#/components/schemas/RowsDataSource" } ], "discriminator": { "propertyName": "type", "mapping": { - "uri": "#/components/schemas/URIDataReference", - "huggingface": "#/components/schemas/HuggingfaceDataReference", - "rows": "#/components/schemas/RowsDataReference" + "uri": "#/components/schemas/URIDataSource", + "huggingface": "#/components/schemas/HuggingfaceDataSource", + "rows": "#/components/schemas/RowsDataSource" } } }, @@ -6879,13 +6879,13 @@ "schema": { "type": "string", "enum": [ - "jsonl_messages" + "messages" ], "title": "Schema", "description": "Schema of the dataset. Each type has a different column format." }, - "data_reference": { - "$ref": "#/components/schemas/DataReference" + "data_source": { + "$ref": "#/components/schemas/DataSource" }, "metadata": { "type": "object", @@ -6920,12 +6920,12 @@ "provider_id", "type", "schema", - "data_reference", + "data_source", "metadata" ], "title": "Dataset" }, - "HuggingfaceDataReference": { + "HuggingfaceDataSource": { "type": "object", "properties": { "type": { @@ -6968,9 +6968,9 @@ "dataset_path", "params" ], - "title": "HuggingfaceDataReference" + "title": "HuggingfaceDataSource" }, - "RowsDataReference": { + "RowsDataSource": { "type": "object", "properties": { "type": { @@ -7012,9 +7012,9 @@ "type", "rows" ], - "title": "RowsDataReference" + "title": "RowsDataSource" }, - "URIDataReference": { + "URIDataSource": { "type": "object", "properties": { "type": { @@ -7031,7 +7031,7 @@ "type", "uri" ], - "title": "URIDataReference" + "title": "URIDataSource" }, "Model": { "type": "object", @@ -9383,13 +9383,13 @@ "schema": { "type": "string", "enum": [ - "jsonl_messages" + "messages" ], "description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format" }, - "data_reference": { - "$ref": "#/components/schemas/DataReference", - "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }" + "data_source": { + "$ref": "#/components/schemas/DataSource", + "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" }, "metadata": { "type": "object", @@ -9425,7 +9425,7 @@ "additionalProperties": false, "required": [ "schema", - "data_reference" + "data_source" ], "title": "RegisterDatasetRequest" }, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 9d5ed17c7..692d537c4 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1698,7 +1698,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: Register a new dataset through a file or + description: Register a new dataset. parameters: [] requestBody: content: @@ -4731,17 +4731,17 @@ components: - scoring_functions - metadata title: Benchmark - DataReference: + DataSource: oneOf: - - $ref: '#/components/schemas/URIDataReference' - - $ref: '#/components/schemas/HuggingfaceDataReference' - - $ref: '#/components/schemas/RowsDataReference' + - $ref: '#/components/schemas/URIDataSource' + - $ref: '#/components/schemas/HuggingfaceDataSource' + - $ref: '#/components/schemas/RowsDataSource' discriminator: propertyName: type mapping: - uri: '#/components/schemas/URIDataReference' - huggingface: '#/components/schemas/HuggingfaceDataReference' - rows: '#/components/schemas/RowsDataReference' + uri: '#/components/schemas/URIDataSource' + huggingface: '#/components/schemas/HuggingfaceDataSource' + rows: '#/components/schemas/RowsDataSource' Dataset: type: object properties: @@ -4758,12 +4758,12 @@ components: schema: type: string enum: - - jsonl_messages + - messages title: Schema description: >- Schema of the dataset. Each type has a different column format. - data_reference: - $ref: '#/components/schemas/DataReference' + data_source: + $ref: '#/components/schemas/DataSource' metadata: type: object additionalProperties: @@ -4781,10 +4781,10 @@ components: - provider_id - type - schema - - data_reference + - data_source - metadata title: Dataset - HuggingfaceDataReference: + HuggingfaceDataSource: type: object properties: type: @@ -4808,8 +4808,8 @@ components: - type - dataset_path - params - title: HuggingfaceDataReference - RowsDataReference: + title: HuggingfaceDataSource + RowsDataSource: type: object properties: type: @@ -4832,8 +4832,8 @@ components: required: - type - rows - title: RowsDataReference - URIDataReference: + title: RowsDataSource + URIDataSource: type: object properties: type: @@ -4846,7 +4846,7 @@ components: required: - type - uri - title: URIDataReference + title: URIDataSource Model: type: object properties: @@ -6346,18 +6346,19 @@ components: schema: type: string enum: - - jsonl_messages + - messages description: >- The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format - data_reference: - $ref: '#/components/schemas/DataReference' + data_source: + $ref: '#/components/schemas/DataSource' description: >- - The data reference of the dataset. Examples: - { "type": "uri", "uri": - "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl" - } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params": - { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello, - world!"}] } + The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" + } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface", + "dataset_path": "tatsu-lab/alpaca", "params": { "split": "train" } } - + { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content": + "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] + } ] } metadata: type: object additionalProperties: @@ -6377,7 +6378,7 @@ components: additionalProperties: false required: - schema - - data_reference + - data_source title: RegisterDatasetRequest RegisterModelRequest: type: object diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index f20edca31..5693c86ed 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class Schema(Enum): """ Schema of the dataset. Each type has a different column format. - :cvar jsonl_messages: The dataset is a JSONL file with messages. Examples: + :cvar messages: The dataset contains messages used for post-training. Examples: { "messages": [ {"role": "user", "content": "Hello, world!"}, @@ -25,7 +25,7 @@ class Schema(Enum): } """ - jsonl_messages = "jsonl_messages" + messages = "messages" # TODO: add more schemas here @@ -36,36 +36,36 @@ class DatasetType(Enum): @json_schema_type -class URIDataReference(BaseModel): +class URIDataSource(BaseModel): type: Literal["uri"] = "uri" uri: str @json_schema_type -class HuggingfaceDataReference(BaseModel): +class HuggingfaceDataSource(BaseModel): type: Literal["huggingface"] = "huggingface" dataset_path: str params: Dict[str, Any] @json_schema_type -class RowsDataReference(BaseModel): +class RowsDataSource(BaseModel): type: Literal["rows"] = "rows" rows: List[Dict[str, Any]] -DataReference = register_schema( +DataSource = register_schema( Annotated[ - Union[URIDataReference, HuggingfaceDataReference, RowsDataReference], + Union[URIDataSource, HuggingfaceDataSource, RowsDataSource], Field(discriminator="type"), ], - name="DataReference", + name="DataSource", ) class CommonDatasetFields(BaseModel): schema: Schema - data_reference: DataReference + data_source: DataSource metadata: Dict[str, Any] = Field( default_factory=dict, description="Any additional metadata for this dataset", @@ -100,16 +100,16 @@ class Datasets(Protocol): async def register_dataset( self, schema: Schema, - data_reference: DataReference, + data_source: DataSource, metadata: Optional[Dict[str, Any]] = None, dataset_id: Optional[str] = None, ) -> Dataset: """ - Register a new dataset through a file or + Register a new dataset. :param schema: The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format - :param data_reference: The data reference of the dataset. Examples: + :param data_source: The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"