diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 71437bd90..1df7a63a1 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6838,6 +6838,27 @@ ], "title": "Benchmark" }, + "DataReference": { + "oneOf": [ + { + "$ref": "#/components/schemas/URIDataReference" + }, + { + "$ref": "#/components/schemas/HuggingfaceDataReference" + }, + { + "$ref": "#/components/schemas/RowsDataReference" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "uri": "#/components/schemas/URIDataReference", + "huggingface": "#/components/schemas/HuggingfaceDataReference", + "rows": "#/components/schemas/RowsDataReference" + } + } + }, "Dataset": { "type": "object", "properties": { @@ -6856,10 +6877,15 @@ "default": "dataset" }, "schema": { - "$ref": "#/components/schemas/Schema" + "type": "string", + "enum": [ + "jsonl_messages" + ], + "title": "Schema", + "description": "Schema of the dataset. Each type has a different column format." }, - "uri": { - "type": "string" + "data_reference": { + "$ref": "#/components/schemas/DataReference" }, "metadata": { "type": "object", @@ -6894,18 +6920,118 @@ "provider_id", "type", "schema", - "uri", + "data_reference", "metadata" ], "title": "Dataset" }, - "Schema": { - "type": "string", - "enum": [ - "jsonl_messages" + "HuggingfaceDataReference": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "huggingface", + "default": "huggingface" + }, + "dataset_path": { + "type": "string" + }, + "params": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "dataset_path", + "params" ], - "title": "Schema", - "description": "Schema of the dataset. Each type has a different column format." + "title": "HuggingfaceDataReference" + }, + "RowsDataReference": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "rows", + "default": "rows" + }, + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "rows" + ], + "title": "RowsDataReference" + }, + "URIDataReference": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "uri", + "default": "uri" + }, + "uri": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "uri" + ], + "title": "URIDataReference" }, "Model": { "type": "object", @@ -9255,38 +9381,15 @@ "type": "object", "properties": { "schema": { - "$ref": "#/components/schemas/Schema", + "type": "string", + "enum": [ + "jsonl_messages" + ], "description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format" }, - "uri": { - "type": "string", - "description": "The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca" - }, - "uri_params": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters could be uri_params={\"split\": \"train\"}" + "data_reference": { + "$ref": "#/components/schemas/DataReference", + "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }" }, "metadata": { "type": "object", @@ -9322,7 +9425,7 @@ "additionalProperties": false, "required": [ "schema", - "uri" + "data_reference" ], "title": "RegisterDatasetRequest" }, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 5f8d0e522..9d5ed17c7 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4731,6 +4731,17 @@ components: - scoring_functions - metadata title: Benchmark + DataReference: + oneOf: + - $ref: '#/components/schemas/URIDataReference' + - $ref: '#/components/schemas/HuggingfaceDataReference' + - $ref: '#/components/schemas/RowsDataReference' + discriminator: + propertyName: type + mapping: + uri: '#/components/schemas/URIDataReference' + huggingface: '#/components/schemas/HuggingfaceDataReference' + rows: '#/components/schemas/RowsDataReference' Dataset: type: object properties: @@ -4745,9 +4756,14 @@ components: const: dataset default: dataset schema: - $ref: '#/components/schemas/Schema' - uri: type: string + enum: + - jsonl_messages + title: Schema + description: >- + Schema of the dataset. Each type has a different column format. + data_reference: + $ref: '#/components/schemas/DataReference' metadata: type: object additionalProperties: @@ -4765,16 +4781,72 @@ components: - provider_id - type - schema - - uri + - data_reference - metadata title: Dataset - Schema: - type: string - enum: - - jsonl_messages - title: Schema - description: >- - Schema of the dataset. Each type has a different column format. + HuggingfaceDataReference: + type: object + properties: + type: + type: string + const: huggingface + default: huggingface + dataset_path: + type: string + params: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - dataset_path + - params + title: HuggingfaceDataReference + RowsDataReference: + type: object + properties: + type: + type: string + const: rows + default: rows + rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - rows + title: RowsDataReference + URIDataReference: + type: object + properties: + type: + type: string + const: uri + default: uri + uri: + type: string + additionalProperties: false + required: + - type + - uri + title: URIDataReference Model: type: object properties: @@ -6272,28 +6344,20 @@ components: type: object properties: schema: - $ref: '#/components/schemas/Schema' + type: string + enum: + - jsonl_messages description: >- The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format - uri: - type: string + data_reference: + $ref: '#/components/schemas/DataReference' description: >- - The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl - - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca - uri_params: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters - could be uri_params={"split": "train"} + The data reference of the dataset. Examples: - { "type": "uri", "uri": + "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl" + } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params": + { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello, + world!"}] } metadata: type: object additionalProperties: @@ -6313,7 +6377,7 @@ components: additionalProperties: false required: - schema - - uri + - data_reference title: RegisterDatasetRequest RegisterModelRequest: type: object diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 157431ed2..049b6e8be 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -5,12 +5,12 @@ # the root directory of this source tree. from enum import Enum -from typing import Any, Dict, List, Literal, Optional, Protocol +from typing import Any, Dict, List, Literal, Optional, Protocol, Annotated, Union from pydantic import BaseModel, Field from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.schema_utils import json_schema_type, webmethod +from llama_stack.schema_utils import json_schema_type, webmethod, register_schema class Schema(Enum): @@ -29,9 +29,42 @@ class Schema(Enum): # TODO: add more schemas here +class DatasetType(Enum): + huggingface = "huggingface" + uri = "uri" + rows = "rows" + + +@json_schema_type +class URIDataReference(BaseModel): + type: Literal["uri"] = "uri" + uri: str + + +@json_schema_type +class HuggingfaceDataReference(BaseModel): + type: Literal["huggingface"] = "huggingface" + dataset_path: str + params: Dict[str, Any] + + +@json_schema_type +class RowsDataReference(BaseModel): + type: Literal["rows"] = "rows" + rows: List[Dict[str, Any]] + + +DataReference = register_schema( + Annotated[ + Union[URIDataReference, HuggingfaceDataReference, RowsDataReference], + Field(discriminator="type"), + ], + name="DataReference", +) + class CommonDatasetFields(BaseModel): schema: Schema - uri: str + data_reference: DataReference metadata: Dict[str, Any] = Field( default_factory=dict, description="Any additional metadata for this dataset", @@ -66,8 +99,7 @@ class Datasets(Protocol): async def register_dataset( self, schema: Schema, - uri: str, - uri_params: Optional[Dict[str, Any]] = None, + data_reference: DataReference, metadata: Optional[Dict[str, Any]] = None, dataset_id: Optional[str] = None, ) -> Dataset: @@ -76,13 +108,26 @@ class Datasets(Protocol): :param schema: The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format - :param uri: The URI of the dataset. Examples: - - file://mydata.jsonl - - s3://mybucket/myfile.jsonl - - https://mywebsite.com/myfile.jsonl - - huggingface://tatsu-lab/alpaca - :param uri_params: The parameters for the URI. - - E.g. If URL is a huggingface dataset, parameters could be uri_params={"split": "train"} + :param data_reference: The data reference of the dataset. Examples: + - { + "type": "uri", + "uri": "https://mywebsite.com/mydata.jsonl" + } + - { + "type": "uri", + "uri": "lsfs://mydata.jsonl" + } + - { + "type": "huggingface", + "dataset_path": "tatsu-lab/alpaca", + "params": { + "split": "train" + } + } + - { + "type": "rows", + "rows": [{"message": "Hello, world!"}] + } :param metadata: The metadata for the dataset. - E.g. {"description": "My dataset"} :param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.