diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index a698c2c9c..7e12b157f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2518,7 +2518,7 @@ "tags": [ "Datasets" ], - "description": "Register a new dataset through a file or", + "description": "Register a new dataset.", "parameters": [], "requestBody": { "content": { @@ -7144,24 +7144,24 @@ ], "title": "Benchmark" }, - "DataReference": { + "DataSource": { "oneOf": [ { - "$ref": "#/components/schemas/URIDataReference" + "$ref": "#/components/schemas/URIDataSource" }, { - "$ref": "#/components/schemas/HuggingfaceDataReference" + "$ref": "#/components/schemas/HuggingfaceDataSource" }, { - "$ref": "#/components/schemas/RowsDataReference" + "$ref": "#/components/schemas/RowsDataSource" } ], "discriminator": { "propertyName": "type", "mapping": { - "uri": "#/components/schemas/URIDataReference", - "huggingface": "#/components/schemas/HuggingfaceDataReference", - "rows": "#/components/schemas/RowsDataReference" + "uri": "#/components/schemas/URIDataSource", + "huggingface": "#/components/schemas/HuggingfaceDataSource", + "rows": "#/components/schemas/RowsDataSource" } } }, @@ -7185,13 +7185,13 @@ "schema": { "type": "string", "enum": [ - "jsonl_messages" + "messages" ], "title": "Schema", "description": "Schema of the dataset. Each type has a different column format." }, - "data_reference": { - "$ref": "#/components/schemas/DataReference" + "data_source": { + "$ref": "#/components/schemas/DataSource" }, "metadata": { "type": "object", @@ -7226,12 +7226,12 @@ "provider_id", "type", "schema", - "data_reference", + "data_source", "metadata" ], "title": "Dataset" }, - "HuggingfaceDataReference": { + "HuggingfaceDataSource": { "type": "object", "properties": { "type": { @@ -7274,9 +7274,9 @@ "dataset_path", "params" ], - "title": "HuggingfaceDataReference" + "title": "HuggingfaceDataSource" }, - "RowsDataReference": { + "RowsDataSource": { "type": "object", "properties": { "type": { @@ -7318,9 +7318,9 @@ "type", "rows" ], - "title": "RowsDataReference" + "title": "RowsDataSource" }, - "URIDataReference": { + "URIDataSource": { "type": "object", "properties": { "type": { @@ -7337,7 +7337,7 @@ "type", "uri" ], - "title": "URIDataReference" + "title": "URIDataSource" }, "Model": { "type": "object", @@ -9506,9 +9506,9 @@ "schema": { "type": "string", "enum": [ - "jsonl_messages" + "messages" ], - "description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format" + "description": "The schema format of the dataset. One of - messages: The dataset contains a messages column with list of messages for post-training." }, "data_reference": { "$ref": "#/components/schemas/DataReference", @@ -9548,7 +9548,7 @@ "additionalProperties": false, "required": [ "schema", - "data_reference" + "data_source" ], "title": "RegisterDatasetRequest" }, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 2fe35cc2c..01637f59c 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1698,7 +1698,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: Register a new dataset through a file or + description: Register a new dataset. parameters: [] requestBody: content: @@ -4974,17 +4974,17 @@ components: - scoring_functions - metadata title: Benchmark - DataReference: + DataSource: oneOf: - - $ref: '#/components/schemas/URIDataReference' - - $ref: '#/components/schemas/HuggingfaceDataReference' - - $ref: '#/components/schemas/RowsDataReference' + - $ref: '#/components/schemas/URIDataSource' + - $ref: '#/components/schemas/HuggingfaceDataSource' + - $ref: '#/components/schemas/RowsDataSource' discriminator: propertyName: type mapping: - uri: '#/components/schemas/URIDataReference' - huggingface: '#/components/schemas/HuggingfaceDataReference' - rows: '#/components/schemas/RowsDataReference' + uri: '#/components/schemas/URIDataSource' + huggingface: '#/components/schemas/HuggingfaceDataSource' + rows: '#/components/schemas/RowsDataSource' Dataset: type: object properties: @@ -5001,12 +5001,12 @@ components: schema: type: string enum: - - jsonl_messages + - messages title: Schema description: >- Schema of the dataset. Each type has a different column format. - data_reference: - $ref: '#/components/schemas/DataReference' + data_source: + $ref: '#/components/schemas/DataSource' metadata: type: object additionalProperties: @@ -5024,10 +5024,10 @@ components: - provider_id - type - schema - - data_reference + - data_source - metadata title: Dataset - HuggingfaceDataReference: + HuggingfaceDataSource: type: object properties: type: @@ -5051,8 +5051,8 @@ components: - type - dataset_path - params - title: HuggingfaceDataReference - RowsDataReference: + title: HuggingfaceDataSource + RowsDataSource: type: object properties: type: @@ -5075,8 +5075,8 @@ components: required: - type - rows - title: RowsDataReference - URIDataReference: + title: RowsDataSource + URIDataSource: type: object properties: type: @@ -5089,7 +5089,7 @@ components: required: - type - uri - title: URIDataReference + title: URIDataSource Model: type: object properties: @@ -6472,12 +6472,12 @@ components: schema: type: string enum: - - jsonl_messages + - messages description: >- - The schema format of the dataset. One of - jsonl_messages: The dataset - is a JSONL file with messages in column format - data_reference: - $ref: '#/components/schemas/DataReference' + The schema format of the dataset. One of - messages: The dataset contains + a messages column with list of messages for post-training. + data_source: + $ref: '#/components/schemas/DataSource' description: >- The data reference of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl" @@ -6504,7 +6504,7 @@ components: additionalProperties: false required: - schema - - data_reference + - data_source title: RegisterDatasetRequest RegisterModelRequest: type: object diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index f20edca31..4b3ce3e6f 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class Schema(Enum): """ Schema of the dataset. Each type has a different column format. - :cvar jsonl_messages: The dataset is a JSONL file with messages. Examples: + :cvar messages: The dataset contains messages used for post-training. Examples: { "messages": [ {"role": "user", "content": "Hello, world!"}, @@ -25,7 +25,7 @@ class Schema(Enum): } """ - jsonl_messages = "jsonl_messages" + messages = "messages" # TODO: add more schemas here @@ -36,36 +36,36 @@ class DatasetType(Enum): @json_schema_type -class URIDataReference(BaseModel): +class URIDataSource(BaseModel): type: Literal["uri"] = "uri" uri: str @json_schema_type -class HuggingfaceDataReference(BaseModel): +class HuggingfaceDataSource(BaseModel): type: Literal["huggingface"] = "huggingface" dataset_path: str params: Dict[str, Any] @json_schema_type -class RowsDataReference(BaseModel): +class RowsDataSource(BaseModel): type: Literal["rows"] = "rows" rows: List[Dict[str, Any]] -DataReference = register_schema( +DataSource = register_schema( Annotated[ - Union[URIDataReference, HuggingfaceDataReference, RowsDataReference], + Union[URIDataSource, HuggingfaceDataSource, RowsDataSource], Field(discriminator="type"), ], - name="DataReference", + name="DataSource", ) class CommonDatasetFields(BaseModel): schema: Schema - data_reference: DataReference + data_source: DataSource metadata: Dict[str, Any] = Field( default_factory=dict, description="Any additional metadata for this dataset", @@ -100,16 +100,16 @@ class Datasets(Protocol): async def register_dataset( self, schema: Schema, - data_reference: DataReference, + data_source: DataSource, metadata: Optional[Dict[str, Any]] = None, dataset_id: Optional[str] = None, ) -> Dataset: """ - Register a new dataset through a file or + Register a new dataset. :param schema: The schema format of the dataset. One of - - jsonl_messages: The dataset is a JSONL file with messages in column format - :param data_reference: The data reference of the dataset. Examples: + - messages: The dataset contains a messages column with list of messages for post-training. + :param data_source: The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"