diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 821e5ed53..856c6e715 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6846,13 +6846,14 @@ "const": "dataset", "default": "dataset" }, - "schema": { + "purpose": { "type": "string", "enum": [ - "messages" + "post-training/messages", + "eval/question-answer" ], - "title": "Schema", - "description": "Schema of the dataset. Each type has a different column format." + "title": "DatasetPurpose", + "description": "Purpose of the dataset. Each type has a different column format." }, "data_source": { "$ref": "#/components/schemas/DataSource" @@ -6889,7 +6890,7 @@ "provider_resource_id", "provider_id", "type", - "schema", + "purpose", "data_source", "metadata" ], @@ -6903,8 +6904,9 @@ "const": "huggingface", "default": "huggingface" }, - "dataset_path": { - "type": "string" + "path": { + "type": "string", + "description": "The path to the dataset in Huggingface. E.g. - \"llamastack/simpleqa\"" }, "params": { "type": "object", @@ -6929,16 +6931,18 @@ "type": "object" } ] - } + }, + "description": "The parameters for the dataset." } }, "additionalProperties": false, "required": [ "type", - "dataset_path", + "path", "params" ], - "title": "HuggingfaceDataSource" + "title": "HuggingfaceDataSource", + "description": "A dataset stored in Huggingface." }, "RowsDataSource": { "type": "object", @@ -6974,7 +6978,8 @@ } ] } - } + }, + "description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]" } }, "additionalProperties": false, @@ -6982,7 +6987,8 @@ "type", "rows" ], - "title": "RowsDataSource" + "title": "RowsDataSource", + "description": "A dataset stored in rows." 
}, "URIDataSource": { "type": "object", @@ -6993,7 +6999,8 @@ "default": "uri" }, "uri": { - "type": "string" + "type": "string", + "description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\"" } }, "additionalProperties": false, @@ -7001,7 +7008,8 @@ "type", "uri" ], - "title": "URIDataSource" + "title": "URIDataSource", + "description": "A dataset that can be obtained from a URI." }, "Model": { "type": "object", @@ -9419,14 +9427,15 @@ "RegisterDatasetRequest": { "type": "object", "properties": { - "schema": { + "purpose": { "type": "string", "enum": [ - "messages" + "post-training/messages", + "eval/question-answer" ], - "description": "The schema format of the dataset. One of - messages: The dataset contains a messages column with list of messages for post-training." + "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column." }, - "data_source": { + "source": { "$ref": "#/components/schemas/DataSource", "description": "The data source of the dataset. 
Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"} ] } ] }" }, @@ -9463,8 +9472,8 @@ }, "additionalProperties": false, "required": [ - "schema", - "data_source" + "purpose", + "source" ], "title": "RegisterDatasetRequest" }, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 21625827a..93ba4ba30 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4738,13 +4738,14 @@ components: type: string const: dataset default: dataset - schema: + purpose: type: string enum: - - messages - title: Schema + - post-training/messages + - eval/question-answer + title: DatasetPurpose description: >- - Schema of the dataset. Each type has a different column format. + Purpose of the dataset. Each type has a different column format. data_source: $ref: '#/components/schemas/DataSource' metadata: @@ -4763,7 +4764,7 @@ components: - provider_resource_id - provider_id - type - - schema + - purpose - data_source - metadata title: Dataset @@ -4774,8 +4775,10 @@ components: type: string const: huggingface default: huggingface - dataset_path: + path: type: string + description: >- + The path to the dataset in Huggingface. E.g. - "llamastack/simpleqa" params: type: object additionalProperties: @@ -4786,12 +4789,14 @@ components: - type: string - type: array - type: object + description: The parameters for the dataset. additionalProperties: false required: - type - - dataset_path + - path - params title: HuggingfaceDataSource + description: A dataset stored in Huggingface. 
RowsDataSource: type: object properties: @@ -4811,11 +4816,16 @@ components: - type: string - type: array - type: object + description: >- + The dataset is stored in rows. E.g. - [ {"messages": [{"role": "user", + "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, + world!"}]} ] additionalProperties: false required: - type - rows title: RowsDataSource + description: A dataset stored in rows. URIDataSource: type: object properties: @@ -4825,11 +4835,16 @@ components: default: uri uri: type: string + description: >- + The dataset can be obtained from a URI. E.g. - "https://mywebsite.com/mydata.jsonl" + - "lsfs://mydata.jsonl" - "data:csv;base64,{base64_content}" additionalProperties: false required: - type - uri title: URIDataSource + description: >- + A dataset that can be obtained from a URI. Model: type: object properties: @@ -6367,14 +6382,16 @@ components: RegisterDatasetRequest: type: object properties: - schema: + purpose: type: string enum: - - messages + - post-training/messages + - eval/question-answer description: >- - The schema format of the dataset. One of - messages: The dataset contains - a messages column with list of messages for post-training. - data_source: + The purpose of the dataset. One of - "post-training/messages": The dataset + contains a messages column with list of messages for post-training. - + "eval/question-answer": The dataset contains a question and answer column. + source: $ref: '#/components/schemas/DataSource' description: >- The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" @@ -6401,8 +6418,8 @@ components: The ID of the dataset. If not provided, a random ID will be generated. 
additionalProperties: false required: - - schema - - data_source + - purpose + - source title: RegisterDatasetRequest RegisterModelRequest: type: object diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index b18dd204b..26ad85422 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class DatasetPurpose(Enum): """ Purpose of the dataset. Each type has a different column format. - :cvar tuning/messages: The dataset contains messages used for post-training. Examples: + :cvar post-training/messages: The dataset contains messages used for post-training. Examples: { "messages": [ {"role": "user", "content": "Hello, world!"}, @@ -25,12 +25,19 @@ class DatasetPurpose(Enum): } """ - tuning_messages = "tuning/messages" + post_training_messages = "post-training/messages" + eval_question_answer = "eval/question-answer" # TODO: add more schemas here class DatasetType(Enum): + """ + Type of the dataset source. + :cvar huggingface: The dataset is stored in Huggingface. + :cvar uri: The dataset can be obtained from a URI. + :cvar rows: The dataset is stored in rows. + """ huggingface = "huggingface" uri = "uri" rows = "rows" @@ -38,19 +45,36 @@ class DatasetType(Enum): @json_schema_type class URIDataSource(BaseModel): + """A dataset that can be obtained from a URI. + :param uri: The dataset can be obtained from a URI. E.g. + - "https://mywebsite.com/mydata.jsonl" + - "lsfs://mydata.jsonl" + - "data:csv;base64,{base64_content}" + """ type: Literal["uri"] = "uri" uri: str @json_schema_type class HuggingfaceDataSource(BaseModel): + """A dataset stored in Huggingface. + :param path: The path to the dataset in Huggingface. E.g. + - "llamastack/simpleqa" + :param params: The parameters for the dataset. 
+ """ type: Literal["huggingface"] = "huggingface" - dataset_path: str + path: str params: Dict[str, Any] @json_schema_type class RowsDataSource(BaseModel): + """A dataset stored in rows. + :param rows: The dataset is stored in rows. E.g. + - [ + {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]} + ] + """ type: Literal["rows"] = "rows" rows: List[Dict[str, Any]] @@ -65,7 +89,10 @@ DataSource = register_schema( class CommonDatasetFields(BaseModel): - schema: Schema + """ + Common fields for a dataset. + """ + purpose: DatasetPurpose data_source: DataSource metadata: Dict[str, Any] = Field( default_factory=dict, @@ -108,9 +135,10 @@ class Datasets(Protocol): """ Register a new dataset. - :param schema: The schema format of the dataset. One of - - messages: The dataset contains a messages column with list of messages for post-training. - :param data_source: The data source of the dataset. Examples: + :param purpose: The purpose of the dataset. One of + - "post-training/messages": The dataset contains a messages column with list of messages for post-training. + - "eval/question-answer": The dataset contains a question and answer column. + :param source: The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"