diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 3bbfea0b8..43ffd0bab 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6811,10 +6811,10 @@ "type": "string", "enum": [ "post-training/messages", - "eval/question-answer" + "eval/messages-answer" ], "title": "DatasetPurpose", - "description": "Purpose of the dataset. Each type has a different column format." + "description": "Purpose of the dataset. Each purpose has a required input data schema." }, "source": { "$ref": "#/components/schemas/DataSource" @@ -9885,13 +9885,13 @@ "type": "string", "enum": [ "post-training/messages", - "eval/question-answer" + "eval/messages-answer" ], - "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column." + "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column." }, "source": { "$ref": "#/components/schemas/DataSource", - "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" + "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"huggingface\": { \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" }, "metadata": { "type": "object", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index dc434ffa7..aa353b432 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4716,10 +4716,10 @@ components: type: string enum: - post-training/messages - - eval/question-answer + - eval/messages-answer title: DatasetPurpose description: >- - Purpose of the dataset. Each type has a different column format. + Purpose of the dataset. Each purpose has a required input data schema. source: $ref: '#/components/schemas/DataSource' metadata: @@ -6776,20 +6776,21 @@ components: type: string enum: - post-training/messages - - eval/question-answer + - eval/messages-answer description: >- The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - "eval/question-answer": The dataset contains a question and answer column. + "eval/messages-answer": The dataset contains a messages column with list + of messages and an answer column. source: $ref: '#/components/schemas/DataSource' description: >- The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface", - "dataset_path": "tatsu-lab/alpaca", "params": { "split": "train" } } - - { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content": - "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] - } ] } + "huggingface": { "dataset_path": "tatsu-lab/alpaca", "params": { "split": + "train" } } } - { "type": "rows", "rows": [ { "messages": [ {"role": "user", + "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, + world!"}, ] } ] } metadata: type: object additionalProperties: diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 20587a29e..9ec05a213 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -15,18 +15,26 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class DatasetPurpose(Enum): """ - Purpose of the dataset. Each type has a different column format. - :cvar post-training/messages: The dataset contains messages used for post-training. Examples: + Purpose of the dataset. Each purpose has a required input data schema. + + :cvar post-training/messages: The dataset contains messages used for post-training. { "messages": [ {"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } + :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column. + { + "messages": [ + {"role": "user", "content": "What is the capital of France?"}, + ], + "answer": "Paris" + } """ post_training_messages = "post-training/messages" - eval_question_answer = "eval/question-answer" + eval_messages_answer = "eval/messages-answer" # TODO: add more schemas here @@ -145,7 +153,7 @@ class Datasets(Protocol): :param purpose: The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - "eval/question-answer": The dataset contains a question and answer column. + - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column. :param source: The data source of the dataset. Examples: - { "type": "uri", @@ -157,9 +165,11 @@ class Datasets(Protocol): } - { "type": "huggingface", - "dataset_path": "tatsu-lab/alpaca", - "params": { - "split": "train" + "huggingface": { + "dataset_path": "tatsu-lab/alpaca", + "params": { + "split": "train" + } } } - {