From 4f6f0f6a9101619e86d90e708f00ef86c8588283 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 23:27:01 -0700 Subject: [PATCH 1/4] update doc --- docs/_static/llama-stack-spec.html | 2 +- docs/_static/llama-stack-spec.yaml | 8 ++++---- llama_stack/apis/datasets/datasets.py | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 44459f2b9..f7166bb65 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -9448,7 +9448,7 @@ }, "source": { "$ref": "#/components/schemas/DataSource", - "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" + "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"huggingface\": { \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" }, "metadata": { "type": "object", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a257b2a7d..10db07f02 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6404,10 +6404,10 @@ components: description: >- The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface", - "dataset_path": "tatsu-lab/alpaca", "params": { "split": "train" } } - - { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content": - "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] - } ] } + "huggingface": { "dataset_path": "tatsu-lab/alpaca", "params": { "split": + "train" } } } - { "type": "rows", "rows": [ { "messages": [ {"role": "user", + "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, + world!"}, ] } ] } metadata: type: object additionalProperties: diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 20587a29e..71118667f 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -23,6 +23,13 @@ class DatasetPurpose(Enum): {"role": "assistant", "content": "Hello, world!"}, ] } + :cvar eval/question-answer: The dataset contains a question and answer column. + { + "question": [ + {"role": "user", "content": "What is the capital of France?"}, + ], + "answer": "Paris" + } """ post_training_messages = "post-training/messages" @@ -157,9 +164,11 @@ class Datasets(Protocol): } - { "type": "huggingface", - "dataset_path": "tatsu-lab/alpaca", - "params": { - "split": "train" + "huggingface": { + "dataset_path": "tatsu-lab/alpaca", + "params": { + "split": "train" + } } } - { From 772339bebfe32b18ff6549a36fd2b925fef9d572 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 23:27:45 -0700 Subject: [PATCH 2/4] update doc --- llama_stack/apis/datasets/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 71118667f..cfbd6b4ac 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -15,7 +15,8 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class DatasetPurpose(Enum): """ - Purpose of the dataset. Each type has a different column format. + Purpose of the dataset. Each purpose has a required input data schema. + :cvar post-training/messages: The dataset contains messages used for post-training. Examples: { "messages": [ From b4d118fc5c58c335073e0ed633d84744878c3d58 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 23:30:47 -0700 Subject: [PATCH 3/4] update doc --- docs/_static/llama-stack-spec.html | 8 ++++---- docs/_static/llama-stack-spec.yaml | 12 ++++++++---- llama_stack/apis/datasets/datasets.py | 26 ++++++++++++++++++++------ 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index f7166bb65..fc213b719 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6850,10 +6850,10 @@ "type": "string", "enum": [ "post-training/messages", - "eval/question-answer" + "eval/messages-answer" ], "title": "DatasetPurpose", - "description": "Purpose of the dataset. Each type has a different column format." + "description": "Purpose of the dataset. Each purpose has a required input data schema." }, "source": { "$ref": "#/components/schemas/DataSource" @@ -9442,9 +9442,9 @@ "type": "string", "enum": [ "post-training/messages", - "eval/question-answer" + "eval/messages-answer" ], - "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column." + "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - Example data rows: { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column. - Example data rows: { \"messages\": [ {\"role\": \"user\", \"content\": \"What is the capital of France?\"}, ], \"answer\": \"Paris\" }" }, "source": { "$ref": "#/components/schemas/DataSource", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 10db07f02..e3355fc78 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4742,10 +4742,10 @@ components: type: string enum: - post-training/messages - - eval/question-answer + - eval/messages-answer title: DatasetPurpose description: >- - Purpose of the dataset. Each type has a different column format. + Purpose of the dataset. Each purpose has a required input data schema. source: $ref: '#/components/schemas/DataSource' metadata: @@ -6394,11 +6394,15 @@ components: type: string enum: - post-training/messages - - eval/question-answer + - eval/messages-answer description: >- The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - "eval/question-answer": The dataset contains a question and answer column. + Example data rows: { "messages": [ {"role": "user", "content": "Hello, + world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } - "eval/messages-answer": + The dataset contains a messages column with list of messages and an answer + column. - Example data rows: { "messages": [ {"role": "user", "content": + "What is the capital of France?"}, ], "answer": "Paris" } source: $ref: '#/components/schemas/DataSource' description: >- diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index cfbd6b4ac..a731da6ba 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -16,17 +16,17 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho class DatasetPurpose(Enum): """ Purpose of the dataset. Each purpose has a required input data schema. - - :cvar post-training/messages: The dataset contains messages used for post-training. Examples: + + :cvar post-training/messages: The dataset contains messages used for post-training. { "messages": [ {"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } - :cvar eval/question-answer: The dataset contains a question and answer column. + :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column. { - "question": [ + "messages": [ {"role": "user", "content": "What is the capital of France?"}, ], "answer": "Paris" @@ -34,7 +34,7 @@ class DatasetPurpose(Enum): """ post_training_messages = "post-training/messages" - eval_question_answer = "eval/question-answer" + eval_messages_answer = "eval/messages-answer" # TODO: add more schemas here @@ -153,7 +153,21 @@ class Datasets(Protocol): :param purpose: The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - "eval/question-answer": The dataset contains a question and answer column. + - Example data rows: + { + "messages": [ + {"role": "user", "content": "Hello, world!"}, + {"role": "assistant", "content": "Hello, world!"}, + ] + } + - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column. + - Example data rows: + { + "messages": [ + {"role": "user", "content": "What is the capital of France?"}, + ], + "answer": "Paris" + } :param source: The data source of the dataset. Examples: - { "type": "uri", From 0df33049e3cef90f0e49410890926dd4d4a1107b Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 23:32:54 -0700 Subject: [PATCH 4/4] update doc --- docs/_static/llama-stack-spec.html | 2 +- docs/_static/llama-stack-spec.yaml | 7 ++----- llama_stack/apis/datasets/datasets.py | 14 -------------- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index fc213b719..247a15af4 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -9444,7 +9444,7 @@ "post-training/messages", "eval/messages-answer" ], - "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - Example data rows: { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column. - Example data rows: { \"messages\": [ {\"role\": \"user\", \"content\": \"What is the capital of France?\"}, ], \"answer\": \"Paris\" }" + "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column." }, "source": { "$ref": "#/components/schemas/DataSource", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index e3355fc78..97e0787ee 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6398,11 +6398,8 @@ components: description: >- The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - Example data rows: { "messages": [ {"role": "user", "content": "Hello, - world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } - "eval/messages-answer": - The dataset contains a messages column with list of messages and an answer - column. - Example data rows: { "messages": [ {"role": "user", "content": - "What is the capital of France?"}, ], "answer": "Paris" } + "eval/messages-answer": The dataset contains a messages column with list + of messages and an answer column. source: $ref: '#/components/schemas/DataSource' description: >- diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index a731da6ba..9ec05a213 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -153,21 +153,7 @@ class Datasets(Protocol): :param purpose: The purpose of the dataset. One of - "post-training/messages": The dataset contains a messages column with list of messages for post-training. - - Example data rows: - { - "messages": [ - {"role": "user", "content": "Hello, world!"}, - {"role": "assistant", "content": "Hello, world!"}, - ] - } - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column. - - Example data rows: - { - "messages": [ - {"role": "user", "content": "What is the capital of France?"}, - ], - "answer": "Paris" - } :param source: The data source of the dataset. Examples: - { "type": "uri",