Merge branch 'pr1573' into api_2

2025-12-31 01:30:00 +00:00 · 2025-03-12 23:36:03 -07:00 · 2025-03-12 23:36:03 -07:00 · f90dcd2a69
commit f90dcd2a69
parent 25710c3b8a 0df33049e3
3 changed files with 31 additions and 20 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -6811,10 +6811,10 @@
                        "type": "string",
                        "enum": [
                            "post-training/messages",
-                            "eval/question-answer"
+                            "eval/messages-answer"
                        ],
                        "title": "DatasetPurpose",
-                        "description": "Purpose of the dataset. Each type has a different column format."
+                        "description": "Purpose of the dataset. Each purpose has a required input data schema."
                    },
                    "source": {
                        "$ref": "#/components/schemas/DataSource"
@ -9885,13 +9885,13 @@
                        "type": "string",
                        "enum": [
                            "post-training/messages",
-                            "eval/question-answer"
+                            "eval/messages-answer"
                        ],
-                        "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column."
+                        "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column."
                    },
                    "source": {
                        "$ref": "#/components/schemas/DataSource",
-                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
+                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"huggingface\": { \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
                    },
                    "metadata": {
                        "type": "object",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -4716,10 +4716,10 @@ components:
          type: string
          enum:
            - post-training/messages
-            - eval/question-answer
+            - eval/messages-answer
          title: DatasetPurpose
          description: >-
-            Purpose of the dataset. Each type has a different column format.
+            Purpose of the dataset. Each purpose has a required input data schema.
        source:
          $ref: '#/components/schemas/DataSource'
        metadata:
@ -6776,20 +6776,21 @@ components:
          type: string
          enum:
            - post-training/messages
-            - eval/question-answer
+            - eval/messages-answer
          description: >-
            The purpose of the dataset. One of - "post-training/messages": The dataset
            contains a messages column with list of messages for post-training. -
-            "eval/question-answer": The dataset contains a question and answer column.
+            "eval/messages-answer": The dataset contains a messages column with list
+            of messages and an answer column.
        source:
          $ref: '#/components/schemas/DataSource'
          description: >-
            The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface",
-            "dataset_path": "tatsu-lab/alpaca", "params": { "split": "train" } } -
-            { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content":
-            "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ]
-            } ] }
+            "huggingface": { "dataset_path": "tatsu-lab/alpaca", "params": { "split":
+            "train" } } } - { "type": "rows", "rows": [ { "messages": [ {"role": "user",
+            "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
+            world!"}, ] } ] }
        metadata:
          type: object
          additionalProperties:
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -15,18 +15,26 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho

 class DatasetPurpose(Enum):
    """
-    Purpose of the dataset. Each type has a different column format.
-    :cvar post-training/messages: The dataset contains messages used for post-training. Examples:
+    Purpose of the dataset. Each purpose has a required input data schema.
+
+    :cvar post-training/messages: The dataset contains messages used for post-training.
        {
            "messages": [
                {"role": "user", "content": "Hello, world!"},
                {"role": "assistant", "content": "Hello, world!"},
            ]
        }
+    :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column.
+        {
+            "messages": [
+                {"role": "user", "content": "What is the capital of France?"},
+            ],
+            "answer": "Paris"
+        }
    """

    post_training_messages = "post-training/messages"
-    eval_question_answer = "eval/question-answer"
+    eval_messages_answer = "eval/messages-answer"

    # TODO: add more schemas here

@ -145,7 +153,7 @@ class Datasets(Protocol):

        :param purpose: The purpose of the dataset. One of
            - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
-            - "eval/question-answer": The dataset contains a question and answer column.
+            - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column.
        :param source: The data source of the dataset. Examples:
           - {
               "type": "uri",
@ -157,9 +165,11 @@ class Datasets(Protocol):
           }
           - {
               "type": "huggingface",
-               "dataset_path": "tatsu-lab/alpaca",
-               "params": {
-                   "split": "train"
+               "huggingface": {
+                   "dataset_path": "tatsu-lab/alpaca",
+                   "params": {
+                       "split": "train"
+                   }
               }
           }
           - {