diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 821e5ed53..856c6e715 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6846,13 +6846,14 @@
"const": "dataset",
"default": "dataset"
},
- "schema": {
+ "purpose": {
"type": "string",
"enum": [
- "messages"
+ "post-training/messages",
+ "eval/question-answer"
],
- "title": "Schema",
- "description": "Schema of the dataset. Each type has a different column format."
+ "title": "DatasetPurpose",
+ "description": "Purpose of the dataset. Each type has a different column format."
},
"data_source": {
"$ref": "#/components/schemas/DataSource"
@@ -6889,7 +6890,7 @@
"provider_resource_id",
"provider_id",
"type",
- "schema",
+ "purpose",
"data_source",
"metadata"
],
@@ -6903,8 +6904,9 @@
"const": "huggingface",
"default": "huggingface"
},
- "dataset_path": {
- "type": "string"
+ "path": {
+ "type": "string",
+ "description": "The path to the dataset in Huggingface. E.g. - \"llamastack/simpleqa\""
},
"params": {
"type": "object",
@@ -6929,16 +6931,18 @@
"type": "object"
}
]
- }
+ },
+ "description": "The parameters for the dataset."
}
},
"additionalProperties": false,
"required": [
"type",
- "dataset_path",
+ "path",
"params"
],
- "title": "HuggingfaceDataSource"
+ "title": "HuggingfaceDataSource",
+ "description": "A dataset stored in Huggingface."
},
"RowsDataSource": {
"type": "object",
@@ -6974,7 +6978,8 @@
}
]
}
- }
+ },
+ "description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]"
}
},
"additionalProperties": false,
@@ -6982,7 +6987,8 @@
"type",
"rows"
],
- "title": "RowsDataSource"
+ "title": "RowsDataSource",
+ "description": "A dataset stored in rows."
},
"URIDataSource": {
"type": "object",
@@ -6993,7 +6999,8 @@
"default": "uri"
},
"uri": {
- "type": "string"
+ "type": "string",
+ "description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\""
}
},
"additionalProperties": false,
@@ -7001,7 +7008,8 @@
"type",
"uri"
],
- "title": "URIDataSource"
+ "title": "URIDataSource",
+ "description": "A dataset that can be obtained from a URI."
},
"Model": {
"type": "object",
@@ -9419,14 +9427,15 @@
"RegisterDatasetRequest": {
"type": "object",
"properties": {
- "schema": {
+ "purpose": {
"type": "string",
"enum": [
- "messages"
+ "post-training/messages",
+ "eval/question-answer"
],
- "description": "The schema format of the dataset. One of - messages: The dataset contains a messages column with list of messages for post-training."
+ "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column."
},
- "data_source": {
+ "source": {
"$ref": "#/components/schemas/DataSource",
"description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
},
@@ -9463,8 +9472,8 @@
},
"additionalProperties": false,
"required": [
- "schema",
- "data_source"
+ "purpose",
+ "source"
],
"title": "RegisterDatasetRequest"
},
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 21625827a..93ba4ba30 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4738,13 +4738,14 @@ components:
type: string
const: dataset
default: dataset
- schema:
+ purpose:
type: string
enum:
- - messages
- title: Schema
+ - post-training/messages
+ - eval/question-answer
+ title: DatasetPurpose
description: >-
- Schema of the dataset. Each type has a different column format.
+ Purpose of the dataset. Each type has a different column format.
data_source:
$ref: '#/components/schemas/DataSource'
metadata:
@@ -4763,7 +4764,7 @@ components:
- provider_resource_id
- provider_id
- type
- - schema
+ - purpose
- data_source
- metadata
title: Dataset
@@ -4774,8 +4775,10 @@ components:
type: string
const: huggingface
default: huggingface
- dataset_path:
+ path:
type: string
+ description: >-
+ The path to the dataset in Huggingface. E.g. - "llamastack/simpleqa"
params:
type: object
additionalProperties:
@@ -4786,12 +4789,14 @@ components:
- type: string
- type: array
- type: object
+ description: The parameters for the dataset.
additionalProperties: false
required:
- type
- - dataset_path
+ - path
- params
title: HuggingfaceDataSource
+ description: A dataset stored in Huggingface.
RowsDataSource:
type: object
properties:
@@ -4811,11 +4816,16 @@ components:
- type: string
- type: array
- type: object
+ description: >-
+ The dataset is stored in rows. E.g. - [ {"messages": [{"role": "user",
+ "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
+ world!"}]} ]
additionalProperties: false
required:
- type
- rows
title: RowsDataSource
+ description: A dataset stored in rows.
URIDataSource:
type: object
properties:
@@ -4825,11 +4835,16 @@ components:
default: uri
uri:
type: string
+ description: >-
+ The dataset can be obtained from a URI. E.g. - "https://mywebsite.com/mydata.jsonl"
+ - "lsfs://mydata.jsonl" - "data:csv;base64,{base64_content}"
additionalProperties: false
required:
- type
- uri
title: URIDataSource
+ description: >-
+ A dataset that can be obtained from a URI.
Model:
type: object
properties:
@@ -6367,14 +6382,16 @@ components:
RegisterDatasetRequest:
type: object
properties:
- schema:
+ purpose:
type: string
enum:
- - messages
+ - post-training/messages
+ - eval/question-answer
description: >-
- The schema format of the dataset. One of - messages: The dataset contains
- a messages column with list of messages for post-training.
- data_source:
+ The purpose of the dataset. One of - "post-training/messages": The dataset
+ contains a messages column with list of messages for post-training. -
+ "eval/question-answer": The dataset contains a question and answer column.
+ source:
$ref: '#/components/schemas/DataSource'
description: >-
The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
@@ -6401,8 +6418,8 @@ components:
The ID of the dataset. If not provided, a random ID will be generated.
additionalProperties: false
required:
- - schema
- - data_source
+ - purpose
+ - source
title: RegisterDatasetRequest
RegisterModelRequest:
type: object
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index b18dd204b..26ad85422 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
class DatasetPurpose(Enum):
"""
Purpose of the dataset. Each type has a different column format.
- :cvar tuning/messages: The dataset contains messages used for post-training. Examples:
+ :cvar post-training/messages: The dataset contains messages used for post-training. Examples:
{
"messages": [
{"role": "user", "content": "Hello, world!"},
@@ -25,12 +25,19 @@ class DatasetPurpose(Enum):
}
"""
- tuning_messages = "tuning/messages"
+ post_training_messages = "post-training/messages"
+ eval_question_answer = "eval/question-answer"
- # TODO: add more schemas here
+ # TODO: add more purposes here
class DatasetType(Enum):
+ """
+ Type of the dataset source.
+ :cvar huggingface: The dataset is stored in Huggingface.
+ :cvar uri: The dataset can be obtained from a URI.
+ :cvar rows: The dataset is stored in rows.
+ """
huggingface = "huggingface"
uri = "uri"
rows = "rows"
@@ -38,19 +45,36 @@ class DatasetType(Enum):
@json_schema_type
class URIDataSource(BaseModel):
+ """A dataset that can be obtained from a URI.
+ :param uri: The dataset can be obtained from a URI. E.g.
+ - "https://mywebsite.com/mydata.jsonl"
+ - "lsfs://mydata.jsonl"
+ - "data:csv;base64,{base64_content}"
+ """
type: Literal["uri"] = "uri"
uri: str
@json_schema_type
class HuggingfaceDataSource(BaseModel):
+ """A dataset stored in Huggingface.
+ :param path: The path to the dataset in Huggingface. E.g.
+ - "llamastack/simpleqa"
+ :param params: The parameters for the dataset.
+ """
type: Literal["huggingface"] = "huggingface"
- dataset_path: str
+ path: str
params: Dict[str, Any]
@json_schema_type
class RowsDataSource(BaseModel):
+ """A dataset stored in rows.
+ :param rows: The dataset is stored in rows. E.g.
+ - [
+ {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
+ ]
+ """
type: Literal["rows"] = "rows"
rows: List[Dict[str, Any]]
@@ -65,7 +89,10 @@ DataSource = register_schema(
class CommonDatasetFields(BaseModel):
- schema: Schema
+ """
+ Common fields for a dataset.
+ """
+ purpose: DatasetPurpose
data_source: DataSource
metadata: Dict[str, Any] = Field(
default_factory=dict,
@@ -108,9 +135,10 @@ class Datasets(Protocol):
"""
Register a new dataset.
- :param schema: The schema format of the dataset. One of
- - messages: The dataset contains a messages column with list of messages for post-training.
- :param data_source: The data source of the dataset. Examples:
+ :param purpose: The purpose of the dataset. One of
+ - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
+ - "eval/question-answer": The dataset contains a question and answer column.
+ :param source: The data source of the dataset. Examples:
- {
"type": "uri",
"uri": "https://mywebsite.com/mydata.jsonl"