This commit is contained in:
Xi Yan 2025-03-12 18:46:40 -07:00
parent 18de4cd08a
commit a3173e8284
3 changed files with 95 additions and 41 deletions

View file

@ -6846,13 +6846,14 @@
"const": "dataset", "const": "dataset",
"default": "dataset" "default": "dataset"
}, },
"schema": { "purpose": {
"type": "string", "type": "string",
"enum": [ "enum": [
"messages" "post-training/messages",
"eval/question-answer"
], ],
"title": "Schema", "title": "DatasetPurpose",
"description": "Schema of the dataset. Each type has a different column format." "description": "Purpose of the dataset. Each type has a different column format."
}, },
"data_source": { "data_source": {
"$ref": "#/components/schemas/DataSource" "$ref": "#/components/schemas/DataSource"
@ -6889,7 +6890,7 @@
"provider_resource_id", "provider_resource_id",
"provider_id", "provider_id",
"type", "type",
"schema", "purpose",
"data_source", "data_source",
"metadata" "metadata"
], ],
@ -6903,8 +6904,9 @@
"const": "huggingface", "const": "huggingface",
"default": "huggingface" "default": "huggingface"
}, },
"dataset_path": { "path": {
"type": "string" "type": "string",
"description": "The path to the dataset in Huggingface. E.g. - \"llamastack/simpleqa\""
}, },
"params": { "params": {
"type": "object", "type": "object",
@ -6929,16 +6931,18 @@
"type": "object" "type": "object"
} }
] ]
} },
"description": "The parameters for the dataset."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type", "type",
"dataset_path", "path",
"params" "params"
], ],
"title": "HuggingfaceDataSource" "title": "HuggingfaceDataSource",
"description": "A dataset stored in Huggingface."
}, },
"RowsDataSource": { "RowsDataSource": {
"type": "object", "type": "object",
@ -6974,7 +6978,8 @@
} }
] ]
} }
} },
"description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6982,7 +6987,8 @@
"type", "type",
"rows" "rows"
], ],
"title": "RowsDataSource" "title": "RowsDataSource",
"description": "A dataset stored in rows."
}, },
"URIDataSource": { "URIDataSource": {
"type": "object", "type": "object",
@ -6993,7 +6999,8 @@
"default": "uri" "default": "uri"
}, },
"uri": { "uri": {
"type": "string" "type": "string",
"description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\""
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -7001,7 +7008,8 @@
"type", "type",
"uri" "uri"
], ],
"title": "URIDataSource" "title": "URIDataSource",
"description": "A dataset that can be obtained from a URI."
}, },
"Model": { "Model": {
"type": "object", "type": "object",
@ -9419,14 +9427,15 @@
"RegisterDatasetRequest": { "RegisterDatasetRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"schema": { "purpose": {
"type": "string", "type": "string",
"enum": [ "enum": [
"messages" "post-training/messages",
"eval/question-answer"
], ],
"description": "The schema format of the dataset. One of - messages: The dataset contains a messages column with list of messages for post-training." "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. - \"eval/question-answer\": The dataset contains a question and answer column."
}, },
"data_source": { "source": {
"$ref": "#/components/schemas/DataSource", "$ref": "#/components/schemas/DataSource",
"description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }" "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"} ] } ] }"
}, },
@ -9463,8 +9472,8 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"schema", "purpose",
"data_source" "source"
], ],
"title": "RegisterDatasetRequest" "title": "RegisterDatasetRequest"
}, },

View file

@ -4738,13 +4738,14 @@ components:
type: string type: string
const: dataset const: dataset
default: dataset default: dataset
schema: purpose:
type: string type: string
enum: enum:
- messages - post-training/messages
title: Schema - eval/question-answer
title: DatasetPurpose
description: >- description: >-
Schema of the dataset. Each type has a different column format. Purpose of the dataset. Each type has a different column format.
data_source: data_source:
$ref: '#/components/schemas/DataSource' $ref: '#/components/schemas/DataSource'
metadata: metadata:
@ -4763,7 +4764,7 @@ components:
- provider_resource_id - provider_resource_id
- provider_id - provider_id
- type - type
- schema - purpose
- data_source - data_source
- metadata - metadata
title: Dataset title: Dataset
@ -4774,8 +4775,10 @@ components:
type: string type: string
const: huggingface const: huggingface
default: huggingface default: huggingface
dataset_path: path:
type: string type: string
description: >-
The path to the dataset in Huggingface. E.g. - "llamastack/simpleqa"
params: params:
type: object type: object
additionalProperties: additionalProperties:
@ -4786,12 +4789,14 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: The parameters for the dataset.
additionalProperties: false additionalProperties: false
required: required:
- type - type
- dataset_path - path
- params - params
title: HuggingfaceDataSource title: HuggingfaceDataSource
description: A dataset stored in Huggingface.
RowsDataSource: RowsDataSource:
type: object type: object
properties: properties:
@ -4811,11 +4816,16 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: >-
The dataset is stored in rows. E.g. - [ {"messages": [{"role": "user",
"content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
world!"}]} ]
additionalProperties: false additionalProperties: false
required: required:
- type - type
- rows - rows
title: RowsDataSource title: RowsDataSource
description: A dataset stored in rows.
URIDataSource: URIDataSource:
type: object type: object
properties: properties:
@ -4825,11 +4835,16 @@ components:
default: uri default: uri
uri: uri:
type: string type: string
description: >-
The dataset can be obtained from a URI. E.g. - "https://mywebsite.com/mydata.jsonl"
- "lsfs://mydata.jsonl" - "data:csv;base64,{base64_content}"
additionalProperties: false additionalProperties: false
required: required:
- type - type
- uri - uri
title: URIDataSource title: URIDataSource
description: >-
A dataset that can be obtained from a URI.
Model: Model:
type: object type: object
properties: properties:
@ -6367,14 +6382,16 @@ components:
RegisterDatasetRequest: RegisterDatasetRequest:
type: object type: object
properties: properties:
schema: purpose:
type: string type: string
enum: enum:
- messages - post-training/messages
- eval/question-answer
description: >- description: >-
The schema format of the dataset. One of - messages: The dataset contains The purpose of the dataset. One of - "post-training/messages": The dataset
a messages column with list of messages for post-training. contains a messages column with list of messages for post-training. -
data_source: "eval/question-answer": The dataset contains a question and answer column.
source:
$ref: '#/components/schemas/DataSource' $ref: '#/components/schemas/DataSource'
description: >- description: >-
The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl" The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
@ -6401,8 +6418,8 @@ components:
The ID of the dataset. If not provided, a random ID will be generated. The ID of the dataset. If not provided, a random ID will be generated.
additionalProperties: false additionalProperties: false
required: required:
- schema - purpose
- data_source - source
title: RegisterDatasetRequest title: RegisterDatasetRequest
RegisterModelRequest: RegisterModelRequest:
type: object type: object

View file

@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
class DatasetPurpose(Enum): class DatasetPurpose(Enum):
""" """
Purpose of the dataset. Each type has a different column format. Purpose of the dataset. Each type has a different column format.
:cvar tuning/messages: The dataset contains messages used for post-training. Examples: :cvar post-training/messages: The dataset contains messages used for post-training. Examples:
{ {
"messages": [ "messages": [
{"role": "user", "content": "Hello, world!"}, {"role": "user", "content": "Hello, world!"},
@ -25,12 +25,19 @@ class DatasetPurpose(Enum):
} }
""" """
tuning_messages = "tuning/messages" post_training_messages = "post-training/messages"
eval_question_answer = "eval/question-answer"
# TODO: add more purposes here
class DatasetType(Enum): class DatasetType(Enum):
"""
Type of the dataset source.
:cvar huggingface: The dataset is stored in Huggingface.
:cvar uri: The dataset can be obtained from a URI.
:cvar rows: The dataset is stored in rows.
"""
huggingface = "huggingface" huggingface = "huggingface"
uri = "uri" uri = "uri"
rows = "rows" rows = "rows"
@ -38,19 +45,36 @@ class DatasetType(Enum):
@json_schema_type @json_schema_type
class URIDataSource(BaseModel): class URIDataSource(BaseModel):
"""A dataset that can be obtained from a URI.
:param uri: The dataset can be obtained from a URI. E.g.
- "https://mywebsite.com/mydata.jsonl"
- "lsfs://mydata.jsonl"
- "data:csv;base64,{base64_content}"
"""
type: Literal["uri"] = "uri" type: Literal["uri"] = "uri"
uri: str uri: str
@json_schema_type @json_schema_type
class HuggingfaceDataSource(BaseModel): class HuggingfaceDataSource(BaseModel):
"""A dataset stored in Huggingface.
:param path: The path to the dataset in Huggingface. E.g.
- "llamastack/simpleqa"
:param params: The parameters for the dataset.
"""
type: Literal["huggingface"] = "huggingface" type: Literal["huggingface"] = "huggingface"
dataset_path: str path: str
params: Dict[str, Any] params: Dict[str, Any]
@json_schema_type @json_schema_type
class RowsDataSource(BaseModel): class RowsDataSource(BaseModel):
"""A dataset stored in rows.
:param rows: The dataset is stored in rows. E.g.
- [
{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
]
"""
type: Literal["rows"] = "rows" type: Literal["rows"] = "rows"
rows: List[Dict[str, Any]] rows: List[Dict[str, Any]]
@ -65,7 +89,10 @@ DataSource = register_schema(
class CommonDatasetFields(BaseModel): class CommonDatasetFields(BaseModel):
schema: Schema """
Common fields for a dataset.
"""
purpose: DatasetPurpose
data_source: DataSource data_source: DataSource
metadata: Dict[str, Any] = Field( metadata: Dict[str, Any] = Field(
default_factory=dict, default_factory=dict,
@ -108,9 +135,10 @@ class Datasets(Protocol):
""" """
Register a new dataset. Register a new dataset.
:param schema: The schema format of the dataset. One of :param purpose: The purpose of the dataset. One of
- messages: The dataset contains a messages column with list of messages for post-training. - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
:param data_source: The data source of the dataset. Examples: - "eval/question-answer": The dataset contains a question and answer column.
:param source: The data source of the dataset. Examples:
- { - {
"type": "uri", "type": "uri",
"uri": "https://mywebsite.com/mydata.jsonl" "uri": "https://mywebsite.com/mydata.jsonl"