comment

2025-03-12 00:13:27 -07:00 · 2025-03-12 00:13:27 -07:00 · 0abedd070c
commit 0abedd070c
parent 817331e76e
3 changed files with 63 additions and 62 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2518,7 +2518,7 @@
                "tags": [
                    "Datasets"
                ],
-                "description": "Register a new dataset through a file or",
+                "description": "Register a new dataset.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -6838,24 +6838,24 @@
                ],
                "title": "Benchmark"
            },
-            "DataReference": {
+            "DataSource": {
                "oneOf": [
                    {
-                        "$ref": "#/components/schemas/URIDataReference"
+                        "$ref": "#/components/schemas/URIDataSource"
                    },
                    {
-                        "$ref": "#/components/schemas/HuggingfaceDataReference"
+                        "$ref": "#/components/schemas/HuggingfaceDataSource"
                    },
                    {
-                        "$ref": "#/components/schemas/RowsDataReference"
+                        "$ref": "#/components/schemas/RowsDataSource"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
-                        "uri": "#/components/schemas/URIDataReference",
+                        "uri": "#/components/schemas/URIDataSource",
-                        "huggingface": "#/components/schemas/HuggingfaceDataReference",
+                        "huggingface": "#/components/schemas/HuggingfaceDataSource",
-                        "rows": "#/components/schemas/RowsDataReference"
+                        "rows": "#/components/schemas/RowsDataSource"
                    }
                }
            },
@ -6879,13 +6879,13 @@
                    "schema": {
                        "type": "string",
                        "enum": [
-                            "jsonl_messages"
+                            "messages"
                        ],
                        "title": "Schema",
                        "description": "Schema of the dataset. Each type has a different column format."
                    },
-                    "data_reference": {
+                    "data_source": {
-                        "$ref": "#/components/schemas/DataReference"
+                        "$ref": "#/components/schemas/DataSource"
                    },
                    "metadata": {
                        "type": "object",
@ -6920,12 +6920,12 @@
                    "provider_id",
                    "type",
                    "schema",
-                    "data_reference",
+                    "data_source",
                    "metadata"
                ],
                "title": "Dataset"
            },
-            "HuggingfaceDataReference": {
+            "HuggingfaceDataSource": {
                "type": "object",
                "properties": {
                    "type": {
@ -6968,9 +6968,9 @@
                    "dataset_path",
                    "params"
                ],
-                "title": "HuggingfaceDataReference"
+                "title": "HuggingfaceDataSource"
            },
-            "RowsDataReference": {
+            "RowsDataSource": {
                "type": "object",
                "properties": {
                    "type": {
@ -7012,9 +7012,9 @@
                    "type",
                    "rows"
                ],
-                "title": "RowsDataReference"
+                "title": "RowsDataSource"
            },
-            "URIDataReference": {
+            "URIDataSource": {
                "type": "object",
                "properties": {
                    "type": {
@ -7031,7 +7031,7 @@
                    "type",
                    "uri"
                ],
-                "title": "URIDataReference"
+                "title": "URIDataSource"
            },
            "Model": {
                "type": "object",
@ -9383,13 +9383,13 @@
                    "schema": {
                        "type": "string",
                        "enum": [
-                            "jsonl_messages"
+                            "messages"
                        ],
                        "description": "The schema format of the dataset. One of - messages: The dataset is a JSONL file with messages in column format"
                    },
-                    "data_reference": {
+                    "data_source": {
-                        "$ref": "#/components/schemas/DataReference",
+                        "$ref": "#/components/schemas/DataSource",
-                        "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }"
+                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"} ] } ] }"
                    },
                    "metadata": {
                        "type": "object",
@ -9425,7 +9425,7 @@
                "additionalProperties": false,
                "required": [
                    "schema",
-                    "data_reference"
+                    "data_source"
                ],
                "title": "RegisterDatasetRequest"
            },
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1698,7 +1698,7 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Datasets
-      description: Register a new dataset through a file or
+      description: Register a new dataset.
      parameters: []
      requestBody:
        content:
@ -4731,17 +4731,17 @@ components:
        - scoring_functions
        - metadata
      title: Benchmark
-    DataReference:
+    DataSource:
      oneOf:
-        - $ref: '#/components/schemas/URIDataReference'
+        - $ref: '#/components/schemas/URIDataSource'
-        - $ref: '#/components/schemas/HuggingfaceDataReference'
+        - $ref: '#/components/schemas/HuggingfaceDataSource'
-        - $ref: '#/components/schemas/RowsDataReference'
+        - $ref: '#/components/schemas/RowsDataSource'
      discriminator:
        propertyName: type
        mapping:
-          uri: '#/components/schemas/URIDataReference'
+          uri: '#/components/schemas/URIDataSource'
-          huggingface: '#/components/schemas/HuggingfaceDataReference'
+          huggingface: '#/components/schemas/HuggingfaceDataSource'
-          rows: '#/components/schemas/RowsDataReference'
+          rows: '#/components/schemas/RowsDataSource'
    Dataset:
      type: object
      properties:
@ -4758,12 +4758,12 @@ components:
        schema:
          type: string
          enum:
-            - jsonl_messages
+            - messages
          title: Schema
          description: >-
            Schema of the dataset. Each type has a different column format.
-        data_reference:
+        data_source:
-          $ref: '#/components/schemas/DataReference'
+          $ref: '#/components/schemas/DataSource'
        metadata:
          type: object
          additionalProperties:
@ -4781,10 +4781,10 @@ components:
        - provider_id
        - type
        - schema
-        - data_reference
+        - data_source
        - metadata
      title: Dataset
-    HuggingfaceDataReference:
+    HuggingfaceDataSource:
      type: object
      properties:
        type:
@ -4808,8 +4808,8 @@ components:
        - type
        - dataset_path
        - params
-      title: HuggingfaceDataReference
+      title: HuggingfaceDataSource
-    RowsDataReference:
+    RowsDataSource:
      type: object
      properties:
        type:
@ -4832,8 +4832,8 @@ components:
      required:
        - type
        - rows
-      title: RowsDataReference
+      title: RowsDataSource
-    URIDataReference:
+    URIDataSource:
      type: object
      properties:
        type:
@ -4846,7 +4846,7 @@ components:
      required:
        - type
        - uri
-      title: URIDataReference
+      title: URIDataSource
    Model:
      type: object
      properties:
@ -6346,18 +6346,19 @@ components:
        schema:
          type: string
          enum:
-            - jsonl_messages
+            - messages
          description: >-
            The schema format of the dataset. One of - messages: The dataset
            is a JSONL file with messages in column format
-        data_reference:
+        data_source:
-          $ref: '#/components/schemas/DataReference'
+          $ref: '#/components/schemas/DataSource'
          description: >-
-            The data reference of the dataset. Examples: - { "type": "uri", "uri":
+            The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
-            "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl"
+            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface",
-            } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params":
+            "dataset_path": "tatsu-lab/alpaca", "params": { "split": "train" } } -
-            { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello,
+            { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content":
-            world!"}] }
+            "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"} ]
            } ] }
        metadata:
          type: object
          additionalProperties:
@ -6377,7 +6378,7 @@ components:
      additionalProperties: false
      required:
        - schema
-        - data_reference
+        - data_source
      title: RegisterDatasetRequest
    RegisterModelRequest:
      type: object
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
 class Schema(Enum):
    """
    Schema of the dataset. Each type has a different column format.
-    :cvar jsonl_messages: The dataset is a JSONL file with messages. Examples:
+    :cvar messages: The dataset contains messages used for post-training. Examples:
        {
            "messages": [
                {"role": "user", "content": "Hello, world!"},
@ -25,7 +25,7 @@ class Schema(Enum):
        }
    """
-    jsonl_messages = "jsonl_messages"
+    messages = "messages"
    # TODO: add more schemas here
@ -36,36 +36,36 @@ class DatasetType(Enum):
@json_schema_type
-class URIDataReference(BaseModel):
+class URIDataSource(BaseModel):
    type: Literal["uri"] = "uri"
    uri: str
@json_schema_type
-class HuggingfaceDataReference(BaseModel):
+class HuggingfaceDataSource(BaseModel):
    type: Literal["huggingface"] = "huggingface"
    dataset_path: str
    params: Dict[str, Any]
@json_schema_type
-class RowsDataReference(BaseModel):
+class RowsDataSource(BaseModel):
    type: Literal["rows"] = "rows"
    rows: List[Dict[str, Any]]
-DataReference = register_schema(
+DataSource = register_schema(
    Annotated[
-        Union[URIDataReference, HuggingfaceDataReference, RowsDataReference],
+        Union[URIDataSource, HuggingfaceDataSource, RowsDataSource],
        Field(discriminator="type"),
    ],
-    name="DataReference",
+    name="DataSource",
 )
 class CommonDatasetFields(BaseModel):
    schema: Schema
-    data_reference: DataReference
+    data_source: DataSource
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Any additional metadata for this dataset",
@ -100,16 +100,16 @@ class Datasets(Protocol):
    async def register_dataset(
        self,
        schema: Schema,
-        data_reference: DataReference,
+        data_source: DataSource,
        metadata: Optional[Dict[str, Any]] = None,
        dataset_id: Optional[str] = None,
    ) -> Dataset:
        """
-        Register a new dataset through a file or
+        Register a new dataset.
        :param schema: The schema format of the dataset. One of
            - messages: The dataset is a JSONL file with messages in column format
-        :param data_reference: The data reference of the dataset. Examples:
+        :param data_source: The data source of the dataset. Examples:
           - {
               "type": "uri",
               "uri": "https://mywebsite.com/mydata.jsonl"