update

2025-03-11 18:29:55 -07:00 · 2025-03-11 18:29:55 -07:00 · 0e47c65051
commit 0e47c65051
parent 02aa9a1e85
3 changed files with 294 additions and 82 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6838,6 +6838,27 @@
                ],
                "title": "Benchmark"
            },
            "DataReference": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/URIDataReference"
                    },
                    {
                        "$ref": "#/components/schemas/HuggingfaceDataReference"
                    },
                    {
                        "$ref": "#/components/schemas/RowsDataReference"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "uri": "#/components/schemas/URIDataReference",
                        "huggingface": "#/components/schemas/HuggingfaceDataReference",
                        "rows": "#/components/schemas/RowsDataReference"
                    }
                }
            },
            "Dataset": {
                "type": "object",
                "properties": {
@@ -6856,10 +6877,15 @@
                        "default": "dataset"
                    },
                    "schema": {
-                        "$ref": "#/components/schemas/Schema"
+                        "type": "string",
                        "enum": [
                            "jsonl_messages"
                        ],
                        "title": "Schema",
                        "description": "Schema of the dataset. Each type has a different column format."
                    },
-                    "uri": {
+                    "data_reference": {
-                        "type": "string"
+                        "$ref": "#/components/schemas/DataReference"
                    },
                    "metadata": {
                        "type": "object",
@@ -6894,18 +6920,118 @@
                    "provider_id",
                    "type",
                    "schema",
-                    "uri",
+                    "data_reference",
                    "metadata"
                ],
                "title": "Dataset"
            },
-            "Schema": {
+            "HuggingfaceDataReference": {
-                "type": "string",
+                "type": "object",
-                "enum": [
+                "properties": {
-                    "jsonl_messages"
+                    "type": {
                        "type": "string",
                        "const": "huggingface",
                        "default": "huggingface"
                    },
                    "dataset_path": {
                        "type": "string"
                    },
                    "params": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "type": "null"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "string"
                                },
                                {
                                    "type": "array"
                                },
                                {
                                    "type": "object"
                                }
                            ]
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "dataset_path",
                    "params"
                ],
-                "title": "Schema",
+                "title": "HuggingfaceDataReference"
-                "description": "Schema of the dataset. Each type has a different column format."
+            },
            "RowsDataReference": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "rows",
                        "default": "rows"
                    },
                    "rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "rows"
                ],
                "title": "RowsDataReference"
            },
            "URIDataReference": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "uri",
                        "default": "uri"
                    },
                    "uri": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "uri"
                ],
                "title": "URIDataReference"
            },
            "Model": {
                "type": "object",
@@ -9255,38 +9381,15 @@
                "type": "object",
                "properties": {
                    "schema": {
-                        "$ref": "#/components/schemas/Schema",
+                        "type": "string",
                        "enum": [
                            "jsonl_messages"
                        ],
                        "description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format"
                    },
-                    "uri": {
+                    "data_reference": {
-                        "type": "string",
+                        "$ref": "#/components/schemas/DataReference",
-                        "description": "The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca"
+                        "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }"
                    },
                    "uri_params": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "type": "null"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "string"
                                },
                                {
                                    "type": "array"
                                },
                                {
                                    "type": "object"
                                }
                            ]
                        },
                        "description": "The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters could be uri_params={\"split\": \"train\"}"
                    },
                    "metadata": {
                        "type": "object",
@@ -9322,7 +9425,7 @@
                "additionalProperties": false,
                "required": [
                    "schema",
-                    "uri"
+                    "data_reference"
                ],
                "title": "RegisterDatasetRequest"
            },
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4731,6 +4731,17 @@ components:
        - scoring_functions
        - metadata
      title: Benchmark
    DataReference:
      oneOf:
        - $ref: '#/components/schemas/URIDataReference'
        - $ref: '#/components/schemas/HuggingfaceDataReference'
        - $ref: '#/components/schemas/RowsDataReference'
      discriminator:
        propertyName: type
        mapping:
          uri: '#/components/schemas/URIDataReference'
          huggingface: '#/components/schemas/HuggingfaceDataReference'
          rows: '#/components/schemas/RowsDataReference'
    Dataset:
      type: object
      properties:
@@ -4745,9 +4756,14 @@ components:
          const: dataset
          default: dataset
        schema:
          $ref: '#/components/schemas/Schema'
        uri:
          type: string
          enum:
            - jsonl_messages
          title: Schema
          description: >-
            Schema of the dataset. Each type has a different column format.
        data_reference:
          $ref: '#/components/schemas/DataReference'
        metadata:
          type: object
          additionalProperties:
@@ -4765,16 +4781,72 @@ components:
        - provider_id
        - type
        - schema
-        - uri
+        - data_reference
        - metadata
      title: Dataset
-    Schema:
+    HuggingfaceDataReference:
-      type: string
+      type: object
-      enum:
+      properties:
-        - jsonl_messages
+        type:
-      title: Schema
+          type: string
-      description: >-
+          const: huggingface
-        Schema of the dataset. Each type has a different column format.
+          default: huggingface
        dataset_path:
          type: string
        params:
          type: object
          additionalProperties:
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
      additionalProperties: false
      required:
        - type
        - dataset_path
        - params
      title: HuggingfaceDataReference
    RowsDataReference:
      type: object
      properties:
        type:
          type: string
          const: rows
          default: rows
        rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
      additionalProperties: false
      required:
        - type
        - rows
      title: RowsDataReference
    URIDataReference:
      type: object
      properties:
        type:
          type: string
          const: uri
          default: uri
        uri:
          type: string
      additionalProperties: false
      required:
        - type
        - uri
      title: URIDataReference
    Model:
      type: object
      properties:
@@ -6272,28 +6344,20 @@ components:
      type: object
      properties:
        schema:
-          $ref: '#/components/schemas/Schema'
+          type: string
          enum:
            - jsonl_messages
          description: >-
            The schema format of the dataset. One of - jsonl_messages: The dataset
            is a JSONL file with messages in column format
-        uri:
+        data_reference:
-          type: string
+          $ref: '#/components/schemas/DataReference'
          description: >-
-            The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl
+            The data reference of the dataset. Examples: - { "type": "uri", "uri":
-            - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca
+            "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl"
-        uri_params:
+            } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params":
-          type: object
+            { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello,
-          additionalProperties:
+            world!"}] }
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
          description: >-
            The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters
            could be uri_params={"split": "train"}
        metadata:
          type: object
          additionalProperties:
@@ -6313,7 +6377,7 @@ components:
      additionalProperties: false
      required:
        - schema
-        - uri
+        - data_reference
      title: RegisterDatasetRequest
    RegisterModelRequest:
      type: object
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -5,12 +5,12 @@
 # the root directory of this source tree.
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Protocol
+from typing import Any, Dict, List, Literal, Optional, Protocol, Annotated, Union
 from pydantic import BaseModel, Field
 from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.schema_utils import json_schema_type, webmethod, register_schema
 class Schema(Enum):
@@ -29,9 +29,42 @@ class Schema(Enum):
    # TODO: add more schemas here
 class DatasetType(Enum):
    huggingface = "huggingface"
    uri = "uri"
    rows = "rows"
@json_schema_type
 class URIDataReference(BaseModel):
    type: Literal["uri"] = "uri"
    uri: str
@json_schema_type
 class HuggingfaceDataReference(BaseModel):
    type: Literal["huggingface"] = "huggingface"
    dataset_path: str
    params: Dict[str, Any]
@json_schema_type
 class RowsDataReference(BaseModel):
    type: Literal["rows"] = "rows"
    rows: List[Dict[str, Any]]
 DataReference = register_schema(
    Annotated[
        Union[URIDataReference, HuggingfaceDataReference, RowsDataReference],
        Field(discriminator="type"),
    ],
    name="DataReference",
 )
 class CommonDatasetFields(BaseModel):
    schema: Schema
-    uri: str
+    data_reference: DataReference
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Any additional metadata for this dataset",
@@ -66,8 +99,7 @@ class Datasets(Protocol):
    async def register_dataset(
        self,
        schema: Schema,
-        uri: str,
+        data_reference: DataReference,
        uri_params: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        dataset_id: Optional[str] = None,
    ) -> Dataset:
@@ -76,13 +108,26 @@ class Datasets(Protocol):
        :param schema: The schema format of the dataset. One of
            - jsonl_messages: The dataset is a JSONL file with messages in column format
-        :param uri: The URI of the dataset. Examples:
+        :param data_reference: The data reference of the dataset. Examples:
-           - file://mydata.jsonl
+           - {
-           - s3://mybucket/myfile.jsonl
+               "type": "uri",
-           - https://mywebsite.com/myfile.jsonl
+               "uri": "https://mywebsite.com/mydata.jsonl"
-           - huggingface://tatsu-lab/alpaca
+           }
-        :param uri_params: The parameters for the URI.
+           - {
-           - E.g. If URL is a huggingface dataset, parameters could be uri_params={"split": "train"}
+               "type": "uri",
               "uri": "lsfs://mydata.jsonl"
           }
           - {
               "type": "huggingface",
               "dataset_path": "tatsu-lab/alpaca",
               "params": {
                   "split": "train"
               }
           }
           - {
               "type": "rows",
               "rows": [{"message": "Hello, world!"}]
           }
        :param metadata: The metadata for the dataset.
           - E.g. {"description": "My dataset"}
        :param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.