diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 71437bd90..1df7a63a1 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6838,6 +6838,27 @@
],
"title": "Benchmark"
},
+ "DataReference": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/URIDataReference"
+ },
+ {
+ "$ref": "#/components/schemas/HuggingfaceDataReference"
+ },
+ {
+ "$ref": "#/components/schemas/RowsDataReference"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "uri": "#/components/schemas/URIDataReference",
+ "huggingface": "#/components/schemas/HuggingfaceDataReference",
+ "rows": "#/components/schemas/RowsDataReference"
+ }
+ }
+ },
"Dataset": {
"type": "object",
"properties": {
@@ -6856,10 +6877,15 @@
"default": "dataset"
},
"schema": {
- "$ref": "#/components/schemas/Schema"
+ "type": "string",
+ "enum": [
+ "jsonl_messages"
+ ],
+ "title": "Schema",
+ "description": "Schema of the dataset. Each type has a different column format."
},
- "uri": {
- "type": "string"
+ "data_reference": {
+ "$ref": "#/components/schemas/DataReference"
},
"metadata": {
"type": "object",
@@ -6894,18 +6920,118 @@
"provider_id",
"type",
"schema",
- "uri",
+ "data_reference",
"metadata"
],
"title": "Dataset"
},
- "Schema": {
- "type": "string",
- "enum": [
- "jsonl_messages"
+ "HuggingfaceDataReference": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "huggingface",
+ "default": "huggingface"
+ },
+ "dataset_path": {
+ "type": "string"
+ },
+ "params": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "dataset_path",
+ "params"
],
- "title": "Schema",
- "description": "Schema of the dataset. Each type has a different column format."
+ "title": "HuggingfaceDataReference"
+ },
+ "RowsDataReference": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "rows",
+ "default": "rows"
+ },
+ "rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "rows"
+ ],
+ "title": "RowsDataReference"
+ },
+ "URIDataReference": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "uri",
+ "default": "uri"
+ },
+ "uri": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "uri"
+ ],
+ "title": "URIDataReference"
},
"Model": {
"type": "object",
@@ -9255,38 +9381,15 @@
"type": "object",
"properties": {
"schema": {
- "$ref": "#/components/schemas/Schema",
+ "type": "string",
+ "enum": [
+ "jsonl_messages"
+ ],
"description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format"
},
- "uri": {
- "type": "string",
- "description": "The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca"
- },
- "uri_params": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters could be uri_params={\"split\": \"train\"}"
+ "data_reference": {
+ "$ref": "#/components/schemas/DataReference",
+ "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }"
},
"metadata": {
"type": "object",
@@ -9322,7 +9425,7 @@
"additionalProperties": false,
"required": [
"schema",
- "uri"
+ "data_reference"
],
"title": "RegisterDatasetRequest"
},
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 5f8d0e522..9d5ed17c7 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4731,6 +4731,17 @@ components:
- scoring_functions
- metadata
title: Benchmark
+ DataReference:
+ oneOf:
+ - $ref: '#/components/schemas/URIDataReference'
+ - $ref: '#/components/schemas/HuggingfaceDataReference'
+ - $ref: '#/components/schemas/RowsDataReference'
+ discriminator:
+ propertyName: type
+ mapping:
+ uri: '#/components/schemas/URIDataReference'
+ huggingface: '#/components/schemas/HuggingfaceDataReference'
+ rows: '#/components/schemas/RowsDataReference'
Dataset:
type: object
properties:
@@ -4745,9 +4756,14 @@ components:
const: dataset
default: dataset
schema:
- $ref: '#/components/schemas/Schema'
- uri:
type: string
+ enum:
+ - jsonl_messages
+ title: Schema
+ description: >-
+ Schema of the dataset. Each type has a different column format.
+ data_reference:
+ $ref: '#/components/schemas/DataReference'
metadata:
type: object
additionalProperties:
@@ -4765,16 +4781,72 @@ components:
- provider_id
- type
- schema
- - uri
+ - data_reference
- metadata
title: Dataset
- Schema:
- type: string
- enum:
- - jsonl_messages
- title: Schema
- description: >-
- Schema of the dataset. Each type has a different column format.
+ HuggingfaceDataReference:
+ type: object
+ properties:
+ type:
+ type: string
+ const: huggingface
+ default: huggingface
+ dataset_path:
+ type: string
+ params:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - type
+ - dataset_path
+ - params
+ title: HuggingfaceDataReference
+ RowsDataReference:
+ type: object
+ properties:
+ type:
+ type: string
+ const: rows
+ default: rows
+ rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - type
+ - rows
+ title: RowsDataReference
+ URIDataReference:
+ type: object
+ properties:
+ type:
+ type: string
+ const: uri
+ default: uri
+ uri:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - uri
+ title: URIDataReference
Model:
type: object
properties:
@@ -6272,28 +6344,20 @@ components:
type: object
properties:
schema:
- $ref: '#/components/schemas/Schema'
+ type: string
+ enum:
+ - jsonl_messages
description: >-
The schema format of the dataset. One of - jsonl_messages: The dataset
is a JSONL file with messages in column format
- uri:
- type: string
+ data_reference:
+ $ref: '#/components/schemas/DataReference'
description: >-
- The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl
- - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca
- uri_params:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters
- could be uri_params={"split": "train"}
+ The data reference of the dataset. Examples: - { "type": "uri", "uri":
+ "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl"
+ } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params":
+ { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello,
+ world!"}] }
metadata:
type: object
additionalProperties:
@@ -6313,7 +6377,7 @@ components:
additionalProperties: false
required:
- schema
- - uri
+ - data_reference
title: RegisterDatasetRequest
RegisterModelRequest:
type: object
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index 157431ed2..049b6e8be 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -5,12 +5,12 @@
# the root directory of this source tree.
from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Protocol
+from typing import Any, Dict, List, Literal, Optional, Protocol, Annotated, Union
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.schema_utils import json_schema_type, webmethod, register_schema
class Schema(Enum):
@@ -29,9 +29,42 @@ class Schema(Enum):
# TODO: add more schemas here
+class DatasetType(Enum):  # NOTE(review): not referenced elsewhere in this diff - confirm it is used
+ huggingface = "huggingface"  # same string as the HuggingfaceDataReference.type literal
+ uri = "uri"  # same string as the URIDataReference.type literal
+ rows = "rows"  # same string as the RowsDataReference.type literal
+
+
+@json_schema_type
+class URIDataReference(BaseModel):  # DataReference variant that points at a dataset by URI
+ type: Literal["uri"] = "uri"  # tag value; DataReference discriminates on this field
+ uri: str  # e.g. "https://mywebsite.com/mydata.jsonl" or "lsfs://mydata.jsonl"
+
+
+@json_schema_type
+class HuggingfaceDataReference(BaseModel):  # DataReference variant naming a Hugging Face dataset
+ type: Literal["huggingface"] = "huggingface"  # tag value; DataReference discriminates on this field
+ dataset_path: str  # e.g. "tatsu-lab/alpaca"
+ params: Dict[str, Any]  # required (no default); e.g. {"split": "train"}
+
+
+@json_schema_type
+class RowsDataReference(BaseModel):  # DataReference variant that carries the rows inline
+ type: Literal["rows"] = "rows"  # tag value; DataReference discriminates on this field
+ rows: List[Dict[str, Any]]  # one dict per row, e.g. [{"message": "Hello, world!"}]
+
+
+DataReference = register_schema(  # union type used for the data_reference field/parameter
+ Annotated[
+ Union[URIDataReference, HuggingfaceDataReference, RowsDataReference],
+ Field(discriminator="type"),  # each member's Literal "type" value selects the variant
+ ],
+ name="DataReference",  # the generated spec lists this as components/schemas/DataReference
+)
+
class CommonDatasetFields(BaseModel):
schema: Schema
- uri: str
+ data_reference: DataReference
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Any additional metadata for this dataset",
@@ -66,8 +99,7 @@ class Datasets(Protocol):
async def register_dataset(
self,
schema: Schema,
- uri: str,
- uri_params: Optional[Dict[str, Any]] = None,
+ data_reference: DataReference,
metadata: Optional[Dict[str, Any]] = None,
dataset_id: Optional[str] = None,
) -> Dataset:
@@ -76,13 +108,26 @@ class Datasets(Protocol):
:param schema: The schema format of the dataset. One of
- jsonl_messages: The dataset is a JSONL file with messages in column format
- :param uri: The URI of the dataset. Examples:
- - file://mydata.jsonl
- - s3://mybucket/myfile.jsonl
- - https://mywebsite.com/myfile.jsonl
- - huggingface://tatsu-lab/alpaca
- :param uri_params: The parameters for the URI.
- - E.g. If URL is a huggingface dataset, parameters could be uri_params={"split": "train"}
+ :param data_reference: The data reference of the dataset. Examples:
+ - {
+ "type": "uri",
+ "uri": "https://mywebsite.com/mydata.jsonl"
+ }
+ - {
+ "type": "uri",
+ "uri": "lsfs://mydata.jsonl"
+ }
+ - {
+ "type": "huggingface",
+ "dataset_path": "tatsu-lab/alpaca",
+ "params": {
+ "split": "train"
+ }
+ }
+ - {
+ "type": "rows",
+ "rows": [{"message": "Hello, world!"}]
+ }
:param metadata: The metadata for the dataset.
- E.g. {"description": "My dataset"}
:param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.