This commit is contained in:
Xi Yan 2025-03-11 18:29:55 -07:00
parent 02aa9a1e85
commit 0e47c65051
3 changed files with 294 additions and 82 deletions

View file

@ -6838,6 +6838,27 @@
], ],
"title": "Benchmark" "title": "Benchmark"
}, },
"DataReference": {
"oneOf": [
{
"$ref": "#/components/schemas/URIDataReference"
},
{
"$ref": "#/components/schemas/HuggingfaceDataReference"
},
{
"$ref": "#/components/schemas/RowsDataReference"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"uri": "#/components/schemas/URIDataReference",
"huggingface": "#/components/schemas/HuggingfaceDataReference",
"rows": "#/components/schemas/RowsDataReference"
}
}
},
"Dataset": { "Dataset": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -6856,10 +6877,15 @@
"default": "dataset" "default": "dataset"
}, },
"schema": { "schema": {
"$ref": "#/components/schemas/Schema" "type": "string",
"enum": [
"jsonl_messages"
],
"title": "Schema",
"description": "Schema of the dataset. Each type has a different column format."
}, },
"uri": { "data_reference": {
"type": "string" "$ref": "#/components/schemas/DataReference"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -6894,18 +6920,118 @@
"provider_id", "provider_id",
"type", "type",
"schema", "schema",
"uri", "data_reference",
"metadata" "metadata"
], ],
"title": "Dataset" "title": "Dataset"
}, },
"Schema": { "HuggingfaceDataReference": {
"type": "string", "type": "object",
"enum": [ "properties": {
"jsonl_messages" "type": {
"type": "string",
"const": "huggingface",
"default": "huggingface"
},
"dataset_path": {
"type": "string"
},
"params": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"type",
"dataset_path",
"params"
], ],
"title": "Schema", "title": "HuggingfaceDataReference"
"description": "Schema of the dataset. Each type has a different column format." },
"RowsDataReference": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "rows",
"default": "rows"
},
"rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
}
},
"additionalProperties": false,
"required": [
"type",
"rows"
],
"title": "RowsDataReference"
},
"URIDataReference": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "uri",
"default": "uri"
},
"uri": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"uri"
],
"title": "URIDataReference"
}, },
"Model": { "Model": {
"type": "object", "type": "object",
@ -9255,38 +9381,15 @@
"type": "object", "type": "object",
"properties": { "properties": {
"schema": { "schema": {
"$ref": "#/components/schemas/Schema", "type": "string",
"enum": [
"jsonl_messages"
],
"description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format" "description": "The schema format of the dataset. One of - jsonl_messages: The dataset is a JSONL file with messages in column format"
}, },
"uri": { "data_reference": {
"type": "string", "$ref": "#/components/schemas/DataReference",
"description": "The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl - https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca" "description": "The data reference of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } - { \"type\": \"rows\", \"rows\": [{\"message\": \"Hello, world!\"}] }"
},
"uri_params": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters could be uri_params={\"split\": \"train\"}"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -9322,7 +9425,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"schema", "schema",
"uri" "data_reference"
], ],
"title": "RegisterDatasetRequest" "title": "RegisterDatasetRequest"
}, },

View file

@ -4731,6 +4731,17 @@ components:
- scoring_functions - scoring_functions
- metadata - metadata
title: Benchmark title: Benchmark
# Union of the three data-reference shapes; the `type` property selects the variant.
DataReference:
  oneOf:
    - $ref: "#/components/schemas/URIDataReference"
    - $ref: "#/components/schemas/HuggingfaceDataReference"
    - $ref: "#/components/schemas/RowsDataReference"
  discriminator:
    propertyName: type
    mapping:
      uri: "#/components/schemas/URIDataReference"
      huggingface: "#/components/schemas/HuggingfaceDataReference"
      rows: "#/components/schemas/RowsDataReference"
Dataset: Dataset:
type: object type: object
properties: properties:
@ -4745,9 +4756,14 @@ components:
const: dataset const: dataset
default: dataset default: dataset
schema: schema:
$ref: '#/components/schemas/Schema'
uri:
type: string type: string
enum:
- jsonl_messages
title: Schema
description: >-
Schema of the dataset. Each type has a different column format.
data_reference:
$ref: '#/components/schemas/DataReference'
metadata: metadata:
type: object type: object
additionalProperties: additionalProperties:
@ -4765,16 +4781,72 @@ components:
- provider_id - provider_id
- type - type
- schema - schema
- uri - data_reference
- metadata - metadata
title: Dataset title: Dataset
Schema: HuggingfaceDataReference:
type: string type: object
enum: properties:
- jsonl_messages type:
title: Schema type: string
description: >- const: huggingface
Schema of the dataset. Each type has a different column format. default: huggingface
dataset_path:
type: string
params:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- type
- dataset_path
- params
title: HuggingfaceDataReference
# Inline data reference: `type` is fixed to "rows" and `rows` is an array of
# objects whose values may be any JSON type.
RowsDataReference:
  type: object
  properties:
    type:
      type: string
      const: rows
      default: rows
    rows:
      type: array
      items:
        type: object
        additionalProperties:
          oneOf:
            - type: "null"
            - type: boolean
            - type: number
            - type: string
            - type: array
            - type: object
  additionalProperties: false
  required: [type, rows]
  title: RowsDataReference
# URI data reference: `type` is fixed to "uri" and `uri` is a required string.
URIDataReference:
  type: object
  properties:
    type:
      type: string
      const: uri
      default: uri
    uri:
      type: string
  additionalProperties: false
  required: [type, uri]
  title: URIDataReference
Model: Model:
type: object type: object
properties: properties:
@ -6272,28 +6344,20 @@ components:
type: object type: object
properties: properties:
schema: schema:
$ref: '#/components/schemas/Schema' type: string
enum:
- jsonl_messages
description: >- description: >-
The schema format of the dataset. One of - jsonl_messages: The dataset The schema format of the dataset. One of - jsonl_messages: The dataset
is a JSONL file with messages in column format is a JSONL file with messages in column format
uri: data_reference:
type: string $ref: '#/components/schemas/DataReference'
description: >- description: >-
The URI of the dataset. Examples: - file://mydata.jsonl - s3://mybucket/myfile.jsonl The data reference of the dataset. Examples: - { "type": "uri", "uri":
- https://mywebsite.com/myfile.jsonl - huggingface://tatsu-lab/alpaca "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri": "lsfs://mydata.jsonl"
uri_params: } - { "type": "huggingface", "dataset_path": "tatsu-lab/alpaca", "params":
type: object { "split": "train" } } - { "type": "rows", "rows": [{"message": "Hello,
additionalProperties: world!"}] }
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
The parameters for the URI. - E.g. If URL is a huggingface dataset, parameters
could be uri_params={"split": "train"}
metadata: metadata:
type: object type: object
additionalProperties: additionalProperties:
@ -6313,7 +6377,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- schema - schema
- uri - data_reference
title: RegisterDatasetRequest title: RegisterDatasetRequest
RegisterModelRequest: RegisterModelRequest:
type: object type: object

View file

@ -5,12 +5,12 @@
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Protocol from typing import Any, Dict, List, Literal, Optional, Protocol, Annotated, Union
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod, register_schema
class Schema(Enum): class Schema(Enum):
@ -29,9 +29,42 @@ class Schema(Enum):
# TODO: add more schemas here # TODO: add more schemas here
# Enumerates the three kinds of dataset source; each member's value equals its name.
# NOTE(review): the values match the `type` literals on the *DataReference models
# in this file, but nothing in the visible code references this enum -- confirm it
# is used elsewhere before relying on it.
class DatasetType(Enum):
    huggingface = "huggingface"
    uri = "uri"
    rows = "rows"
# Data reference that points at a dataset by URI.
# NOTE(review): `json_schema_type` comes from llama_stack.schema_utils; presumably
# it registers the model for schema generation -- confirm against that module.
@json_schema_type
class URIDataReference(BaseModel):
    # Discriminator tag; fixed to "uri" and defaulted so callers may omit it.
    type: Literal["uri"] = "uri"
    # Required location string, e.g. "https://mywebsite.com/mydata.jsonl".
    uri: str
# Data reference that identifies a Hugging Face dataset by path plus parameters.
# NOTE(review): `json_schema_type` comes from llama_stack.schema_utils; presumably
# it registers the model for schema generation -- confirm against that module.
@json_schema_type
class HuggingfaceDataReference(BaseModel):
    # Discriminator tag; fixed to "huggingface" and defaulted so callers may omit it.
    type: Literal["huggingface"] = "huggingface"
    # Required dataset path, e.g. "tatsu-lab/alpaca".
    dataset_path: str
    # Required free-form parameters, e.g. {"split": "train"}. There is no default,
    # so callers must pass a dict even when it is empty.
    params: Dict[str, Any]
# Data reference that carries the dataset rows inline instead of pointing elsewhere.
# NOTE(review): `json_schema_type` comes from llama_stack.schema_utils; presumably
# it registers the model for schema generation -- confirm against that module.
@json_schema_type
class RowsDataReference(BaseModel):
    # Discriminator tag; fixed to "rows" and defaulted so callers may omit it.
    type: Literal["rows"] = "rows"
    # Required list of row dicts, e.g. [{"message": "Hello, world!"}].
    rows: List[Dict[str, Any]]
# Tagged union over the three reference models; the `type` field is the
# discriminator that selects which model applies.
# NOTE(review): `register_schema` comes from llama_stack.schema_utils; presumably
# it publishes the union under the name "DataReference" in the generated spec and
# returns the annotated type -- confirm against that module.
DataReference = register_schema(
    Annotated[
        Union[URIDataReference, HuggingfaceDataReference, RowsDataReference],
        Field(discriminator="type"),
    ],
    name="DataReference",
)
class CommonDatasetFields(BaseModel): class CommonDatasetFields(BaseModel):
schema: Schema schema: Schema
uri: str data_reference: DataReference
metadata: Dict[str, Any] = Field( metadata: Dict[str, Any] = Field(
default_factory=dict, default_factory=dict,
description="Any additional metadata for this dataset", description="Any additional metadata for this dataset",
@ -66,8 +99,7 @@ class Datasets(Protocol):
async def register_dataset( async def register_dataset(
self, self,
schema: Schema, schema: Schema,
uri: str, data_reference: DataReference,
uri_params: Optional[Dict[str, Any]] = None,
metadata: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None,
dataset_id: Optional[str] = None, dataset_id: Optional[str] = None,
) -> Dataset: ) -> Dataset:
@ -76,13 +108,26 @@ class Datasets(Protocol):
:param schema: The schema format of the dataset. One of :param schema: The schema format of the dataset. One of
- jsonl_messages: The dataset is a JSONL file with messages in column format - jsonl_messages: The dataset is a JSONL file with messages in column format
:param uri: The URI of the dataset. Examples: :param data_reference: The data reference of the dataset. Examples:
- file://mydata.jsonl - {
- s3://mybucket/myfile.jsonl "type": "uri",
- https://mywebsite.com/myfile.jsonl "uri": "https://mywebsite.com/mydata.jsonl"
- huggingface://tatsu-lab/alpaca }
:param uri_params: The parameters for the URI. - {
- E.g. If URL is a huggingface dataset, parameters could be uri_params={"split": "train"} "type": "uri",
"uri": "lsfs://mydata.jsonl"
}
- {
"type": "huggingface",
"dataset_path": "tatsu-lab/alpaca",
"params": {
"split": "train"
}
}
- {
"type": "rows",
"rows": [{"message": "Hello, world!"}]
}
:param metadata: The metadata for the dataset. :param metadata: The metadata for the dataset.
- E.g. {"description": "My dataset"} - E.g. {"description": "My dataset"}
:param dataset_id: The ID of the dataset. If not provided, a random ID will be generated. :param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.