diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c4430c8d0..c8c57f490 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -40,75 +40,7 @@
}
],
"paths": {
- "/v1/datasetio/rows": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/PaginatedRowsResult"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "DatasetIO"
- ],
- "description": "Get a paginated list of rows from a dataset.",
- "parameters": [
- {
- "name": "dataset_id",
- "in": "query",
- "description": "The ID of the dataset to get the rows from.",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "rows_in_page",
- "in": "query",
- "description": "The number of rows to get per page.",
- "required": true,
- "schema": {
- "type": "integer"
- }
- },
- {
- "name": "page_token",
- "in": "query",
- "description": "The token to get the next page of rows.",
- "required": false,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "filter_condition",
- "in": "query",
- "description": "(Optional) A condition to filter the rows by.",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- },
+ "/v1/datasets/{dataset_id}/rows": {
"post": {
"responses": {
"200": {
@@ -131,7 +63,16 @@
"DatasetIO"
],
"description": "",
- "parameters": [],
+ "parameters": [
+ {
+ "name": "dataset_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
"requestBody": {
"content": {
"application/json": {
@@ -2272,6 +2213,76 @@
}
}
},
+ "/v1/datasets/{dataset_id}/iterrows": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/PaginatedRowsResult"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "DatasetIO"
+ ],
+ "description": "Get a paginated list of rows from a dataset.",
+ "parameters": [
+ {
+ "name": "dataset_id",
+ "in": "path",
+ "description": "The ID of the dataset to get the rows from.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "rows_in_page",
+ "in": "query",
+ "description": "The number of rows to get per page.",
+ "required": true,
+ "schema": {
+ "type": "integer"
+ }
+ },
+ {
+ "name": "page_token",
+ "in": "query",
+ "description": "The token to get the next page of rows.",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "filter_condition",
+ "in": "query",
+ "description": "(Optional) A condition to filter the rows by.",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
@@ -3861,9 +3872,6 @@
"AppendRowsRequest": {
"type": "object",
"properties": {
- "dataset_id": {
- "type": "string"
- },
"rows": {
"type": "array",
"items": {
@@ -3895,7 +3903,6 @@
},
"additionalProperties": false,
"required": [
- "dataset_id",
"rows"
],
"title": "AppendRowsRequest"
@@ -6755,9 +6762,6 @@
{
"$ref": "#/components/schemas/URIDataSource"
},
- {
- "$ref": "#/components/schemas/HuggingfaceDataSource"
- },
{
"$ref": "#/components/schemas/RowsDataSource"
}
@@ -6766,7 +6770,6 @@
"propertyName": "type",
"mapping": {
"uri": "#/components/schemas/URIDataSource",
- "huggingface": "#/components/schemas/HuggingfaceDataSource",
"rows": "#/components/schemas/RowsDataSource"
}
}
@@ -6842,65 +6845,6 @@
],
"title": "Dataset"
},
- "HuggingfaceDataSource": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "huggingface",
- "default": "huggingface",
- "description": "The type of the data source."
- },
- "huggingface": {
- "type": "object",
- "properties": {
- "path": {
- "type": "string",
- "description": "The path to the dataset in Huggingface. E.g. - \"llamastack/simpleqa\""
- },
- "params": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The parameters for the dataset."
- }
- },
- "additionalProperties": false,
- "required": [
- "path",
- "params"
- ],
- "description": "The fields for a Huggingface dataset."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "huggingface"
- ],
- "title": "HuggingfaceDataSource",
- "description": "A dataset stored in Huggingface."
- },
"RowsDataSource": {
"type": "object",
"properties": {
@@ -7034,56 +6978,7 @@
],
"title": "ModelType"
},
- "PaginatedRowsResult": {
- "type": "object",
- "properties": {
- "rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The rows in the current page."
- },
- "total_count": {
- "type": "integer",
- "description": "The total number of rows in the dataset."
- },
- "next_page_token": {
- "type": "string",
- "description": "The token to get the next page of rows."
- }
- },
- "additionalProperties": false,
- "required": [
- "rows",
- "total_count"
- ],
- "title": "PaginatedRowsResult",
- "description": "A paginated list of rows from a dataset."
- },
- "AnswerCorrectnessScoringFn": {
+ "AgentTurnInputType": {
"type": "object",
"properties": {
"type": {
@@ -8537,6 +8432,55 @@
],
"title": "ToolInvocationResult"
},
+ "PaginatedRowsResult": {
+ "type": "object",
+ "properties": {
+ "rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The rows in the current page."
+ },
+ "total_count": {
+ "type": "integer",
+ "description": "The total number of rows in the dataset."
+ },
+ "next_page_token": {
+ "type": "string",
+ "description": "The token to get the next page of rows."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "rows",
+ "total_count"
+ ],
+ "title": "PaginatedRowsResult",
+ "description": "A paginated list of rows from a dataset."
+ },
"ListAgentSessionsResponse": {
"type": "object",
"properties": {
@@ -9884,7 +9828,7 @@
},
"source": {
"$ref": "#/components/schemas/DataSource",
- "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"huggingface\": { \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
+ "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"} ] } ] }"
},
"metadata": {
"type": "object",
@@ -9914,7 +9858,7 @@
},
"dataset_id": {
"type": "string",
- "description": "The ID of the dataset. If not provided, a random ID will be generated."
+ "description": "The ID of the dataset. If not provided, an ID will be generated."
}
},
"additionalProperties": false,
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index de24e41c6..24b47b1c0 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10,56 +10,7 @@ info:
servers:
- url: http://any-hosted-llama-stack.com
paths:
- /v1/datasetio/rows:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/PaginatedRowsResult'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - DatasetIO
- description: >-
- Get a paginated list of rows from a dataset.
- parameters:
- - name: dataset_id
- in: query
- description: >-
- The ID of the dataset to get the rows from.
- required: true
- schema:
- type: string
- - name: rows_in_page
- in: query
- description: The number of rows to get per page.
- required: true
- schema:
- type: integer
- - name: page_token
- in: query
- description: The token to get the next page of rows.
- required: false
- schema:
- type: string
- - name: filter_condition
- in: query
- description: >-
- (Optional) A condition to filter the rows by.
- required: false
- schema:
- type: string
+ /v1/datasets/{dataset_id}/rows:
post:
responses:
'200':
@@ -77,7 +28,12 @@ paths:
tags:
- DatasetIO
description: ''
- parameters: []
+ parameters:
+ - name: dataset_id
+ in: path
+ required: true
+ schema:
+ type: string
requestBody:
content:
application/json:
@@ -1529,6 +1485,56 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
+ /v1/datasets/{dataset_id}/iterrows:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/PaginatedRowsResult'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - DatasetIO
+ description: >-
+ Get a paginated list of rows from a dataset.
+ parameters:
+ - name: dataset_id
+ in: path
+ description: >-
+ The ID of the dataset to get the rows from.
+ required: true
+ schema:
+ type: string
+ - name: rows_in_page
+ in: query
+ description: The number of rows to get per page.
+ required: true
+ schema:
+ type: integer
+ - name: page_token
+ in: query
+ description: The token to get the next page of rows.
+ required: false
+ schema:
+ type: string
+ - name: filter_condition
+ in: query
+ description: >-
+ (Optional) A condition to filter the rows by.
+ required: false
+ schema:
+ type: string
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
@@ -2636,8 +2642,6 @@ components:
AppendRowsRequest:
type: object
properties:
- dataset_id:
- type: string
rows:
type: array
items:
@@ -2652,7 +2656,6 @@ components:
- type: object
additionalProperties: false
required:
- - dataset_id
- rows
title: AppendRowsRequest
CompletionMessage:
@@ -4679,13 +4682,11 @@ components:
DataSource:
oneOf:
- $ref: '#/components/schemas/URIDataSource'
- - $ref: '#/components/schemas/HuggingfaceDataSource'
- $ref: '#/components/schemas/RowsDataSource'
discriminator:
propertyName: type
mapping:
uri: '#/components/schemas/URIDataSource'
- huggingface: '#/components/schemas/HuggingfaceDataSource'
rows: '#/components/schemas/RowsDataSource'
Dataset:
type: object
@@ -4734,43 +4735,6 @@ components:
- source
- metadata
title: Dataset
- HuggingfaceDataSource:
- type: object
- properties:
- type:
- type: string
- const: huggingface
- default: huggingface
- description: The type of the data source.
- huggingface:
- type: object
- properties:
- path:
- type: string
- description: >-
- The path to the dataset in Huggingface. E.g. - "llamastack/simpleqa"
- params:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The parameters for the dataset.
- additionalProperties: false
- required:
- - path
- - params
- description: The fields for a Huggingface dataset.
- additionalProperties: false
- required:
- - type
- - huggingface
- title: HuggingfaceDataSource
- description: A dataset stored in Huggingface.
RowsDataSource:
type: object
properties:
@@ -4860,35 +4824,7 @@ components:
- llm
- embedding
title: ModelType
- PaginatedRowsResult:
- type: object
- properties:
- rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The rows in the current page.
- total_count:
- type: integer
- description: The total number of rows in the dataset.
- next_page_token:
- type: string
- description: The token to get the next page of rows.
- additionalProperties: false
- required:
- - rows
- - total_count
- title: PaginatedRowsResult
- description: A paginated list of rows from a dataset.
- AnswerCorrectnessScoringFn:
+ AgentTurnInputType:
type: object
properties:
type:
@@ -5883,6 +5819,34 @@ components:
required:
- content
title: ToolInvocationResult
+ PaginatedRowsResult:
+ type: object
+ properties:
+ rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: The rows in the current page.
+ total_count:
+ type: integer
+ description: The total number of rows in the dataset.
+ next_page_token:
+ type: string
+ description: The token to get the next page of rows.
+ additionalProperties: false
+ required:
+ - rows
+ - total_count
+ title: PaginatedRowsResult
+ description: A paginated list of rows from a dataset.
ListAgentSessionsResponse:
type: object
properties:
@@ -6804,11 +6768,11 @@ components:
$ref: '#/components/schemas/DataSource'
description: >-
The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
- } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface",
- "huggingface": { "dataset_path": "tatsu-lab/alpaca", "params": { "split":
- "train" } } } - { "type": "rows", "rows": [ { "messages": [ {"role": "user",
- "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
- world!"}, ] } ] }
+ } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "uri",
+ "uri": "data:csv;base64,{base64_content}" } - { "type": "uri", "uri":
+ "huggingface://llamastack/simpleqa?split=train" } - { "type": "rows",
+ "rows": [ { "messages": [ {"role": "user", "content": "Hello, world!"},
+ {"role": "assistant", "content": "Hello, world!"} ] } ] }
metadata:
type: object
additionalProperties:
@@ -6824,7 +6788,7 @@ components:
dataset_id:
type: string
description: >-
- The ID of the dataset. If not provided, a random ID will be generated.
+ The ID of the dataset. If not provided, an ID will be generated.
additionalProperties: false
required:
- purpose
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
index 6a04a6329..2c6c8e981 100644
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -37,8 +37,8 @@ class DatasetIO(Protocol):
# keeping for aligning with inference/safety, but this is not used
dataset_store: DatasetStore
- @webmethod(route="/datasetio/rows", method="GET")
- async def get_rows_paginated(
+ @webmethod(route="/datasets/{dataset_id}/iterrows", method="GET")
+ async def iterrows(
self,
dataset_id: str,
rows_in_page: int,
@@ -54,5 +54,7 @@ class DatasetIO(Protocol):
"""
...
- @webmethod(route="/datasetio/rows", method="POST")
- async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
+ @webmethod(route="/datasets/{dataset_id}/rows", method="POST")
+ async def append_rows(
+ self, dataset_id: str, rows: List[Dict[str, Any]]
+ ) -> None: ...
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index e0ac542fa..8606d8bb2 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -60,6 +60,7 @@ class DatasetPurpose(Enum):
"answer": "Paris"
}
"""
+
post_training_messages = "post-training/messages"
eval_question_answer = "eval/question-answer"
@@ -75,11 +76,10 @@ class DatasetPurpose(Enum):
class DatasetType(Enum):
"""
Type of the dataset source.
- :cvar huggingface: The dataset is stored in Huggingface.
- :cvar uri: The dataset can be obtained from a URI.
- :cvar rows: The dataset is stored in rows.
+ :cvar uri: The dataset can be obtained from a URI.
+ :cvar rows: The dataset is stored in rows.
"""
- huggingface = "huggingface"
+
uri = "uri"
rows = "rows"
@@ -92,30 +92,11 @@ class URIDataSource(BaseModel):
- "lsfs://mydata.jsonl"
- "data:csv;base64,{base64_content}"
"""
+
type: Literal["uri"] = "uri"
uri: str
-class HuggingfaceDataSourceFields(BaseModel):
- """The fields for a Huggingface dataset.
- :param path: The path to the dataset in Huggingface. E.g.
- - "llamastack/simpleqa"
- :param params: The parameters for the dataset.
- """
- path: str
- params: Dict[str, Any]
-
-
-@json_schema_type
-class HuggingfaceDataSource(BaseModel):
- """A dataset stored in Huggingface.
- :param type: The type of the data source.
- :param huggingface: The fields for a Huggingface dataset.
- """
- type: Literal["huggingface"] = "huggingface"
- huggingface: HuggingfaceDataSourceFields
-
-
@json_schema_type
class RowsDataSource(BaseModel):
"""A dataset stored in rows.
@@ -124,13 +105,14 @@ class RowsDataSource(BaseModel):
{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
]
"""
+
type: Literal["rows"] = "rows"
rows: List[Dict[str, Any]]
DataSource = register_schema(
Annotated[
- Union[URIDataSource, HuggingfaceDataSource, RowsDataSource],
+ Union[URIDataSource, RowsDataSource],
Field(discriminator="type"),
],
name="DataSource",
@@ -141,6 +123,7 @@ class CommonDatasetFields(BaseModel):
"""
Common fields for a dataset.
"""
+
purpose: DatasetPurpose
source: DataSource
metadata: Dict[str, Any] = Field(
@@ -237,13 +220,12 @@ class Datasets(Protocol):
"uri": "lsfs://mydata.jsonl"
}
- {
- "type": "huggingface",
- "huggingface": {
- "dataset_path": "tatsu-lab/alpaca",
- "params": {
- "split": "train"
- }
- }
+ "type": "uri",
+ "uri": "data:csv;base64,{base64_content}"
+ }
+ - {
+ "type": "uri",
+ "uri": "huggingface://llamastack/simpleqa?split=train"
}
- {
"type": "rows",
@@ -258,7 +240,7 @@ class Datasets(Protocol):
}
:param metadata: The metadata for the dataset.
- E.g. {"description": "My dataset"}
- :param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.
+ :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
"""
...