diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 4990d845e..1d1b14b4a 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2115,7 +2115,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/IterrowsResponse"
+ "$ref": "#/components/schemas/PaginatedResponse"
}
}
}
@@ -2136,7 +2136,7 @@
"tags": [
"DatasetIO"
],
- "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.",
+ "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set",
"parameters": [
{
"name": "dataset_id",
@@ -8073,7 +8073,7 @@
"additionalProperties": false,
"title": "ToolInvocationResult"
},
- "IterrowsResponse": {
+ "PaginatedResponse": {
"type": "object",
"properties": {
"data": {
@@ -8103,19 +8103,20 @@
]
}
},
- "description": "The rows in the current page."
+ "description": "The list of items for the current page"
},
- "next_start_index": {
- "type": "integer",
- "description": "Index into dataset for the first row in the next page. None if there are no more rows."
+ "has_more": {
+ "type": "boolean",
+ "description": "Whether there are more items available after this set"
}
},
"additionalProperties": false,
"required": [
- "data"
+ "data",
+ "has_more"
],
- "title": "IterrowsResponse",
- "description": "A paginated list of rows from a dataset."
+ "title": "PaginatedResponse",
+ "description": "A generic paginated response that follows a simple format."
},
"Job": {
"type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index ba3868560..c98e1de89 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1443,7 +1443,7 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/IterrowsResponse'
+ $ref: '#/components/schemas/PaginatedResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -1457,7 +1457,20 @@ paths:
tags:
- DatasetIO
description: >-
- Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+ Get a paginated list of rows from a dataset.
+
+ Uses offset-based pagination where:
+
+ - start_index: The starting index (0-based). If None, starts from beginning.
+
+ - limit: Number of items to return. If None or -1, returns all items.
+
+
+ The response includes:
+
+ - data: List of items for the current page
+
+ - has_more: Whether there are more items available after this set
parameters:
- name: dataset_id
in: path
@@ -5542,7 +5555,7 @@ components:
- type: object
additionalProperties: false
title: ToolInvocationResult
- IterrowsResponse:
+ PaginatedResponse:
type: object
properties:
data:
@@ -5557,17 +5570,18 @@ components:
- type: string
- type: array
- type: object
- description: The rows in the current page.
- next_start_index:
- type: integer
+ description: The list of items for the current page
+ has_more:
+ type: boolean
description: >-
- Index into dataset for the first row in the next page. None if there are
- no more rows.
+ Whether there are more items available after this set
additionalProperties: false
required:
- data
- title: IterrowsResponse
- description: A paginated list of rows from a dataset.
+ - has_more
+ title: PaginatedResponse
+ description: >-
+ A generic paginated response that follows a simple format.
Job:
type: object
properties:
diff --git a/llama_stack/apis/common/responses.py b/llama_stack/apis/common/responses.py
new file mode 100644
index 000000000..f9e9a4c31
--- /dev/null
+++ b/llama_stack/apis/common/responses.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List
+
+from pydantic import BaseModel
+
+from llama_stack.schema_utils import json_schema_type
+
+
+@json_schema_type  # NOTE(review): docstring text below appears verbatim as descriptions in docs/_static/llama-stack-spec.{html,yaml} - keep in sync
+class PaginatedResponse(BaseModel):
+ """A generic paginated response that follows a simple format.
+
+ :param data: The list of items for the current page
+ :param has_more: Whether there are more items available after this set
+ """
+
+ data: List[Dict[str, Any]]  # items of the current page, one dict per item
+ has_more: bool  # required, no default: every response must say whether more items follow
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
index d9d86fe1b..6331882fb 100644
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -6,23 +6,9 @@
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-from pydantic import BaseModel
-
+from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasets import Dataset
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class IterrowsResponse(BaseModel):
- """
- A paginated list of rows from a dataset.
-
- :param data: The rows in the current page.
- :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
- """
-
- data: List[Dict[str, Any]]
- next_start_index: Optional[int] = None
+from llama_stack.schema_utils import webmethod
class DatasetStore(Protocol):
@@ -34,15 +20,22 @@ class DatasetIO(Protocol):
# keeping for aligning with inference/safety, but this is not used
dataset_store: DatasetStore
- # TODO(xiyan): there's a flakiness here where setting route to "/datasets/" here will not result in proper routing
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
async def iterrows(
self,
dataset_id: str,
start_index: Optional[int] = None,
limit: Optional[int] = None,
- ) -> IterrowsResponse:
- """Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+ ) -> PaginatedResponse:
+ """Get a paginated list of rows from a dataset.
+
+ Uses offset-based pagination where:
+ - start_index: The starting index (0-based). If None, starts from beginning.
+ - limit: Number of items to return. If None or -1, returns all items.
+
+ The response includes:
+ - data: List of items for the current page
+ - has_more: Whether there are more items available after this set
:param dataset_id: The ID of the dataset to get the rows from.
:param start_index: Index into dataset for the first row to get. Get all rows if None.
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 6ff36a65c..53f21f9d8 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -12,7 +12,8 @@ from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import DatasetPurpose, DataSource
from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
from llama_stack.apis.inference import (
@@ -497,7 +498,7 @@ class DatasetIORouter(DatasetIO):
dataset_id: str,
start_index: Optional[int] = None,
limit: Optional[int] = None,
- ) -> IterrowsResponse:
+ ) -> PaginatedResponse:
logger.debug(
f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
)
diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
index f489739bf..e71107d61 100644
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional
import pandas
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
from llama_stack.providers.utils.kvstore import kvstore_impl
@@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
dataset_id: str,
start_index: Optional[int] = None,
limit: Optional[int] = None,
- ) -> IterrowsResponse:
+ ) -> PaginatedResponse:
dataset_def = self.dataset_infos[dataset_id]
dataset_impl = PandasDataframeDataset(dataset_def)
await dataset_impl.load()
- start_index = start_index or 0
-
- if limit is None or limit == -1:
- end = len(dataset_impl)
- else:
- end = min(start_index + limit, len(dataset_impl))
-
- rows = dataset_impl[start_index:end]
-
- return IterrowsResponse(
- data=rows,
- next_start_index=end if end < len(dataset_impl) else None,
- )
+ records = dataset_impl.df.to_dict("records")
+ return paginate_records(records, start_index, limit)
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
dataset_def = self.dataset_infos[dataset_id]
diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
index fe3195332..7a17e5e42 100644
--- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@@ -8,9 +8,11 @@ from urllib.parse import parse_qs, urlparse
import datasets as hf_datasets
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
from llama_stack.providers.utils.kvstore import kvstore_impl
from .config import HuggingfaceDatasetIOConfig
@@ -70,24 +72,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
dataset_id: str,
start_index: Optional[int] = None,
limit: Optional[int] = None,
- ) -> IterrowsResponse:
+ ) -> PaginatedResponse:
dataset_def = self.dataset_infos[dataset_id]
path, params = parse_hf_params(dataset_def)
loaded_dataset = hf_datasets.load_dataset(path, **params)
- start_index = start_index or 0
-
- if limit is None or limit == -1:
- end = len(loaded_dataset)
- else:
- end = min(start_index + limit, len(loaded_dataset))
-
- rows = [loaded_dataset[i] for i in range(start_index, end)]
-
- return IterrowsResponse(
- data=rows,
- next_start_index=end if end < len(loaded_dataset) else None,
- )
+ records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
+ return paginate_records(records, start_index, limit)
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
dataset_def = self.dataset_infos[dataset_id]
diff --git a/llama_stack/providers/utils/datasetio/pagination.py b/llama_stack/providers/utils/datasetio/pagination.py
new file mode 100644
index 000000000..1b693f8f5
--- /dev/null
+++ b/llama_stack/providers/utils/datasetio/pagination.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List
+
+from llama_stack.apis.common.responses import PaginatedResponse
+
+
+def paginate_records(
+    records: List[Dict[str, Any]],
+    start_index: int | None = None,
+    limit: int | None = None,
+) -> PaginatedResponse:
+    """Helper function to handle pagination of records consistently across implementations.
+    Inspired by stripe's pagination: https://docs.stripe.com/api/pagination
+
+    :param records: List of records to paginate
+    :param start_index: The starting index (0-based). If None, starts from beginning.
+        Negative values are treated as 0; values past the end yield an empty page.
+    :param limit: Number of items to return. If None or -1, returns all items
+        from start_index onwards. Other negative values yield an empty page.
+    :return: PaginatedResponse with the page in ``data`` and ``has_more`` set when
+        records remain after the returned page
+    """
+    total = len(records)
+
+    # Clamp the offset into [0, total]: a raw negative index would make the
+    # slice below wrap around to the tail of the list.
+    start = min(max(start_index or 0, 0), total)
+
+    # None / -1 mean "no limit", but the requested offset still applies.
+    if limit is None or limit == -1:
+        end = total
+    else:
+        end = min(start + max(limit, 0), total)
+
+    return PaginatedResponse(
+        data=records[start:end],
+        has_more=end < total,
+    )
diff --git a/uv.lock b/uv.lock
index afed997f0..7b7b34a4c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
version = 1
+revision = 1
requires-python = ">=3.10"
resolution-markers = [
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1461,6 +1462,7 @@ requires-dist = [
{ name = "types-setuptools", marker = "extra == 'dev'" },
{ name = "uvicorn", marker = "extra == 'dev'" },
]
+provides-extras = ["dev", "unit", "test", "docs", "codegen"]
[[package]]
name = "llama-stack-client"
@@ -2463,8 +2465,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 },
{ url = "https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 },
{ url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 },
- { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 },
- { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 },
{ url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 },
{ url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 },
{ url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 },