diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 4990d845e..1d1b14b4a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2115,7 +2115,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/IterrowsResponse" + "$ref": "#/components/schemas/PaginatedResponse" } } } @@ -2136,7 +2136,7 @@ "tags": [ "DatasetIO" ], - "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.", + "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set", "parameters": [ { "name": "dataset_id", @@ -8073,7 +8073,7 @@ "additionalProperties": false, "title": "ToolInvocationResult" }, - "IterrowsResponse": { + "PaginatedResponse": { "type": "object", "properties": { "data": { @@ -8103,19 +8103,20 @@ ] } }, - "description": "The rows in the current page." + "description": "The list of items for the current page" }, - "next_start_index": { - "type": "integer", - "description": "Index into dataset for the first row in the next page. None if there are no more rows." + "has_more": { + "type": "boolean", + "description": "Whether there are more items available after this set" } }, "additionalProperties": false, "required": [ - "data" + "data", + "has_more" ], - "title": "IterrowsResponse", - "description": "A paginated list of rows from a dataset." + "title": "PaginatedResponse", + "description": "A generic paginated response that follows a simple format." }, "Job": { "type": "object", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index ba3868560..c98e1de89 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1443,7 +1443,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/IterrowsResponse' + $ref: '#/components/schemas/PaginatedResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1457,7 +1457,20 @@ paths: tags: - DatasetIO description: >- - Get a paginated list of rows from a dataset. Uses cursor-based pagination. + Get a paginated list of rows from a dataset. + + Uses offset-based pagination where: + + - start_index: The starting index (0-based). If None, starts from beginning. + + - limit: Number of items to return. If None or -1, returns all items. + + + The response includes: + + - data: List of items for the current page + + - has_more: Whether there are more items available after this set parameters: - name: dataset_id in: path @@ -5542,7 +5555,7 @@ components: - type: object additionalProperties: false title: ToolInvocationResult - IterrowsResponse: + PaginatedResponse: type: object properties: data: @@ -5557,17 +5570,18 @@ components: - type: string - type: array - type: object - description: The rows in the current page. - next_start_index: - type: integer + description: The list of items for the current page + has_more: + type: boolean description: >- - Index into dataset for the first row in the next page. None if there are - no more rows. + Whether there are more items available after this set additionalProperties: false required: - data - title: IterrowsResponse - description: A paginated list of rows from a dataset. + - has_more + title: PaginatedResponse + description: >- + A generic paginated response that follows a simple format. Job: type: object properties: diff --git a/llama_stack/apis/common/responses.py b/llama_stack/apis/common/responses.py new file mode 100644 index 000000000..f9e9a4c31 --- /dev/null +++ b/llama_stack/apis/common/responses.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict, List + +from pydantic import BaseModel + +from llama_stack.schema_utils import json_schema_type + + +@json_schema_type +class PaginatedResponse(BaseModel): + """A generic paginated response that follows a simple format. + + :param data: The list of items for the current page + :param has_more: Whether there are more items available after this set + """ + + data: List[Dict[str, Any]] + has_more: bool diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py index d9d86fe1b..6331882fb 100644 --- a/llama_stack/apis/datasetio/datasetio.py +++ b/llama_stack/apis/datasetio/datasetio.py @@ -6,23 +6,9 @@ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable -from pydantic import BaseModel - +from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.datasets import Dataset -from llama_stack.schema_utils import json_schema_type, webmethod - - -@json_schema_type -class IterrowsResponse(BaseModel): - """ - A paginated list of rows from a dataset. - - :param data: The rows in the current page. - :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows. - """ - - data: List[Dict[str, Any]] - next_start_index: Optional[int] = None +from llama_stack.schema_utils import webmethod class DatasetStore(Protocol): @@ -34,15 +20,22 @@ class DatasetIO(Protocol): # keeping for aligning with inference/safety, but this is not used dataset_store: DatasetStore - # TODO(xiyan): there's a flakiness here where setting route to "/datasets/" here will not result in proper routing @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET") async def iterrows( self, dataset_id: str, start_index: Optional[int] = None, limit: Optional[int] = None, - ) -> IterrowsResponse: - """Get a paginated list of rows from a dataset. Uses cursor-based pagination. + ) -> PaginatedResponse: + """Get a paginated list of rows from a dataset. + + Uses offset-based pagination where: + - start_index: The starting index (0-based). If None, starts from beginning. + - limit: Number of items to return. If None or -1, returns all items. + + The response includes: + - data: List of items for the current page + - has_more: Whether there are more items available after this set :param dataset_id: The ID of the dataset to get the rows from. :param start_index: Index into dataset for the first row to get. Get all rows if None. diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6ff36a65c..53f21f9d8 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -12,7 +12,8 @@ from llama_stack.apis.common.content_types import ( InterleavedContent, InterleavedContentItem, ) -from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse +from llama_stack.apis.common.responses import PaginatedResponse +from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import DatasetPurpose, DataSource from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job from llama_stack.apis.inference import ( @@ -497,7 +498,7 @@ class DatasetIORouter(DatasetIO): dataset_id: str, start_index: Optional[int] = None, limit: Optional[int] = None, - ) -> IterrowsResponse: + ) -> PaginatedResponse: logger.debug( f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}", ) diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index f489739bf..e71107d61 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional import pandas -from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse +from llama_stack.apis.common.responses import PaginatedResponse +from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset from llama_stack.providers.datatypes import DatasetsProtocolPrivate +from llama_stack.providers.utils.datasetio.pagination import paginate_records from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri from llama_stack.providers.utils.kvstore import kvstore_impl @@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): dataset_id: str, start_index: Optional[int] = None, limit: Optional[int] = None, - ) -> IterrowsResponse: + ) -> PaginatedResponse: dataset_def = self.dataset_infos[dataset_id] dataset_impl = PandasDataframeDataset(dataset_def) await dataset_impl.load() - start_index = start_index or 0 - - if limit is None or limit == -1: - end = len(dataset_impl) - else: - end = min(start_index + limit, len(dataset_impl)) - - rows = dataset_impl[start_index:end] - - return IterrowsResponse( - data=rows, - next_start_index=end if end < len(dataset_impl) else None, - ) + records = dataset_impl.df.to_dict("records") + return paginate_records(records, start_index, limit) async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: dataset_def = self.dataset_infos[dataset_id] diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py index fe3195332..7a17e5e42 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py @@ -8,9 +8,11 @@ from urllib.parse import parse_qs, urlparse import datasets as hf_datasets -from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse +from llama_stack.apis.common.responses import PaginatedResponse +from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset from llama_stack.providers.datatypes import DatasetsProtocolPrivate +from llama_stack.providers.utils.datasetio.pagination import paginate_records from llama_stack.providers.utils.kvstore import kvstore_impl from .config import HuggingfaceDatasetIOConfig @@ -70,24 +72,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): dataset_id: str, start_index: Optional[int] = None, limit: Optional[int] = None, - ) -> IterrowsResponse: + ) -> PaginatedResponse: dataset_def = self.dataset_infos[dataset_id] path, params = parse_hf_params(dataset_def) loaded_dataset = hf_datasets.load_dataset(path, **params) - start_index = start_index or 0 - - if limit is None or limit == -1: - end = len(loaded_dataset) - else: - end = min(start_index + limit, len(loaded_dataset)) - - rows = [loaded_dataset[i] for i in range(start_index, end)] - - return IterrowsResponse( - data=rows, - next_start_index=end if end < len(loaded_dataset) else None, - ) + records = [loaded_dataset[i] for i in range(len(loaded_dataset))] + return paginate_records(records, start_index, limit) async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: dataset_def = self.dataset_infos[dataset_id] diff --git a/llama_stack/providers/utils/datasetio/pagination.py b/llama_stack/providers/utils/datasetio/pagination.py new file mode 100644 index 000000000..1b693f8f5 --- /dev/null +++ b/llama_stack/providers/utils/datasetio/pagination.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict, List + +from llama_stack.apis.common.responses import PaginatedResponse + + +def paginate_records( + records: List[Dict[str, Any]], + start_index: int | None = None, + limit: int | None = None, +) -> PaginatedResponse: + """Helper function to handle pagination of records consistently across implementations. + Inspired by stripe's pagination: https://docs.stripe.com/api/pagination + + :param records: List of records to paginate + :param start_index: The starting index (0-based). If None, starts from beginning. + :param limit: Number of items to return. If None or -1, returns all items. + :return: PaginatedResponse with the paginated data + """ + # Handle special case for fetching all rows + if limit is None or limit == -1: + return PaginatedResponse( + data=records, + has_more=False, + ) + + # Use offset-based pagination + start_index = start_index or 0 + end_index = min(start_index + limit, len(records)) + page_data = records[start_index:end_index] + + # Calculate if there are more records + has_more = end_index < len(records) + + return PaginatedResponse( + data=page_data, + has_more=has_more, + ) diff --git a/uv.lock b/uv.lock index afed997f0..7b7b34a4c 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.10" resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -1461,6 +1462,7 @@ requires-dist = [ { name = "types-setuptools", marker = "extra == 'dev'" }, { name = "uvicorn", marker = "extra == 'dev'" }, ] +provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" @@ -2463,8 +2465,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 }, { url = "https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 }, { url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 }, - { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 }, - { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 }, { url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 }, { url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 }, { url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 },