Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
refactor: extract pagination logic into shared helper function (#1770)
# What does this PR do?

Move pagination logic from the LocalFS and HuggingFace implementations into a common helper function to ensure consistent pagination behavior across providers. This reduces code duplication and centralizes pagination logic in one place.

## Test Plan

Run this script:

```python
from llama_stack_client import LlamaStackClient

# Initialize the client
client = LlamaStackClient(base_url="http://localhost:8321")

# Register a dataset
response = client.datasets.register(
    purpose="eval/messages-answer",  # or "eval/question-answer" or "post-training/messages"
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/simpleqa?split=train"},
    dataset_id="my_dataset",  # optional, will be auto-generated if not provided
    metadata={"description": "My evaluation dataset"},  # optional
)

# Verify the dataset was registered by listing all datasets
datasets = client.datasets.list()
print(f"Registered datasets: {[d.identifier for d in datasets]}")

# You can then access the data using the datasetio API
# rows = client.datasets.iterrows(dataset_id="my_dataset", start_index=1, limit=2)
rows = client.datasets.iterrows(dataset_id="my_dataset")
print(f"Data: {rows.data}")
```

And play with `start_index` and `limit`.

[//]: # (## Documentation)

Signed-off-by: Sébastien Han <seb@redhat.com>
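Paging through results with the new response shape might look like the sketch below. It continues the test-plan script above and assumes the client-side `iterrows` response mirrors the server-side `PaginatedResponse` fields (`data` is used in the test plan; `has_more` is an assumption, since this PR only shows the server side):

```python
# Hypothetical offset-based pagination loop over the datasetio API.
start_index = 0
page_size = 2
while True:
    page = client.datasets.iterrows(
        dataset_id="my_dataset", start_index=start_index, limit=page_size
    )
    for row in page.data:
        print(row)
    # `has_more` is assumed to be exposed client-side, matching PaginatedResponse.
    if not page.has_more:
        break
    # Advance by the number of rows actually returned in this page.
    start_index += len(page.data)
```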
parent d495922949, commit 2ffa2b77ed

9 changed files with 130 additions and 73 deletions
LocalFS datasetio provider:

```diff
@@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional
 
 import pandas
 
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl
 
@@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         dataset_id: str,
         start_index: Optional[int] = None,
         limit: Optional[int] = None,
-    ) -> IterrowsResponse:
+    ) -> PaginatedResponse:
         dataset_def = self.dataset_infos[dataset_id]
         dataset_impl = PandasDataframeDataset(dataset_def)
         await dataset_impl.load()
 
-        start_index = start_index or 0
-
-        if limit is None or limit == -1:
-            end = len(dataset_impl)
-        else:
-            end = min(start_index + limit, len(dataset_impl))
-
-        rows = dataset_impl[start_index:end]
-
-        return IterrowsResponse(
-            data=rows,
-            next_start_index=end if end < len(dataset_impl) else None,
-        )
+        records = dataset_impl.df.to_dict("records")
+        return paginate_records(records, start_index, limit)
 
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
         dataset_def = self.dataset_infos[dataset_id]
```
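The LocalFS path now flattens its pandas DataFrame into the list-of-dicts shape that `paginate_records` expects via `DataFrame.to_dict("records")`. A minimal standalone sketch of that conversion (the sample rows are made up for illustration):

```python
import pandas

# Made-up rows standing in for a loaded dataset.
df = pandas.DataFrame(
    [
        {"question": "2 + 2?", "answer": "4"},
        {"question": "Capital of France?", "answer": "Paris"},
    ]
)

# The "records" orientation yields one dict per row, which is exactly
# the shape paginate_records() operates on.
records = df.to_dict("records")
# [{'question': '2 + 2?', 'answer': '4'}, {'question': 'Capital of France?', 'answer': 'Paris'}]
```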
HuggingFace datasetio provider:

```diff
@@ -8,9 +8,11 @@ from urllib.parse import parse_qs, urlparse
 
 import datasets as hf_datasets
 
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import kvstore_impl
 
 from .config import HuggingfaceDatasetIOConfig
@@ -70,24 +72,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         dataset_id: str,
         start_index: Optional[int] = None,
         limit: Optional[int] = None,
-    ) -> IterrowsResponse:
+    ) -> PaginatedResponse:
         dataset_def = self.dataset_infos[dataset_id]
         path, params = parse_hf_params(dataset_def)
         loaded_dataset = hf_datasets.load_dataset(path, **params)
 
-        start_index = start_index or 0
-
-        if limit is None or limit == -1:
-            end = len(loaded_dataset)
-        else:
-            end = min(start_index + limit, len(loaded_dataset))
-
-        rows = [loaded_dataset[i] for i in range(start_index, end)]
-
-        return IterrowsResponse(
-            data=rows,
-            next_start_index=end if end < len(loaded_dataset) else None,
-        )
+        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
+        return paginate_records(records, start_index, limit)
 
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
         dataset_def = self.dataset_infos[dataset_id]
```
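The body of `parse_hf_params` is unchanged by this PR and not shown in the diff. Purely as an illustration, based on the `huggingface://datasets/<repo>?<params>` URIs used in the test plan and the `parse_qs`/`urlparse` imports visible in the hunk header, such a URI could be decomposed roughly like this (hypothetical sketch, not the actual helper):

```python
from urllib.parse import parse_qs, urlparse

# Hypothetical decomposition of a huggingface:// dataset URI.
uri = "huggingface://datasets/llamastack/simpleqa?split=train"
parsed = urlparse(uri)

# The path carries the repo id; the query string carries keyword arguments.
path = parsed.path.lstrip("/")  # "llamastack/simpleqa"
params = {k: v[0] for k, v in parse_qs(parsed.query).items()}  # {"split": "train"}

# Values in this shape would feed hf_datasets.load_dataset(path, **params).
```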
New file: llama_stack/providers/utils/datasetio/pagination.py (+43 lines)

```diff
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List
+
+from llama_stack.apis.common.responses import PaginatedResponse
+
+
+def paginate_records(
+    records: List[Dict[str, Any]],
+    start_index: int | None = None,
+    limit: int | None = None,
+) -> PaginatedResponse:
+    """Helper function to handle pagination of records consistently across implementations.
+    Inspired by stripe's pagination: https://docs.stripe.com/api/pagination
+
+    :param records: List of records to paginate
+    :param start_index: The starting index (0-based). If None, starts from beginning.
+    :param limit: Number of items to return. If None or -1, returns all items.
+    :return: PaginatedResponse with the paginated data
+    """
+    # Handle special case for fetching all rows
+    if limit is None or limit == -1:
+        return PaginatedResponse(
+            data=records,
+            has_more=False,
+        )
+
+    # Use offset-based pagination
+    start_index = start_index or 0
+    end_index = min(start_index + limit, len(records))
+    page_data = records[start_index:end_index]
+
+    # Calculate if there are more records
+    has_more = end_index < len(records)
+
+    return PaginatedResponse(
+        data=page_data,
+        has_more=has_more,
+    )
```
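For reference, a quick sanity-check sketch of how the new helper behaves (the sample records are made up):

```python
from llama_stack.providers.utils.datasetio.pagination import paginate_records

# Made-up records for illustration.
records = [{"id": i} for i in range(5)]

# First page of two: rows remain after index 2, so has_more is True.
page = paginate_records(records, start_index=0, limit=2)
assert page.data == [{"id": 0}, {"id": 1}] and page.has_more

# Last page: fewer rows than `limit` are left, so has_more flips to False.
page = paginate_records(records, start_index=4, limit=2)
assert page.data == [{"id": 4}] and not page.has_more

# limit=None (or -1) returns everything in one response.
page = paginate_records(records)
assert len(page.data) == 5 and not page.has_more
```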