refactor: extract pagination logic into shared helper function (#1770)

# What does this PR do?

Move the pagination logic duplicated in the LocalFS and HuggingFace datasetio
implementations into a common helper function, `paginate_records`, so that
every provider paginates the same way. This removes the duplicated code and
keeps the pagination semantics in one place.
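
For context, here is a sketch of the semantics the shared helper centralizes.
The names `paginate_records` and `PaginatedResponse` come from the diff below;
the dataclass stand-in, its `has_more` field, and the function body are
illustrative assumptions, not the repository's exact code:

```python
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


# Stand-in for llama_stack.apis.common.responses.PaginatedResponse
# (field names here are assumptions for illustration).
@dataclass
class PaginatedResponse:
    data: List[Dict[str, Any]]
    has_more: bool


def paginate_records(
    records: List[Dict[str, Any]],
    start_index: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedResponse:
    # Same semantics the providers previously duplicated: a missing
    # start_index means 0, and a missing or -1 limit means "to the end".
    start = start_index or 0
    if limit is None or limit == -1:
        end = len(records)
    else:
        end = min(start + limit, len(records))
    return PaginatedResponse(data=records[start:end], has_more=end < len(records))
```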


## Test Plan

Run this script:

```python
from llama_stack_client import LlamaStackClient

# Initialize the client
client = LlamaStackClient(base_url="http://localhost:8321")

# Register a dataset
response = client.datasets.register(
    purpose="eval/messages-answer",  # or "eval/question-answer" or "post-training/messages"
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/simpleqa?split=train"},
    dataset_id="my_dataset",  # optional, will be auto-generated if not provided
    metadata={"description": "My evaluation dataset"},  # optional
)

# Verify the dataset was registered by listing all datasets
datasets = client.datasets.list()
print(f"Registered datasets: {[d.identifier for d in datasets]}")

# You can then access the data using the datasetio API
# rows = client.datasets.iterrows(dataset_id="my_dataset", start_index=1, limit=2)
rows = client.datasets.iterrows(dataset_id="my_dataset")
print(f"Data: {rows.data}")
```

Then play with `start_index` and `limit`.
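
For instance, a paged request (same parameters as the commented `iterrows` call in the script above):

```python
# Fetch two rows starting at index 1; `.data` holds the page contents.
page = client.datasets.iterrows(dataset_id="my_dataset", start_index=1, limit=2)
print(f"Page: {page.data}")
```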


Signed-off-by: Sébastien Han <seb@redhat.com>
Representative change to the HuggingFace datasetio provider:

```diff
@@ -8,9 +8,11 @@ from urllib.parse import parse_qs, urlparse
 
 import datasets as hf_datasets
 
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import kvstore_impl
 
 from .config import HuggingfaceDatasetIOConfig
@@ -70,24 +72,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         dataset_id: str,
         start_index: Optional[int] = None,
         limit: Optional[int] = None,
-    ) -> IterrowsResponse:
+    ) -> PaginatedResponse:
         dataset_def = self.dataset_infos[dataset_id]
         path, params = parse_hf_params(dataset_def)
         loaded_dataset = hf_datasets.load_dataset(path, **params)
 
-        start_index = start_index or 0
-
-        if limit is None or limit == -1:
-            end = len(loaded_dataset)
-        else:
-            end = min(start_index + limit, len(loaded_dataset))
-
-        rows = [loaded_dataset[i] for i in range(start_index, end)]
-
-        return IterrowsResponse(
-            data=rows,
-            next_start_index=end if end < len(loaded_dataset) else None,
-        )
+        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
+        return paginate_records(records, start_index, limit)
 
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
         dataset_def = self.dataset_infos[dataset_id]
```
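
Note the design choice visible in the diff: the HuggingFace provider now materializes the full dataset into `records` before delegating to `paginate_records`, so the interpretation of `start_index` and `limit` lives entirely in the shared helper instead of being re-implemented per provider.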