next_index -> next_start_index

Xi Yan 2025-03-17 14:14:05 -07:00
parent d9264a0925
commit 6a8bd19ba2
5 changed files with 22 additions and 12 deletions

@@ -8160,7 +8160,7 @@
           },
           "description": "The rows in the current page."
         },
-        "next_index": {
+        "next_start_index": {
          "type": "integer",
          "description": "Index into dataset for the first row in the next page. None if there are no more rows."
        }

@@ -5557,7 +5557,7 @@ components:
             - type: array
             - type: object
           description: The rows in the current page.
-        next_index:
+        next_start_index:
           type: integer
           description: >-
             Index into dataset for the first row in the next page. None if there are
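For orientation, here is what one page of the renamed schema looks like to a client: a minimal sketch of an IterrowsResponse payload, with made-up rows and an illustrative index value.

# Hypothetical page payload matching the spec change above; the rows and
# the index value are illustrative, not taken from a real dataset.
page = {
    "data": [
        {"input": "What is 2 + 2?", "expected": "4"},
        {"input": "Capital of France?", "expected": "Paris"},
    ],
    # First row of the next page; None once the last page has been returned.
    "next_start_index": 2,
}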

@@ -18,11 +18,11 @@ class IterrowsResponse(BaseModel):
     A paginated list of rows from a dataset.
 
     :param data: The rows in the current page.
-    :param next_index: Index into dataset for the first row in the next page. None if there are no more rows.
+    :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
     """
 
     data: List[Dict[str, Any]]
-    next_index: Optional[int] = None
+    next_start_index: Optional[int] = None
 
 
 class DatasetStore(Protocol):
@@ -46,9 +46,11 @@ class DatasetIO(Protocol):
         :param dataset_id: The ID of the dataset to get the rows from.
         :param start_index: Index into dataset for the first row to get. Get all rows if None.
-        :param limit: The number of rows to get per page.
+        :param limit: The number of rows to get.
         """
         ...
 
     @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
+    async def append_rows(
+        self, dataset_id: str, rows: List[Dict[str, Any]]
+    ) -> None: ...
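To make the renamed contract concrete, a sketch of a caller draining a dataset page by page. It assumes the protocol's row-fetching method is named `iterrows` and takes the `dataset_id`, `start_index`, and `limit` parameters documented above; the page size of 100 is arbitrary.

# Sketch only: assumes DatasetIO exposes an async `iterrows` method returning
# an IterrowsResponse, per the docstring in the hunk above.
from typing import Any, Dict, List, Optional


async def fetch_all_rows(datasetio: "DatasetIO", dataset_id: str) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    start: Optional[int] = 0
    while start is not None:
        page = await datasetio.iterrows(dataset_id, start_index=start, limit=100)
        rows.extend(page.data)
        # next_start_index is None once the final page has been consumed.
        start = page.next_start_index
    return rows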

@@ -44,7 +44,9 @@ class PandasDataframeDataset:
         elif self.dataset_def.source.type == "rows":
             self.df = pandas.DataFrame(self.dataset_def.source.rows)
         else:
-            raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
+            raise ValueError(
+                f"Unsupported dataset source type: {self.dataset_def.source.type}"
+            )
 
         if self.df is None:
             raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
@@ -108,7 +110,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         return IterrowsResponse(
             data=rows,
-            next_index=end if end < len(dataset_impl) else None,
+            next_start_index=end if end < len(dataset_impl) else None,
         )
 
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
@@ -117,4 +119,6 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         dataset_impl.load()
 
         new_rows_df = pandas.DataFrame(rows)
-        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
+        dataset_impl.df = pandas.concat(
+            [dataset_impl.df, new_rows_df], ignore_index=True
+        )
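Both providers compute the renamed field the same way: the page's `end` offset becomes the next page's start unless the dataset is exhausted. A self-contained sketch of that arithmetic (`page_bounds` is a hypothetical helper for illustration, not part of the codebase):

from typing import Optional, Tuple


def page_bounds(start: int, limit: Optional[int], total: int) -> Tuple[int, int, Optional[int]]:
    """Slice bounds for one page, plus the start index of the next page."""
    end = total if limit is None else min(start + limit, total)
    # Mirrors `end if end < len(...) else None` above: None signals the last page.
    return start, end, end if end < total else None


# e.g. with 250 rows and pages of 100:
assert page_bounds(0, 100, 250) == (0, 100, 100)
assert page_bounds(200, 100, 250) == (200, 250, None)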

@@ -86,7 +86,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         return IterrowsResponse(
             data=rows,
-            next_index=end if end < len(loaded_dataset) else None,
+            next_start_index=end if end < len(loaded_dataset) else None,
         )
 
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
@@ -98,9 +98,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         new_dataset = hf_datasets.Dataset.from_list(rows)
 
         # Concatenate the new rows with existing dataset
-        updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
+        updated_dataset = hf_datasets.concatenate_datasets(
+            [loaded_dataset, new_dataset]
+        )
 
         if dataset_def.metadata.get("path", None):
             updated_dataset.push_to_hub(dataset_def.metadata["path"])
         else:
-            raise NotImplementedError("Uploading to URL-based datasets is not supported yet")
+            raise NotImplementedError(
+                "Uploading to URL-based datasets is not supported yet"
+            )
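The append path builds on two standard `datasets` library calls, `Dataset.from_list` and `concatenate_datasets`. A minimal standalone illustration of the same concatenation, with made-up rows and no Hub push:

# Requires the Hugging Face `datasets` package; rows are invented for the example.
import datasets as hf_datasets

loaded_dataset = hf_datasets.Dataset.from_list([{"text": "row 0"}, {"text": "row 1"}])
new_dataset = hf_datasets.Dataset.from_list([{"text": "row 2"}])

updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
assert updated_dataset.num_rows == 3  # existing rows followed by the appended one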