hf: drop columns not specified by schema

This commit is contained in:
Xi Yan 2024-12-11 17:17:17 -08:00
parent 41487e6ed1
commit e14493885b

View file

@ -21,14 +21,19 @@ DATASETS_PREFIX = "datasets:"
def load_hf_dataset(dataset_def: Dataset):
    """Load the dataset described by *dataset_def* as a Hugging Face dataset.

    When ``dataset_def.metadata`` contains a truthy ``"path"`` entry, the
    whole metadata mapping is forwarded as keyword arguments to
    ``hf_datasets.load_dataset``. Otherwise the data is fetched from
    ``dataset_def.url`` as a dataframe and converted with
    ``hf_datasets.Dataset.from_pandas``.

    If ``dataset_def.dataset_schema`` is non-empty, only the columns named by
    its keys are kept; this is applied regardless of how the data was loaded.

    Raises:
        ValueError: if no dataframe could be obtained from ``dataset_def.url``.
    """
    if dataset_def.metadata.get("path", None):
        dataset = hf_datasets.load_dataset(**dataset_def.metadata)
    else:
        df = get_dataframe_from_url(dataset_def.url)

        if df is None:
            raise ValueError(f"Failed to load dataset from {dataset_def.url}")

        dataset = hf_datasets.Dataset.from_pandas(df)

    # Drop columns not specified by the schema. This must run after both
    # loading branches so the filtered dataset is what gets returned.
    if dataset_def.dataset_schema:
        dataset = dataset.select_columns(list(dataset_def.dataset_schema.keys()))

    return dataset