mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-03 09:21:45 +00:00
hf drop rows not specified by schema
This commit is contained in:
parent
41487e6ed1
commit
e14493885b
1 changed file with 10 additions and 5 deletions
|
@ -21,14 +21,19 @@ DATASETS_PREFIX = "datasets:"
|
||||||
|
|
||||||
def load_hf_dataset(dataset_def: Dataset):
|
def load_hf_dataset(dataset_def: Dataset):
|
||||||
if dataset_def.metadata.get("path", None):
|
if dataset_def.metadata.get("path", None):
|
||||||
return hf_datasets.load_dataset(**dataset_def.metadata)
|
dataset = hf_datasets.load_dataset(**dataset_def.metadata)
|
||||||
|
else:
|
||||||
df = get_dataframe_from_url(dataset_def.url)
|
df = get_dataframe_from_url(dataset_def.url)
|
||||||
|
|
||||||
if df is None:
|
if df is None:
|
||||||
raise ValueError(f"Failed to load dataset from {dataset_def.url}")
|
raise ValueError(f"Failed to load dataset from {dataset_def.url}")
|
||||||
|
|
||||||
dataset = hf_datasets.Dataset.from_pandas(df)
|
dataset = hf_datasets.Dataset.from_pandas(df)
|
||||||
|
|
||||||
|
# drop columns not specified by schema
|
||||||
|
if dataset_def.dataset_schema:
|
||||||
|
dataset = dataset.select_columns(list(dataset_def.dataset_schema.keys()))
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue