mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 08:44:44 +00:00
hf drop rows not specified by schema
This commit is contained in:
parent
41487e6ed1
commit
e14493885b
1 changed file with 10 additions and 5 deletions
|
@@ -21,14 +21,19 @@ DATASETS_PREFIX = "datasets:"
|
|||
|
||||
def load_hf_dataset(dataset_def: Dataset):
|
||||
if dataset_def.metadata.get("path", None):
|
||||
return hf_datasets.load_dataset(**dataset_def.metadata)
|
||||
dataset = hf_datasets.load_dataset(**dataset_def.metadata)
|
||||
else:
|
||||
df = get_dataframe_from_url(dataset_def.url)
|
||||
|
||||
df = get_dataframe_from_url(dataset_def.url)
|
||||
if df is None:
|
||||
raise ValueError(f"Failed to load dataset from {dataset_def.url}")
|
||||
|
||||
if df is None:
|
||||
raise ValueError(f"Failed to load dataset from {dataset_def.url}")
|
||||
dataset = hf_datasets.Dataset.from_pandas(df)
|
||||
|
||||
# drop columns not specified by schema
|
||||
if dataset_def.dataset_schema:
|
||||
dataset = dataset.select_columns(list(dataset_def.dataset_schema.keys()))
|
||||
|
||||
dataset = hf_datasets.Dataset.from_pandas(df)
|
||||
return dataset
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue