ifevals eleuther task

This commit is contained in:
Xi Yan 2024-10-09 16:08:07 -07:00
parent 58992dacc2
commit 38e7740d0b
4 changed files with 53 additions and 6 deletions

View file

@ -55,7 +55,8 @@ async def run_main(host: str, port: int):
# Eleuther Eval Task # Eleuther Eval Task
response = await client.run_evals( response = await client.run_evals(
model="Llama3.1-8B-Instruct", model="Llama3.1-8B-Instruct",
task="meta_mmlu_pro_instruct", # task="meta_mmlu_pro_instruct",
task="meta_ifeval",
) )
cprint(response.metrics["metrics_table"], "red") cprint(response.metrics["metrics_table"], "red")

View file

@ -1,9 +1,9 @@
task: meta_ifeval task: meta_ifeval
dataset_path: parquet dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
dataset_kwargs: dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
data_files: ./work_dir/joined_ifeval.parquet
output_type: generate_until output_type: generate_until
test_split: train test_split: latest
process_docs: !function utils.process_docs
num_fewshot: 0 num_fewshot: 0
doc_to_text: prompt doc_to_text: prompt
doc_to_target: 0 doc_to_target: 0

View file

@ -7,6 +7,8 @@
import dataclasses import dataclasses
from typing import Dict, Optional, Union from typing import Dict, Optional, Union
import datasets
from lm_eval.tasks.ifeval import instructions_registry from lm_eval.tasks.ifeval import instructions_registry
@ -143,3 +145,47 @@ def agg_inst_level_acc(items):
flat_items = [item for sublist in items for item in sublist] flat_items = [item for sublist in items for item in sublist]
inst_level_acc = sum(flat_items) / len(flat_items) inst_level_acc = sum(flat_items) / len(flat_items)
return inst_level_acc return inst_level_acc
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Join the Meta ifeval eval dataset with the original IFEval dataset.

    The Meta eval dataset carries the model predictions but stores the question
    inside a serialized dialog; the original ``wis-k/instruction-following-eval``
    dataset carries the per-example ``instruction_id_list`` / ``kwargs`` needed
    for scoring. This joins the two on the question text and returns only the
    columns the ifeval scorer consumes.

    Args:
        dataset: the Meta ``ifeval__strict__details`` eval split.

    Returns:
        A ``datasets.Dataset`` with the joined, renamed columns.
    """

    def _get_question(example: dict) -> dict:
        # Pull the question text out of the serialized dialog.
        # NOTE(review): eval() executes arbitrary code if the dataset content
        # is untrusted — consider json.loads or ast.literal_eval instead.
        example["input_question"] = (
            eval(
                example["input_question"]
                .replace("null", "None")
                .replace("true", "True")
                .replace("false", "False")
            )["dialog"][0]["body"]
            # Normalize capitalization quirks so the join key matches the
            # original dataset's prompt text exactly.
            .replace("Is it True that the first song", "Is it true that the first song")
            .replace("Is the following True", "Is the following true")
        )
        example["input_final_prompts"] = example["input_final_prompts"][0]
        return example

    original_dataset_name = "wis-k/instruction-following-eval"
    ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
    ifeval_df = ifeval_data.to_pandas()
    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})

    meta_dataset = dataset.map(_get_question)
    meta_df = meta_dataset.to_pandas()

    # join the two datasets on the input_question column
    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
    joined = joined.rename(columns={"input_final_prompts": "prompt"})
    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
    joined = datasets.Dataset.from_pandas(joined)
    joined = joined.select_columns(
        [
            "input_question",
            "prompt",
            "previous_is_correct",
            "instruction_id_list",
            "kwargs",
            "output_prediction_text",
            "key",
        ]
    )
    # BUG FIX: Dataset.rename_column returns a new Dataset rather than
    # mutating in place; the original discarded the result, so the column
    # was never actually renamed. Assign it back.
    joined = joined.rename_column(
        "output_prediction_text", "previous_output_prediction_text"
    )
    return joined

View file

@ -32,4 +32,4 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
) )
dataset = dataset.rename_column("is_correct", "previously_is_correct") dataset = dataset.rename_column("is_correct", "previously_is_correct")
dataset = dataset.map(_process_doc) dataset = dataset.map(_process_doc)
return dataset.map(_process_doc) return dataset