diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index 3c9ba3bca..93c15b7ed 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -55,7 +55,8 @@ async def run_main(host: str, port: int):
     # Eleuther Eval Task
     response = await client.run_evals(
         model="Llama3.1-8B-Instruct",
-        task="meta_mmlu_pro_instruct",
+        # task="meta_mmlu_pro_instruct",
+        task="meta_ifeval",
     )
     cprint(response.metrics["metrics_table"], "red")
 
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
index c7196d16d..e10277a31 100644
--- a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
@@ -1,9 +1,9 @@
 task: meta_ifeval
-dataset_path: parquet
-dataset_kwargs:
-  data_files: ./work_dir/joined_ifeval.parquet
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
 output_type: generate_until
-test_split: train
+test_split: latest
+process_docs: !function utils.process_docs
 num_fewshot: 0
 doc_to_text: prompt
 doc_to_target: 0
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
index 5c7c92494..aa171343f 100644
--- a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
@@ -7,6 +7,8 @@
 import dataclasses
 from typing import Dict, Optional, Union
 
+import datasets
+
 from lm_eval.tasks.ifeval import instructions_registry
 
 
@@ -143,3 +145,47 @@ def agg_inst_level_acc(items):
     flat_items = [item for sublist in items for item in sublist]
     inst_level_acc = sum(flat_items) / len(flat_items)
     return inst_level_acc
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _get_question(example: dict) -> dict:
+        # get the question from the ifeval dataset
+        example["input_question"] = (
+            eval(
+                example["input_question"]
+                .replace("null", "None")
+                .replace("true", "True")
+                .replace("false", "False")
+            )["dialog"][0]["body"]
+            .replace("Is it True that the first song", "Is it true that the first song")
+            .replace("Is the following True", "Is the following true")
+        )
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+
+    original_dataset_name = "wis-k/instruction-following-eval"
+    ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+    meta_dataset = dataset.map(_get_question)
+    meta_df = meta_dataset.to_pandas()
+
+    # join the two datasets on the input_question column
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = datasets.Dataset.from_pandas(joined)
+    joined = joined.select_columns(
+        [
+            "input_question",
+            "prompt",
+            "previous_is_correct",
+            "instruction_id_list",
+            "kwargs",
+            "output_prediction_text",
+            "key",
+        ]
+    )
+    joined = joined.rename_column("output_prediction_text", "previous_output_prediction_text")
+    return joined
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
index e25717e98..6b8bc3e5b 100644
--- a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
@@ -32,4 +32,4 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
     )
     dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
-    return dataset.map(_process_doc)
+    return dataset