Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-28 15:02:37 +00:00)

Commit 38e7740d0b ("ifevals eleuther task"), parent 58992dacc2
4 changed files with 53 additions and 6 deletions
@@ -55,7 +55,8 @@ async def run_main(host: str, port: int):
     # Eleuther Eval Task
     response = await client.run_evals(
         model="Llama3.1-8B-Instruct",
-        task="meta_mmlu_pro_instruct",
+        # task="meta_mmlu_pro_instruct",
+        task="meta_ifeval",
     )
     cprint(response.metrics["metrics_table"], "red")
 
@@ -1,9 +1,9 @@
 task: meta_ifeval
-dataset_path: parquet
-dataset_kwargs:
-  data_files: ./work_dir/joined_ifeval.parquet
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
 output_type: generate_until
-test_split: train
+test_split: latest
+process_docs: !function utils.process_docs
 num_fewshot: 0
 doc_to_text: prompt
 doc_to_target: 0
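
The updated YAML now reads the IFEval details straight from the Hugging Face Hub instead of a locally joined parquet file, and hands preprocessing to utils.process_docs. A minimal sketch of loading the same split by hand, assuming the datasets library is installed and access to the gated meta-llama dataset has been granted; the dataset path, config name, and split come from the YAML above, everything else is illustrative:

from datasets import load_dataset

# Load the split referenced by dataset_path / dataset_name / test_split above.
raw = load_dataset(
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    name="Llama-3.1-8B-Instruct-evals__ifeval__strict__details",
    split="latest",
)
print(raw.column_names)  # expect fields such as input_question and input_final_prompts
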
@@ -7,6 +7,8 @@
 import dataclasses
 from typing import Dict, Optional, Union
 
+import datasets
+
 from lm_eval.tasks.ifeval import instructions_registry
 
 
@@ -143,3 +145,47 @@ def agg_inst_level_acc(items):
     flat_items = [item for sublist in items for item in sublist]
     inst_level_acc = sum(flat_items) / len(flat_items)
     return inst_level_acc
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _get_question(example: dict) -> dict:
+        # get the question from the ifeval dataset
+        example["input_question"] = (
+            eval(
+                example["input_question"]
+                .replace("null", "None")
+                .replace("true", "True")
+                .replace("false", "False")
+            )["dialog"][0]["body"]
+            .replace("Is it True that the first song", "Is it true that the first song")
+            .replace("Is the following True", "Is the following true")
+        )
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+
+    original_dataset_name = "wis-k/instruction-following-eval"
+    ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+    meta_dataset = dataset.map(_get_question)
+    meta_df = meta_dataset.to_pandas()
+
+    # join the two datasets on the input_question column
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = datasets.Dataset.from_pandas(joined)
+    joined = joined.select_columns(
+        [
+            "input_question",
+            "prompt",
+            "previous_is_correct",
+            "instruction_id_list",
+            "kwargs",
+            "output_prediction_text",
+            "key",
+        ]
+    )
+    joined = joined.rename_column("output_prediction_text", "previous_output_prediction_text")
+    return joined
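
The new process_docs joins Meta's published IFEval details with the original wis-k/instruction-following-eval prompts, so the task keeps both the rendered prompt and the instruction_id_list / kwargs fields the IFEval checkers need. A rough sketch of exercising it outside lm-eval, assuming the module above is importable as utils and both Hub datasets are reachable (the import path is hypothetical):

import datasets

from utils import process_docs  # hypothetical import path for the module shown above

meta = datasets.load_dataset(
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    name="Llama-3.1-8B-Instruct-evals__ifeval__strict__details",
    split="latest",
)
joined = process_docs(meta)
# Columns selected/renamed by process_docs: input_question, prompt,
# previous_is_correct, instruction_id_list, kwargs,
# previous_output_prediction_text, key.
print(joined.column_names)
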
@@ -32,4 +32,4 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
     )
     dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
-    return dataset.map(_process_doc)
+    return dataset