ifevals eleuther task

This commit is contained in:
Xi Yan 2024-10-09 16:08:07 -07:00
parent 58992dacc2
commit 38e7740d0b
4 changed files with 53 additions and 6 deletions

View file

@ -55,7 +55,8 @@ async def run_main(host: str, port: int):
# Eleuther Eval Task # Eleuther Eval Task
response = await client.run_evals( response = await client.run_evals(
model="Llama3.1-8B-Instruct", model="Llama3.1-8B-Instruct",
task="meta_mmlu_pro_instruct", # task="meta_mmlu_pro_instruct",
task="meta_ifeval",
) )
cprint(response.metrics["metrics_table"], "red") cprint(response.metrics["metrics_table"], "red")

View file

@ -1,9 +1,9 @@
task: meta_ifeval task: meta_ifeval
dataset_path: parquet dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
dataset_kwargs: dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
data_files: ./work_dir/joined_ifeval.parquet
output_type: generate_until output_type: generate_until
test_split: train test_split: latest
process_docs: !function utils.process_docs
num_fewshot: 0 num_fewshot: 0
doc_to_text: prompt doc_to_text: prompt
doc_to_target: 0 doc_to_target: 0

View file

@ -7,6 +7,8 @@
import dataclasses import dataclasses
from typing import Dict, Optional, Union from typing import Dict, Optional, Union
import datasets
from lm_eval.tasks.ifeval import instructions_registry from lm_eval.tasks.ifeval import instructions_registry
@ -143,3 +145,47 @@ def agg_inst_level_acc(items):
flat_items = [item for sublist in items for item in sublist] flat_items = [item for sublist in items for item in sublist]
inst_level_acc = sum(flat_items) / len(flat_items) inst_level_acc = sum(flat_items) / len(flat_items)
return inst_level_acc return inst_level_acc
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Join the Meta ifeval eval dataset with the original IFEval dataset.

    The Meta eval dataset carries the model predictions but stores the question
    inside a serialized dialog; the original ``wis-k/instruction-following-eval``
    dataset carries the per-example ``instruction_id_list`` / ``kwargs`` needed
    for scoring. This joins the two on the question text and returns only the
    columns the ifeval scorer consumes.

    Args:
        dataset: the Meta ``ifeval__strict__details`` eval split.

    Returns:
        A ``datasets.Dataset`` with the joined, renamed columns.
    """

    def _get_question(example: dict) -> dict:
        # Pull the question text out of the serialized dialog.
        # NOTE(review): eval() executes arbitrary code if the dataset content
        # is untrusted — consider json.loads or ast.literal_eval instead.
        example["input_question"] = (
            eval(
                example["input_question"]
                .replace("null", "None")
                .replace("true", "True")
                .replace("false", "False")
            )["dialog"][0]["body"]
            # Normalize capitalization quirks so the join key matches the
            # original dataset's prompt text exactly.
            .replace("Is it True that the first song", "Is it true that the first song")
            .replace("Is the following True", "Is the following true")
        )
        example["input_final_prompts"] = example["input_final_prompts"][0]
        return example

    original_dataset_name = "wis-k/instruction-following-eval"
    ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
    ifeval_df = ifeval_data.to_pandas()
    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})

    meta_dataset = dataset.map(_get_question)
    meta_df = meta_dataset.to_pandas()

    # join the two datasets on the input_question column
    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
    joined = joined.rename(columns={"input_final_prompts": "prompt"})
    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
    joined = datasets.Dataset.from_pandas(joined)
    joined = joined.select_columns(
        [
            "input_question",
            "prompt",
            "previous_is_correct",
            "instruction_id_list",
            "kwargs",
            "output_prediction_text",
            "key",
        ]
    )
    # BUG FIX: Dataset.rename_column returns a new Dataset rather than
    # mutating in place; the original discarded the result, so the column
    # was never actually renamed. Assign it back.
    joined = joined.rename_column(
        "output_prediction_text", "previous_output_prediction_text"
    )
    return joined

View file

@ -32,4 +32,4 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
) )
dataset = dataset.rename_column("is_correct", "previously_is_correct") dataset = dataset.rename_column("is_correct", "previously_is_correct")
dataset = dataset.map(_process_doc) dataset = dataset.map(_process_doc)
return dataset.map(_process_doc) return dataset