feat: [New Eval Benchmark] IfEval (#1708)

# What does this PR do?
This PR adds a new open eval benchmark, IfEval, based on the paper
https://arxiv.org/abs/2311.07911, to measure a model's
instruction-following capability.


## Test Plan
Spin up a Llama Stack server with the open-benchmark template.

Run `llama-stack-client --endpoint xxx eval run-benchmark
"meta-reference-ifeval" --model-id "meta-llama/Llama-3.3-70B-Instruct"
--output-dir "/home/markchen1015/" --num-examples 20` on the client side and
inspect the aggregated eval results.
Commit f369871083 (parent a7008dc15d), authored by Botao Chen on 2025-03-19 16:39:59 -07:00 and committed by GitHub.
13 changed files with 3,520 additions and 1 deletion.


@ -52,6 +52,7 @@ jobs:
# always test against the latest version of the client
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .
llama stack build --template ollama --image-type venv
- name: Wait for Ollama to start
run: |


@ -7,10 +7,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -23,6 +25,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -41,10 +44,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -56,6 +61,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -75,10 +81,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -91,6 +99,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -112,11 +121,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -128,6 +139,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -147,10 +159,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"mcp",
@ -164,6 +178,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -184,11 +199,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -201,6 +218,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -219,10 +237,12 @@
"blobfile",
"chardet",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"nltk",
@ -235,6 +255,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -253,11 +274,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -270,6 +293,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -288,11 +312,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -305,6 +331,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -325,11 +352,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"lm-format-enforcer",
"matplotlib",
"mcp",
@ -343,6 +372,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -365,12 +395,14 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fairscale",
"faiss-cpu",
"fastapi",
"fbgemm-gpu",
"fire",
"httpx",
"langdetect",
"lm-format-enforcer",
"matplotlib",
"mcp",
@ -384,6 +416,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -403,10 +436,12 @@
"aiosqlite",
"blobfile",
"chardet",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -418,6 +453,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -436,10 +472,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -453,6 +491,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -470,9 +509,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"mcp",
@ -486,6 +527,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -505,10 +547,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -521,6 +565,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -540,10 +585,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -556,6 +603,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -605,11 +653,13 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -622,6 +672,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -641,10 +692,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -657,6 +710,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -677,10 +731,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -693,6 +749,7 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",


@ -6268,6 +6268,7 @@
"type": "string",
"enum": [
"average",
"weighted_average",
"median",
"categorical_count",
"accuracy"


@ -4389,6 +4389,7 @@ components:
type: string
enum:
- average
- weighted_average
- median
- categorical_count
- accuracy


@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum):
@json_schema_type
class AggregationFunctionType(Enum):
average = "average"
weighted_average = "weighted_average"
median = "median"
categorical_count = "categorical_count"
accuracy = "accuracy"


@ -25,6 +25,7 @@ from .config import BasicScoringConfig
from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
from .scoring_fn.equality_scoring_fn import EqualityScoringFn
from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
from .scoring_fn.regex_parser_math_response_scoring_fn import (
RegexParserMathResponseScoringFn,
)
@ -37,6 +38,7 @@ FIXED_FNS = [
RegexParserScoringFn,
RegexParserMathResponseScoringFn,
BFCLScoringFn,
IfEvalScoringFn,
DocVQAScoringFn,
]


@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
ScoringFn,
)
ifeval = ScoringFn(
identifier="basic::ifeval",
description="Eval intruction follow capacity by checkping how many instructions can be followed in each example",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="ifeval",
params=BasicScoringFnParams(
aggregation_functions=[AggregationFunctionType.weighted_average],
),
)
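
The weighted_average default here is deliberate: each IFEval example verifies a different number of instructions, and the per-row score (produced by the scoring function in the next file) is the fraction of those instructions that were followed. Weighting each row by its instruction count therefore makes the aggregate equal to overall per-instruction accuracy. A sketch of the identity, using the score/weight fields the scoring function returns:

$$
\text{weighted\_average}
= \frac{\sum_i \text{score}_i \cdot \text{weight}_i}{\sum_i \text{weight}_i}
= \frac{\text{instructions followed across all examples}}{\text{instructions checked across all examples}}
$$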


@ -0,0 +1,79 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, Optional
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
from .fn_defs.ifeval import (
ifeval,
)
class IfEvalScoringFn(RegisteredBaseScoringFn):
"""
A scoring_fn for the Instruction-Following Eval (IFEval) benchmark.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
ifeval.identifier: ifeval,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = None,
scoring_params: Optional[ScoringFnParams] = None,
) -> ScoringResultRow:
assert scoring_fn_identifier is not None, "Scoring function identifier not found."
fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
if scoring_params is not None:
fn_def.params = scoring_params
instruction_list = input_row["instruction_id_list"]
generated_answer = input_row["generated_answer"].strip()
is_following_list = []
results = dict(
{k + "_correct": 0.0 for k in INSTRUCTION_LIST},
**{k + "_total": 0.0 for k in INSTRUCTION_LIST},
)
for index, instruction_id in enumerate(instruction_list):
instruction_cls = INSTRUCTION_DICT[instruction_id]
instruction = instruction_cls(instruction_id)
results[instruction_id + "_total"] += 1.0
results[instruction_id.split(":")[0] + "_total"] += 1.0
clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
print(clean_input_row)
instruction.build_description(**clean_input_row)
args = instruction.get_instruction_args()
if args and "prompt" in args:
instruction.build_description(prompt=input_row["prompt"])
if generated_answer and instruction.check_following(generated_answer):
is_following_list.append(True)
results[instruction_id + "_correct"] += 1.0
results[instruction_id.split(":")[0] + "_correct"] += 1.0
else:
is_following_list.append(False)
if len(is_following_list) == 0:
return {
"score": 0.0,
"weight": 0.0,
}
return {
"score": float(sum(is_following_list)) / float(len(is_following_list)),
"weight": float(len(is_following_list)),
}
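
To make the score_row contract concrete, below is a minimal sketch of an IFEval-style input row and the result it would produce if two of its three instructions were satisfied. The prompt, answer, instruction ids, and kwargs are illustrative stand-ins, not rows from the actual llamastack/IfEval dataset (the real rows also carry None placeholders in kwargs, which is why score_row filters them out):

```python
# Illustrative only: a hypothetical IFEval-style row for score_row.
example_row = {
    "prompt": "Write a short product announcement.",
    "generated_answer": "INTRODUCING THE NEW WIDGET ...",
    "instruction_id_list": [
        "change_case:english_capital",      # response must be in all capital letters
        "punctuation:no_comma",             # response must contain no commas
        "length_constraints:number_words",  # response must meet a word budget
    ],
    # One kwargs dict per instruction, aligned by index.
    "kwargs": [{}, {}, {"relation": "at least", "num_words": 50}],
}

# If the first two checks pass and the word-count check fails, score_row returns:
expected = {
    "score": 2 / 3,   # fraction of listed instructions that were followed
    "weight": 3.0,    # number of instructions checked for this example
}
```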

(File diff suppressed because it is too large.)


@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
InlineProviderSpec(
api=Api.eval,
provider_type="inline::meta-reference",
pip_packages=["tree_sitter"],
pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
module="llama_stack.providers.inline.eval.meta_reference",
config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
api_dependencies=[


@ -28,6 +28,17 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]
}
def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
return {
"weighted_average": sum(
result["score"] * result["weight"]
for result in scoring_results
if result["score"] is not None and result["weight"] is not None
)
/ sum(result["weight"] for result in scoring_results if result["weight"] is not None),
}
def aggregate_categorical_count(
scoring_results: List[ScoringResultRow],
) -> Dict[str, Any]:
@ -46,6 +57,7 @@ def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
AGGREGATION_FUNCTIONS = {
AggregationFunctionType.accuracy: aggregate_accuracy,
AggregationFunctionType.average: aggregate_average,
AggregationFunctionType.weighted_average: aggregate_weighted_average,
AggregationFunctionType.categorical_count: aggregate_categorical_count,
AggregationFunctionType.median: aggregate_median,
}
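
As a sanity check on aggregate_weighted_average, here is a small self-contained sketch of how the per-row (score, weight) pairs from the IFEval scoring function roll up; the two rows are made-up values:

```python
# Hypothetical per-row results: the first example followed both of its 2
# instructions, the second followed 1 of its 4.
rows = [
    {"score": 1.0, "weight": 2.0},
    {"score": 0.25, "weight": 4.0},
]

weighted_average = sum(r["score"] * r["weight"] for r in rows) / sum(r["weight"] for r in rows)
print(weighted_average)  # (1.0*2 + 0.25*4) / 6 = 0.5, i.e. 3 of 6 instructions followed overall
```

A plain average of the two scores would give 0.625 and over-weight the shorter example; the weighted form counts every instruction equally, which matches the per-instruction accuracy the benchmark is meant to report.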


@ -203,6 +203,13 @@ def get_distribution_template() -> DistributionTemplate:
uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
),
),
DatasetInput(
dataset_id="ifeval",
purpose=DatasetPurpose.eval_messages_answer,
source=URIDataSource(
uri="huggingface://datasets/llamastack/IfEval?split=train",
),
),
DatasetInput(
dataset_id="docvqa",
purpose=DatasetPurpose.eval_messages_answer,
@ -238,6 +245,11 @@ def get_distribution_template() -> DistributionTemplate:
dataset_id="bfcl",
scoring_functions=["basic::bfcl"],
),
BenchmarkInput(
benchmark_id="meta-reference-ifeval",
dataset_id="ifeval",
scoring_functions=["basic::ifeval"],
),
BenchmarkInput(
benchmark_id="meta-reference-docvqa",
dataset_id="docvqa",


@ -188,6 +188,12 @@ datasets:
uri: huggingface://datasets/llamastack/bfcl_v3?split=train
metadata: {}
dataset_id: bfcl
- purpose: eval/messages-answer
source:
type: uri
uri: huggingface://datasets/llamastack/IfEval?split=train
metadata: {}
dataset_id: ifeval
- purpose: eval/messages-answer
source:
type: uri
@ -221,6 +227,11 @@ benchmarks:
- basic::bfcl
metadata: {}
benchmark_id: meta-reference-bfcl
- dataset_id: ifeval
scoring_functions:
- basic::ifeval
metadata: {}
benchmark_id: meta-reference-ifeval
- dataset_id: docvqa
scoring_functions:
- basic::docvqa