forked from phoenix-oss/llama-stack-mirror
feat: [New Eval Benchmark] IfEval (#1708)
# What does this PR do?
This PR adds IfEval, a new open eval benchmark based on the paper https://arxiv.org/abs/2311.07911, to measure a model's instruction-following capability.

## Test Plan
Spin up a Llama Stack server with the open-benchmark template, then on the client side run
`llama-stack-client --endpoint xxx eval run-benchmark "meta-reference-ifeval" --model-id "meta-llama/Llama-3.3-70B-Instruct" --output-dir "/home/markchen1015/" --num-examples 20`
and check the aggregated eval results.
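For reference, roughly the same run can be started from the Python SDK. This is only a sketch: the endpoint, sampling parameters, and the exact `eval.run_eval` / benchmark-config shape are assumptions, and the CLI command above is the path that was actually tested.

```python
# Rough SDK counterpart of the CLI command above (sketch only). The base_url,
# sampling params, and the eval.run_eval()/benchmark_config shape are assumptions,
# not something exercised in this PR's test plan.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server endpoint

job = client.eval.run_eval(
    benchmark_id="meta-reference-ifeval",
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.3-70B-Instruct",
            "sampling_params": {"strategy": {"type": "greedy"}, "max_tokens": 512},
        },
    },
)
print(job)  # job handle; poll for status and fetch aggregate results when it completes
```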
parent a7008dc15d
commit f369871083
13 changed files with 3,520 additions and 1 deletion
.github/workflows/integration-tests.yml
@@ -52,6 +52,7 @@ jobs:
           # always test against the latest version of the client
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
+          llama stack build --template ollama --image-type venv

       - name: Wait for Ollama to start
         run: |
Per-distribution dependency lists: the 19 package lists touched in this file each gain the three new scoring dependencies "emoji", "langdetect", and "pythainlp". Every list is modified by the same pair of hunks; the first pair is representative:

@@ -7,10 +7,12 @@
     "chardet",
     "chromadb-client",
     "datasets",
+    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
+    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -23,6 +25,7 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
+    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",

The remaining hunks repeat the same three additions in the other distributions' lists (including the templates that also carry "fireworks-ai", "huggingface_hub", "litellm", "lm-format-enforcer", "fairscale", and "fbgemm-gpu").
docs/_static/llama-stack-spec.html
@@ -6268,6 +6268,7 @@
             "type": "string",
             "enum": [
                 "average",
+                "weighted_average",
                 "median",
                 "categorical_count",
                 "accuracy"
docs/_static/llama-stack-spec.yaml
@@ -4389,6 +4389,7 @@ components:
       type: string
       enum:
         - average
+        - weighted_average
         - median
         - categorical_count
         - accuracy
@@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum):

 @json_schema_type
 class AggregationFunctionType(Enum):
     average = "average"
+    weighted_average = "weighted_average"
     median = "median"
     categorical_count = "categorical_count"
     accuracy = "accuracy"
@@ -25,6 +25,7 @@ from .config import BasicScoringConfig
 from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
 from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
+from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
 from .scoring_fn.regex_parser_math_response_scoring_fn import (
     RegexParserMathResponseScoringFn,
 )
@@ -37,6 +38,7 @@ FIXED_FNS = [
     RegexParserScoringFn,
     RegexParserMathResponseScoringFn,
     BFCLScoringFn,
+    IfEvalScoringFn,
     DocVQAScoringFn,
 ]
llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py (new file)
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+ifeval = ScoringFn(
+    identifier="basic::ifeval",
+    description="Eval instruction-following capacity by checking how many instructions can be followed in each example",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="ifeval",
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.weighted_average],
+    ),
+)
llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py (new file)
@@ -0,0 +1,79 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
+from .fn_defs.ifeval import (
+    ifeval,
+)
+
+
+class IfEvalScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for the Instruction-Following Eval (IFEval) benchmark
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            ifeval.identifier: ifeval,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        instruction_list = input_row["instruction_id_list"]
+        generated_answer = input_row["generated_answer"].strip()
+
+        is_following_list = []
+        results = dict(
+            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
+            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
+        )
+
+        for index, instruction_id in enumerate(instruction_list):
+            instruction_cls = INSTRUCTION_DICT[instruction_id]
+            instruction = instruction_cls(instruction_id)
+            results[instruction_id + "_total"] += 1.0
+            results[instruction_id.split(":")[0] + "_total"] += 1.0
+
+            clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
+            print(clean_input_row)
+            instruction.build_description(**clean_input_row)
+            args = instruction.get_instruction_args()
+            if args and "prompt" in args:
+                instruction.build_description(prompt=input_row["prompt"])
+
+            if generated_answer and instruction.check_following(generated_answer):
+                is_following_list.append(True)
+                results[instruction_id + "_correct"] += 1.0
+                results[instruction_id.split(":")[0] + "_correct"] += 1.0
+            else:
+                is_following_list.append(False)
+
+        if len(is_following_list) == 0:
+            return {
+                "score": 0.0,
+                "weight": 0.0,
+            }
+
+        return {
+            "score": float(sum(is_following_list)) / float(len(is_following_list)),
+            "weight": float(len(is_following_list)),
+        }
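As a rough illustration of the per-row contract above, a single row can be scored directly. The prompt, instruction id, and kwargs below are invented placeholders (the instruction id is assumed to exist in INSTRUCTION_DICT); real rows come from the llamastack/IfEval dataset registered later in this PR.

```python
# Hypothetical single-row check of IfEvalScoringFn.score_row(); the row contents
# are invented for illustration and the instruction id is assumed to be present
# in INSTRUCTION_DICT -- real rows come from the IfEval dataset.
import asyncio

from llama_stack.providers.inline.scoring.basic.scoring_fn.ifeval_scoring_fn import IfEvalScoringFn

row = {
    "prompt": "Describe the ocean.",
    "generated_answer": "THE OCEAN IS VAST AND DEEP.",
    "instruction_id_list": ["change_case:english_capital"],  # assumed instruction id
    "kwargs": [{}],  # no per-instruction kwargs needed for this one
}

result = asyncio.run(IfEvalScoringFn().score_row(row, scoring_fn_identifier="basic::ifeval"))
print(result)  # e.g. {"score": 1.0, "weight": 1.0} if every listed instruction is followed
```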
llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py (new file, 3,319 lines)
File diff suppressed because it is too large.
@@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
     InlineProviderSpec(
         api=Api.eval,
         provider_type="inline::meta-reference",
-        pip_packages=["tree_sitter"],
+        pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
         module="llama_stack.providers.inline.eval.meta_reference",
         config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
         api_dependencies=[
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"weighted_average": sum(
|
||||||
|
result["score"] * result["weight"]
|
||||||
|
for result in scoring_results
|
||||||
|
if result["score"] is not None and result["weight"] is not None
|
||||||
|
)
|
||||||
|
/ sum(result["weight"] for result in scoring_results if result["weight"] is not None),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def aggregate_categorical_count(
|
def aggregate_categorical_count(
|
||||||
scoring_results: List[ScoringResultRow],
|
scoring_results: List[ScoringResultRow],
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
@ -46,6 +57,7 @@ def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
|
||||||
AGGREGATION_FUNCTIONS = {
|
AGGREGATION_FUNCTIONS = {
|
||||||
AggregationFunctionType.accuracy: aggregate_accuracy,
|
AggregationFunctionType.accuracy: aggregate_accuracy,
|
||||||
AggregationFunctionType.average: aggregate_average,
|
AggregationFunctionType.average: aggregate_average,
|
||||||
|
AggregationFunctionType.weighted_average: aggregate_weighted_average,
|
||||||
AggregationFunctionType.categorical_count: aggregate_categorical_count,
|
AggregationFunctionType.categorical_count: aggregate_categorical_count,
|
||||||
AggregationFunctionType.median: aggregate_median,
|
AggregationFunctionType.median: aggregate_median,
|
||||||
}
|
}
|
||||||
|
|
|
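A quick toy check of the new aggregation (numbers are made up, and the module path is assumed from the base_scoring_fn import used earlier): two rows with scores 1.0 (weight 2.0) and 0.0 (weight 1.0) aggregate to (1.0*2.0 + 0.0*1.0) / (2.0 + 1.0) ≈ 0.667, whereas the unweighted average would report 0.5.

```python
# Toy demonstration of aggregate_weighted_average; values are invented and the
# module path is an assumption based on the imports shown in this PR.
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_weighted_average

rows = [
    {"score": 1.0, "weight": 2.0},  # an example with two instructions, both followed
    {"score": 0.0, "weight": 1.0},  # an example with one instruction, not followed
]
print(aggregate_weighted_average(rows))  # {'weighted_average': 0.666...}
```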
@@ -203,6 +203,13 @@ def get_distribution_template() -> DistributionTemplate:
                 uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
             ),
         ),
+        DatasetInput(
+            dataset_id="ifeval",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/IfEval?split=train",
+            ),
+        ),
         DatasetInput(
             dataset_id="docvqa",
             purpose=DatasetPurpose.eval_messages_answer,
@@ -238,6 +245,11 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="bfcl",
             scoring_functions=["basic::bfcl"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-ifeval",
+            dataset_id="ifeval",
+            scoring_functions=["basic::ifeval"],
+        ),
         BenchmarkInput(
             benchmark_id="meta-reference-docvqa",
             dataset_id="docvqa",
@@ -188,6 +188,12 @@ datasets:
     uri: huggingface://datasets/llamastack/bfcl_v3?split=train
   metadata: {}
   dataset_id: bfcl
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/IfEval?split=train
+  metadata: {}
+  dataset_id: ifeval
 - purpose: eval/messages-answer
   source:
     type: uri
@@ -221,6 +227,11 @@ benchmarks:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
+- dataset_id: ifeval
+  scoring_functions:
+  - basic::ifeval
+  metadata: {}
+  benchmark_id: meta-reference-ifeval
 - dataset_id: docvqa
   scoring_functions:
   - basic::docvqa
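On a stack that was not built from the open-benchmark template, roughly the same dataset and benchmark could be registered through the client before running the eval. This is a sketch only: the `register()` method names and argument shapes are assumptions, and the run.yaml entries above are what the template actually ships.

```python
# Hedged sketch of manual registration via the client API. Method names and
# argument shapes are assumptions; the open-benchmark template registers these
# automatically through the run.yaml entries above.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed endpoint

client.datasets.register(
    dataset_id="ifeval",
    purpose="eval/messages-answer",
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/IfEval?split=train"},
)

client.benchmarks.register(
    benchmark_id="meta-reference-ifeval",
    dataset_id="ifeval",
    scoring_functions=["basic::ifeval"],
)
```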