feat: [New Eval Benchmark] IfEval (#1708)

# What does this PR do?
This PR adds IfEval, a new open eval benchmark based on the paper
https://arxiv.org/abs/2311.07911, to measure a model's
instruction-following capability.
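
For background, IfEval scores responses against programmatically verifiable instructions (e.g. "answer in at least N bullet points"). A minimal sketch of one such check, purely illustrative and not the benchmark's actual checker code:

```python
# Illustrative IfEval-style verifiable-instruction check (not the benchmark's
# actual implementation): "answer in at least N markdown bullet points".

def check_min_bullets(response: str, min_bullets: int) -> bool:
    """True if the response contains at least `min_bullets` markdown bullets."""
    bullets = [ln for ln in response.splitlines() if ln.lstrip().startswith(("- ", "* "))]
    return len(bullets) >= min_bullets

assert check_min_bullets("- one\n- two\n- three", min_bullets=3)
assert not check_min_bullets("just prose, no bullets", min_bullets=1)
```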


## Test Plan
Spin up a Llama Stack server with the open-benchmark template.

On the client side, run `llama-stack-client --endpoint xxx eval run-benchmark
"meta-reference-ifeval" --model-id "meta-llama/Llama-3.3-70B-Instruct"
--output-dir "/home/markchen1015/" --num-examples 20` and check the
aggregate eval results.
Botao Chen, committed via GitHub on 2025-03-19 16:39:59 -07:00
commit f369871083 (parent a7008dc15d)
13 changed files with 3520 additions and 1 deletion


```diff
@@ -28,6 +28,17 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]
     }
 
 
+def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+    return {
+        "weighted_average": sum(
+            result["score"] * result["weight"]
+            for result in scoring_results
+            if result["score"] is not None and result["weight"] is not None
+        )
+        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
+    }
+
+
 def aggregate_categorical_count(
     scoring_results: List[ScoringResultRow],
 ) -> Dict[str, Any]:
```
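
The first hunk adds the `aggregate_weighted_average` aggregator. Below is a self-contained sanity check of its behavior; `ScoringResultRow` is treated as a plain dict here (an assumption — only the `score` and `weight` field names come from the diff):

```python
from typing import Any, Dict, List

ScoringResultRow = Dict[str, Any]  # stand-in for the real type

def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # Same body as the diff above.
    return {
        "weighted_average": sum(
            result["score"] * result["weight"]
            for result in scoring_results
            if result["score"] is not None and result["weight"] is not None
        )
        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
    }

rows = [
    {"score": 1.0, "weight": 3},   # e.g. 3 verifiable instructions, all followed
    {"score": 0.5, "weight": 2},   # 1 of 2 instructions followed
    {"score": None, "weight": 1},  # unscored row
]
# numerator = 1.0*3 + 0.5*2 = 4.0; denominator = 3 + 2 + 1 = 6
print(aggregate_weighted_average(rows))  # {'weighted_average': 0.666...}
```

Note that a row with a `None` score but a non-`None` weight is skipped in the numerator yet still counted in the denominator, so unscored rows pull the average down.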
```diff
@@ -46,6 +57,7 @@ def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
 AGGREGATION_FUNCTIONS = {
     AggregationFunctionType.accuracy: aggregate_accuracy,
     AggregationFunctionType.average: aggregate_average,
+    AggregationFunctionType.weighted_average: aggregate_weighted_average,
     AggregationFunctionType.categorical_count: aggregate_categorical_count,
     AggregationFunctionType.median: aggregate_median,
 }
```
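
The second hunk registers the new aggregator so it can be selected by enum value. A sketch of that dispatch under assumptions: the enum members mirror the registry keys in the diff, their string values are guesses, and the lookup-and-call at the end stands in for whatever the scoring pipeline actually does:

```python
from enum import Enum
from typing import Any, Dict, List

ScoringResultRow = Dict[str, Any]

def aggregate_weighted_average(rows: List[ScoringResultRow]) -> Dict[str, Any]:
    # Same logic as the diff, abbreviated for the dispatch demo.
    num = sum(r["score"] * r["weight"] for r in rows
              if r["score"] is not None and r["weight"] is not None)
    den = sum(r["weight"] for r in rows if r["weight"] is not None)
    return {"weighted_average": num / den}

class AggregationFunctionType(Enum):
    # Members mirror the registry keys in the diff; string values are assumed.
    accuracy = "accuracy"
    average = "average"
    weighted_average = "weighted_average"
    categorical_count = "categorical_count"
    median = "median"

# Subset of the registry from the diff.
AGGREGATION_FUNCTIONS = {
    AggregationFunctionType.weighted_average: aggregate_weighted_average,
}

# Hypothetical dispatch: look up the aggregator by type and apply it.
rows = [{"score": 1.0, "weight": 3}, {"score": 0.5, "weight": 2}]
result = AGGREGATION_FUNCTIONS[AggregationFunctionType.weighted_average](rows)
print(result)  # {'weighted_average': 0.8}
```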