From b751f7003dc791a793add93b2722419660f9107d Mon Sep 17 00:00:00 2001
From: Botao Chen
Date: Wed, 19 Feb 2025 19:42:04 -0800
Subject: [PATCH] feat: add aggregation_functions to
 llm_as_judge_405b_simpleqa (#1164)

As titled: let the scoring function llm_as_judge_405b_simpleqa output
aggregated_results. We can leverage categorical_count to calculate the
percentage of correct answers as an eval benchmark metric.
---
 .../scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
index a53c5cfa7..074f1ff46 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
@@ -5,7 +5,11 @@
 # the root directory of this source tree.
 
 from llama_stack.apis.common.type_system import NumberType
-from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFn
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    LLMAsJudgeScoringFnParams,
+    ScoringFn,
+)
 
 GRADER_TEMPLATE = """
 Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
@@ -87,5 +91,6 @@ llm_as_judge_405b_simpleqa = ScoringFn(
         judge_model="meta-llama/Llama-3.1-405B-Instruct",
         prompt_template=GRADER_TEMPLATE,
         judge_score_regexes=[r"(A|B|C)"],
+        aggregation_functions=[AggregationFunctionType.categorical_count.value],
     ),
 )
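
Note for readers unfamiliar with categorical_count: below is a minimal,
self-contained sketch of what this kind of aggregation does over per-row
judge grades. It is illustrative only, not the Llama Stack implementation;
the function name aggregate_categorical_count and the row shape are
hypothetical, and the grade labels are taken from GRADER_TEMPLATE's A/B/C
choices.

# Illustrative sketch, not the Llama Stack implementation: shows how a
# categorical_count aggregation can turn per-row judge grades into counts,
# from which the percentage of correct answers is derived.
from collections import Counter

def aggregate_categorical_count(score_rows):
    """Count occurrences of each grade ("A"/"B"/"C") across scored rows."""
    # Each row is assumed to look like {"score": "A"}; this row shape is a
    # hypothetical stand-in, not the provider's actual data structure.
    counts = Counter(row["score"] for row in score_rows)
    return {"categorical_count": dict(counts)}

rows = [{"score": "A"}, {"score": "A"}, {"score": "B"}, {"score": "C"}]
agg = aggregate_categorical_count(rows)
total = sum(agg["categorical_count"].values())
pct_correct = 100.0 * agg["categorical_count"].get("A", 0) / total
print(agg)                            # {'categorical_count': {'A': 2, 'B': 1, 'C': 1}}
print(f"{pct_correct:.0f}% correct")  # 50% correct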