feat: add aggregation_functions to llm_as_judge_405b_simpleqa (#1164)
As the title says: let the scoring function llm_as_judge_405b_simpleqa output aggregated_results. We can then leverage categorical_count to compute the percentage of correct answers as an eval benchmark metric.
parent c1f7d7f005
commit b751f7003d

1 changed file with 6 additions and 1 deletion
@@ -5,7 +5,11 @@
 # the root directory of this source tree.
 
 from llama_stack.apis.common.type_system import NumberType
-from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFn
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    LLMAsJudgeScoringFnParams,
+    ScoringFn,
+)
 
 GRADER_TEMPLATE = """
 Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].

@@ -87,5 +91,6 @@ llm_as_judge_405b_simpleqa = ScoringFn(
         judge_model="meta-llama/Llama-3.1-405B-Instruct",
         prompt_template=GRADER_TEMPLATE,
         judge_score_regexes=[r"(A|B|C)"],
+        aggregation_functions=[AggregationFunctionType.categorical_count.value],
     ),
 )
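For context, here is a minimal sketch of how a categorical_count aggregation could be turned into the correctness percentage mentioned above. The dict shape of the aggregated counts and the helper name correctness_rate are assumptions for illustration, not the exact llama-stack output format.

# Hypothetical sketch: derive a correctness percentage from a
# categorical_count aggregation. The exact shape of aggregated_results is
# an assumption; consult the llama-stack scoring API for the real format.
from typing import Dict


def correctness_rate(category_counts: Dict[str, int]) -> float:
    """Return the fraction of graded rows judged CORRECT (grade "A")."""
    total = sum(category_counts.values())
    if total == 0:
        return 0.0
    # The judge emits A/B/C grades; "A" maps to CORRECT in the SimpleQA
    # grading template, per judge_score_regexes=[r"(A|B|C)"] above.
    return category_counts.get("A", 0) / total


# Example: counts as categorical_count might report them for one eval run.
counts = {"A": 72, "B": 20, "C": 8}
print(f"correctness: {correctness_rate(counts):.1%}")  # -> correctness: 72.0%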