From b1ebc837f80c527d679e70eb4234a54881b28beb Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 11 Nov 2024 15:49:18 -0500 Subject: [PATCH] refactor scoring --- .../scoring_fn/fn_defs/llm_as_judge_base.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py index f7de54f46..171e09def 100644 --- a/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py +++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py @@ -8,32 +8,9 @@ from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 from llama_stack.apis.common.type_system import NumberType -# JUDGE_PROMPT = """ -# You will be given a question, a expected_answer, and a system_answer. -# Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question. -# Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question. -# Provide your feedback as follows: -# Feedback::: -# Total rating: (your rating, as a int between 0 and 5) -# Now here are the question, expected_answer, system_answer. -# Question: {input_query} -# Expected Answer: {expected_answer} -# System Answer: {generated_answer} -# Feedback::: -# Total rating: -# """ llm_as_judge_base = ScoringFnDef( identifier="meta-reference::llm_as_judge_base", description="Llm As Judge Scoring Function", return_type=NumberType(), - # params=LLMAsJudgeScoringFnParams( - # prompt_template=JUDGE_PROMPT, - # judge_model="Llama3.1-8B-Instruct", - # judge_score_regexes=[ - # r"Total rating: (\d+)", - # r"rating: (\d+)", - # r"Rating: (\d+)", - # ], - # ), )