Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-11 19:56:03 +00:00)
Commit 95fd53d292: registry refactor
Parent: c50686b6fe
8 changed files with 39 additions and 71 deletions
@@ -71,6 +71,10 @@ class MetaReferenceEvalsImpl(Evals):
        dataset_config: EvaluateDatasetConfig,
        eval_scoring_config: EvaluateScoringConfig,
    ) -> EvaluateResponse:
        cprint("run_scorer")

        # main logic, we need to convert the dataset into List[ScorerInputSample]

        return EvaluateResponse(
            eval_result={},
        )
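The inline comment above notes that the remaining work is converting the dataset into a List[ScorerInputSample]. A minimal sketch of what that conversion could look like, assuming plain dict rows and a trimmed-down ScorerInputSample; the row keys and field names here are illustrative assumptions, not the repository's actual API:

# Hypothetical illustration of the dataset -> List[ScorerInputSample] conversion
# described above; row keys and the simplified ScorerInputSample are assumptions.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ScorerInputSample:
    generated_answer: Optional[str]  # answer extracted from the model output
    expected_answer: str             # gold answer from the dataset row


def rows_to_scorer_inputs(rows: List[dict]) -> List[ScorerInputSample]:
    # each row is assumed to carry the model's extracted answer and the gold answer
    return [
        ScorerInputSample(
            generated_answer=row.get("generated_answer"),
            expected_answer=row["expected_answer"],
        )
        for row in rows
    ]


rows = [{"generated_answer": "A", "expected_answer": "A"}]
print(rows_to_scorer_inputs(rows))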
@@ -153,35 +153,9 @@ class MMLUProcessor(
                break

        return ScorerInputSample(
            generated_answer=extracted_answer,
            expected_answer=dataset_sample.data["Answer"],
            generation_output=PostprocessedGeneration(
                completion_message=response_text,
                transformed_generation=extracted_answer,
            ),
            expected_output=dataset_sample.data["Answer"],
        )

    # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult:
    #     postprocessed_output = sample.postprocessed["postprocessed"]
    #     expected_answer = sample.data["Answer"]

    #     extracted_answer = None
    #     for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
    #         regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
    #         match = re.search(regex, postprocessed_output)
    #         if match:
    #             extracted_answer = normalize_extracted_answer(match.group(1))
    #             break

    #     score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0

    #     return SingleEvalResult(
    #         score_data={
    #             "score": score,
    #         },
    #     )

    # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
    #     print("aggregate_results", eval_results)
    #     sum_score = sum([result.score_data["score"] for result in eval_results])

    #     return EvalResult(metrics={"score": str(sum_score / len(eval_results))})
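For orientation, a rough sketch of the shape ScorerInputSample appears to take after this refactor, inferred only from the fields used in the hunk above (generated_answer, expected_answer, and the generation_output / PostprocessedGeneration pair); the real definitions live elsewhere in the repository and may differ:

# Hypothetical reconstruction for illustration only; not the repository's actual types.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PostprocessedGeneration:
    completion_message: str                        # raw model completion text
    transformed_generation: Optional[str] = None   # e.g. the extracted answer letter


@dataclass
class ScorerInputSample:
    generated_answer: Optional[str]                # what the scorer compares (new field name)
    expected_answer: str                           # gold label (new field name)
    generation_output: Optional[PostprocessedGeneration] = None  # kept for inspection/debugging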
@@ -28,8 +28,8 @@ class RandomScorer(BaseScorer[ScorerInputSample]):

class AccuracyScorer(BaseScorer[ScorerInputSample]):
    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
-       extracted_answer = scorer_input_sample.generation_output.transformed_generation
-       expected_answer = scorer_input_sample.expected_output
+       extracted_answer = scorer_input_sample.generated_answer
+       expected_answer = scorer_input_sample.expected_answer

        accuracy = (
            1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
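Putting the rename together, a self-contained sketch of how the accuracy scoring behaves with the new field names, plus a mean aggregation like the commented-out aggregate_results shown earlier; the stand-in types below are assumptions, not the framework's BaseScorer/SingleEvalResult API:

# Illustrative stand-alone sketch, assuming the field names introduced by this refactor.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Sample:
    generated_answer: Optional[str]  # answer extracted from the model output
    expected_answer: str             # gold answer


def accuracy_score(sample: Sample) -> float:
    # 1.0 only when an answer was extracted and it matches the gold answer
    return 1.0 if sample.generated_answer and sample.generated_answer == sample.expected_answer else 0.0


def aggregate(scores: List[float]) -> float:
    # mean of per-sample scores, mirroring the removed aggregate_results helper
    return sum(scores) / len(scores) if scores else 0.0


if __name__ == "__main__":
    samples = [Sample("B", "B"), Sample("C", "B"), Sample(None, "B")]
    print(aggregate([accuracy_score(s) for s in samples]))  # 0.333...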