Folder restructure for evals/datasets/scoring (#419)

* rename evals related stuff

* fix datasetio

* fix scoring test

* localfs -> LocalFS

* refactor scoring

* refactor scoring

* remove 8b_correctness scoring_fn from tests

* tests w/ eval params

* scoring fn braintrust fixture

* import
Xi Yan 2024-11-11 17:35:40 -05:00 committed by GitHub
parent 2b7d70ba86
commit b4416b72fd
37 changed files with 141 additions and 100 deletions


@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List

from llama_stack.apis.scoring import ScoringResultRow


def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # Treat each row's "score" as 0/1 correctness and report the mean plus counts.
    num_correct = sum(result["score"] for result in scoring_results)
    avg_score = num_correct / len(scoring_results)

    return {
        "accuracy": avg_score,
        "num_correct": num_correct,
        "num_total": len(scoring_results),
    }


def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # Average the "score" field over rows where it is present (non-None).
    return {
        "average": sum(
            result["score"] for result in scoring_results if result["score"] is not None
        )
        / len([_ for _ in scoring_results if _["score"] is not None]),
    }
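For context, a minimal usage sketch (not part of the diff) of how these aggregation helpers could be called. The example rows are invented for illustration and assume a ScoringResultRow can be treated as a plain dict with a numeric "score" field, which is how the functions above index into it:

# Hypothetical example rows -- each row is assumed to be a dict-like
# ScoringResultRow carrying a numeric "score" (1.0/0.0 for accuracy-style scorers).
rows = [
    {"input_query": "q1", "score": 1.0},
    {"input_query": "q2", "score": 0.0},
    {"input_query": "q3", "score": 1.0},
]

print(aggregate_accuracy(rows))  # {'accuracy': 0.666..., 'num_correct': 2.0, 'num_total': 3}
print(aggregate_average(rows))   # {'average': 0.666...}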