forked from phoenix-oss/llama-stack-mirror
Folder restructure for evals/datasets/scoring (#419)
* rename evals related stuff
* fix datasetio
* fix scoring test
* localfs -> LocalFS
* refactor scoring
* refactor scoring
* remove 8b_correctness scoring_fn from tests
* tests w/ eval params
* scoring fn braintrust fixture
* import
parent 2b7d70ba86
commit b4416b72fd
37 changed files with 141 additions and 100 deletions
llama_stack/providers/utils/scoring/aggregation_utils.py (new file, 28 additions)
@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List

from llama_stack.apis.scoring import ScoringResultRow


def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    num_correct = sum(result["score"] for result in scoring_results)
    avg_score = num_correct / len(scoring_results)

    return {
        "accuracy": avg_score,
        "num_correct": num_correct,
        "num_total": len(scoring_results),
    }


def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    return {
        "average": sum(
            result["score"] for result in scoring_results if result["score"] is not None
        )
        / len([_ for _ in scoring_results if _["score"] is not None]),
    }
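
For reference, a minimal usage sketch of the two helpers added above (not part of the commit): it assumes ScoringResultRow rows behave like dicts with a numeric "score" field, and that the module is importable from the path introduced by this restructure.

# Minimal sketch: call the aggregation helpers on a small list of score rows.
# Assumes llama_stack is installed and the module lives at the new path.
from llama_stack.providers.utils.scoring.aggregation_utils import (
    aggregate_accuracy,
    aggregate_average,
)

rows = [{"score": 1.0}, {"score": 0.0}, {"score": 1.0}]

print(aggregate_accuracy(rows))
# -> {"accuracy": 0.666..., "num_correct": 2.0, "num_total": 3}

print(aggregate_average(rows))
# -> {"average": 0.666...}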