From b4416b72fd4e7728e53d38069d810a7c6487322c Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 11 Nov 2024 17:35:40 -0500
Subject: [PATCH] Folder restructure for evals/datasets/scoring (#419)

* rename evals related stuff
* fix datasetio
* fix scoring test
* localfs -> LocalFS
* refactor scoring
* refactor scoring
* remove 8b_correctness scoring_fn from tests
* tests w/ eval params
* scoring fn braintrust fixture
* import
---
 .../localfs}/__init__.py                       |  8 +--
 .../datasetio => datasetio/localfs}/config.py  |  2 +-
 .../localfs}/datasetio.py                      |  6 +-
 .../eval => eval/meta_reference}/__init__.py   |  0
 .../eval => eval/meta_reference}/config.py     |  0
 .../eval => eval/meta_reference}/eval.py       |  3 +-
 .../braintrust}/__init__.py                    |  0
 .../braintrust}/braintrust.py                  |  5 +-
 .../scoring => scoring/braintrust}/config.py   |  0
 .../braintrust}/scoring_fn/__init__.py         |  0
 .../scoring_fn/fn_defs/__init__.py             |  0
 .../scoring_fn/fn_defs/answer_correctness.py   |  0
 .../scoring_fn/fn_defs/factuality.py           |  0
 .../meta_reference}/__init__.py                |  0
 .../meta_reference}/config.py                  |  0
 .../meta_reference}/scoring.py                 |  0
 .../meta_reference}/scoring_fn/__init__.py     |  0
 .../scoring_fn/base_scoring_fn.py              |  0
 .../scoring_fn/equality_scoring_fn.py          | 12 +---
 .../scoring_fn/fn_defs/__init__.py             |  0
 .../scoring_fn/fn_defs/equality.py             |  0
 .../scoring_fn/fn_defs/llm_as_judge_base.py    | 15 +++++
 .../regex_parser_multiple_choice_answer.py     |  0
 .../scoring_fn/fn_defs/subset_of.py            |  0
 .../scoring_fn/llm_as_judge_scoring_fn.py      | 16 ++----
 .../scoring_fn/regex_parser_scoring_fn.py      |  2 +-
 .../scoring_fn/subset_of_scoring_fn.py         | 12 +---
 llama_stack/providers/registry/datasetio.py    |  6 +-
 llama_stack/providers/registry/eval.py         |  4 +-
 llama_stack/providers/registry/scoring.py      |  8 +--
 .../providers/tests/datasetio/fixtures.py      |  8 +--
 .../eval/constants.py}                         | 19 -------
 llama_stack/providers/tests/eval/test_eval.py  | 20 +++++--
 .../providers/tests/scoring/conftest.py        | 14 ++++-
 .../providers/tests/scoring/fixtures.py        | 22 +++++---
 .../providers/tests/scoring/test_scoring.py    | 56 ++++++++++++++++---
 .../scoring/aggregation_utils.py}              |  3 -
 37 files changed, 141 insertions(+), 100 deletions(-)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/__init__.py (60%)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/config.py (83%)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/datasetio.py (95%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/config.py (100%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/eval.py (99%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/braintrust.py (98%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/config.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/answer_correctness.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/factuality.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/config.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/base_scoring_fn.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/equality_scoring_fn.py (82%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/equality.py (100%)
 create mode 100644 llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/subset_of.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/llm_as_judge_scoring_fn.py (86%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/regex_parser_scoring_fn.py (96%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/subset_of_scoring_fn.py (80%)
 rename llama_stack/providers/{inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py => tests/eval/constants.py} (60%)
 rename llama_stack/providers/{inline/meta_reference/scoring/scoring_fn/common.py => utils/scoring/aggregation_utils.py} (92%)

diff --git a/llama_stack/providers/inline/meta_reference/datasetio/__init__.py b/llama_stack/providers/inline/datasetio/localfs/__init__.py
similarity index 60%
rename from llama_stack/providers/inline/meta_reference/datasetio/__init__.py
rename to llama_stack/providers/inline/datasetio/localfs/__init__.py
index 9a65f5c3e..db8aa555c 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/__init__.py
+++ b/llama_stack/providers/inline/datasetio/localfs/__init__.py
@@ -4,15 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .config import MetaReferenceDatasetIOConfig
+from .config import LocalFSDatasetIOConfig


 async def get_provider_impl(
-    config: MetaReferenceDatasetIOConfig,
+    config: LocalFSDatasetIOConfig,
     _deps,
 ):
-    from .datasetio import MetaReferenceDatasetIOImpl
+    from .datasetio import LocalFSDatasetIOImpl

-    impl = MetaReferenceDatasetIOImpl(config)
+    impl = LocalFSDatasetIOImpl(config)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/inline/meta_reference/datasetio/config.py b/llama_stack/providers/inline/datasetio/localfs/config.py
similarity index 83%
rename from llama_stack/providers/inline/meta_reference/datasetio/config.py
rename to llama_stack/providers/inline/datasetio/localfs/config.py
index e667e3252..58d563c99 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/config.py
+++ b/llama_stack/providers/inline/datasetio/localfs/config.py
@@ -6,4 +6,4 @@
 from llama_stack.apis.datasetio import * # noqa: F401, F403

-class MetaReferenceDatasetIOConfig(BaseModel): ...
+class LocalFSDatasetIOConfig(BaseModel): ...
diff --git a/llama_stack/providers/inline/meta_reference/datasetio/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
similarity index 95%
rename from llama_stack/providers/inline/meta_reference/datasetio/datasetio.py
rename to llama_stack/providers/inline/datasetio/localfs/datasetio.py
index a6fe4feb3..d8c100684 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -15,7 +15,7 @@ from dataclasses import dataclass
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url

-from .config import MetaReferenceDatasetIOConfig
+from .config import LocalFSDatasetIOConfig


 class BaseDataset(ABC):
@@ -77,8 +77,8 @@ class PandasDataframeDataset(BaseDataset):
         self.df = self._validate_dataset_schema(df)


-class MetaReferenceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    def __init__(self, config: MetaReferenceDatasetIOConfig) -> None:
+class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
+    def __init__(self, config: LocalFSDatasetIOConfig) -> None:
         self.config = config
         # local registry for keeping track of datasets within the provider
         self.dataset_infos = {}
diff --git a/llama_stack/providers/inline/meta_reference/eval/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/eval/__init__.py
rename to llama_stack/providers/inline/eval/meta_reference/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/eval/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/eval/config.py
rename to llama_stack/providers/inline/eval/meta_reference/config.py
diff --git a/llama_stack/providers/inline/meta_reference/eval/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
similarity index 99%
rename from llama_stack/providers/inline/meta_reference/eval/eval.py
rename to llama_stack/providers/inline/eval/meta_reference/eval.py
index 48d8e2b04..df642f33b 100644
--- a/llama_stack/providers/inline/meta_reference/eval/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -9,14 +9,13 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
 from .....apis.common.job_types import Job
 from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
 from llama_stack.apis.common.type_system import * # noqa: F403
-from tqdm import tqdm
-
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval_tasks import EvalTaskDef
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.scoring import Scoring
 from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from tqdm import tqdm

 from .config import MetaReferenceEvalConfig
diff --git a/llama_stack/providers/inline/braintrust/scoring/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
similarity index 98%
rename from llama_stack/providers/inline/braintrust/scoring/braintrust.py
rename to llama_stack/providers/inline/scoring/braintrust/braintrust.py
index 6488a63eb..57723bb47 100644
--- a/llama_stack/providers/inline/braintrust/scoring/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -16,9 +16,8 @@ from llama_stack.apis.datasets import * # noqa: F403
 from autoevals.llm import Factuality
 from autoevals.ragas import AnswerCorrectness
 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)
+
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average

 from .config import BraintrustScoringConfig
 from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
diff --git a/llama_stack/providers/inline/braintrust/scoring/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/config.py
rename to llama_stack/providers/inline/scoring/braintrust/config.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/__init__.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/__init__.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/answer_correctness.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/answer_correctness.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/factuality.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/factuality.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/config.py b/llama_stack/providers/inline/scoring/meta_reference/config.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/config.py
rename to llama_stack/providers/inline/scoring/meta_reference/config.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring.py b/llama_stack/providers/inline/scoring/meta_reference/scoring.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/base_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/base_scoring_fn.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/base_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/base_scoring_fn.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
similarity index 82%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
index 07405d56c..877b64e4e 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
@@ -4,20 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.equality import (
-    equality,
-)
+from .fn_defs.equality import equality


 class EqualityScoringFn(BaseScoringFn):
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/equality.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/equality.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/equality.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/equality.py
diff --git a/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
new file mode 100644
index 000000000..69d96e1bf
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import ScoringFnDef
+
+
+llm_as_judge_base = ScoringFnDef(
+    identifier="meta-reference::llm_as_judge_base",
+    description="Llm As Judge Scoring Function",
+    return_type=NumberType(),
+)
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/subset_of.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/subset_of.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/subset_of.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/subset_of.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
similarity index 86%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
index f98f7fb5e..e1f19e640 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
@@ -4,20 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack.apis.inference.inference import Inference
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
 import re
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.llm_as_judge_8b_correctness import (
-    llm_as_judge_8b_correctness,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average
+
+from .fn_defs.llm_as_judge_base import llm_as_judge_base


 class LlmAsJudgeScoringFn(BaseScoringFn):
@@ -29,7 +25,7 @@ class LlmAsJudgeScoringFn(BaseScoringFn):
         super().__init__(*arg, **kwargs)
         self.inference_api = inference_api
         self.supported_fn_defs_registry = {
-            llm_as_judge_8b_correctness.identifier: llm_as_judge_8b_correctness,
+            llm_as_judge_base.identifier: llm_as_judge_base,
         }

     async def score_row(
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
similarity index 96%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
index 0aff2f535..3cbc6cbe4 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
@@ -9,7 +9,7 @@ from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from .common import aggregate_accuracy
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

 from .fn_defs.regex_parser_multiple_choice_answer import (
     regex_parser_multiple_choice_answer,
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
similarity index 80%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
index 289c63dd7..fe5988160 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
@@ -4,19 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.subset_of import (
-    subset_of,
-)
+from .fn_defs.subset_of import subset_of


 class SubsetOfScoringFn(BaseScoringFn):
diff --git a/llama_stack/providers/registry/datasetio.py b/llama_stack/providers/registry/datasetio.py
index 3fdeac997..2d1c722f0 100644
--- a/llama_stack/providers/registry/datasetio.py
+++ b/llama_stack/providers/registry/datasetio.py
@@ -13,10 +13,10 @@ def available_providers() -> List[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.datasetio,
-            provider_type="meta-reference",
+            provider_type="localfs",
             pip_packages=["pandas"],
-            module="llama_stack.providers.inline.meta_reference.datasetio",
-            config_class="llama_stack.providers.inline.meta_reference.datasetio.MetaReferenceDatasetIOConfig",
+            module="llama_stack.providers.inline.datasetio.localfs",
+            config_class="llama_stack.providers.inline.datasetio.localfs.LocalFSDatasetIOConfig",
             api_dependencies=[],
         ),
         remote_provider_spec(
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
index 9b9ba6409..275cc92db 100644
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.eval,
             provider_type="meta-reference",
             pip_packages=[],
-            module="llama_stack.providers.inline.meta_reference.eval",
-            config_class="llama_stack.providers.inline.meta_reference.eval.MetaReferenceEvalConfig",
+            module="llama_stack.providers.inline.eval.meta_reference",
+            config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py
index 2586083f6..70f43ad73 100644
--- a/llama_stack/providers/registry/scoring.py
+++ b/llama_stack/providers/registry/scoring.py
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.scoring,
             provider_type="meta-reference",
             pip_packages=[],
-            module="llama_stack.providers.inline.meta_reference.scoring",
-            config_class="llama_stack.providers.inline.meta_reference.scoring.MetaReferenceScoringConfig",
+            module="llama_stack.providers.inline.scoring.meta_reference",
+            config_class="llama_stack.providers.inline.scoring.meta_reference.MetaReferenceScoringConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
@@ -27,8 +27,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.scoring,
             provider_type="braintrust",
             pip_packages=["autoevals", "openai"],
-            module="llama_stack.providers.inline.braintrust.scoring",
-            config_class="llama_stack.providers.inline.braintrust.scoring.BraintrustScoringConfig",
+            module="llama_stack.providers.inline.scoring.braintrust",
+            config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
diff --git a/llama_stack/providers/tests/datasetio/fixtures.py b/llama_stack/providers/tests/datasetio/fixtures.py
index d810d5e02..6f20bf96a 100644
--- a/llama_stack/providers/tests/datasetio/fixtures.py
+++ b/llama_stack/providers/tests/datasetio/fixtures.py
@@ -19,12 +19,12 @@ def datasetio_remote() -> ProviderFixture:


 @pytest.fixture(scope="session")
-def datasetio_meta_reference() -> ProviderFixture:
+def datasetio_localfs() -> ProviderFixture:
     return ProviderFixture(
         providers=[
             Provider(
-                provider_id="meta-reference",
-                provider_type="meta-reference",
+                provider_id="localfs",
+                provider_type="localfs",
                 config={},
             )
         ],
@@ -44,7 +44,7 @@ def datasetio_huggingface() -> ProviderFixture:
     )


-DATASETIO_FIXTURES = ["meta_reference", "remote", "huggingface"]
+DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"]


 @pytest_asyncio.fixture(scope="session")
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py b/llama_stack/providers/tests/eval/constants.py
similarity index 60%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py
rename to llama_stack/providers/tests/eval/constants.py
index 68d77b8df..0fb1a44c4 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py
+++ b/llama_stack/providers/tests/eval/constants.py
@@ -4,10 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.scoring_functions import * # noqa: F401, F403
-from llama_stack.apis.scoring import * # noqa: F401, F403
-from llama_stack.apis.common.type_system import NumberType
-
 JUDGE_PROMPT = """
 You will be given a question, a expected_answer, and a system_answer. Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
@@ -22,18 +18,3 @@ System Answer: {generated_answer}
 Feedback:::
 Total rating:
 """
-
-llm_as_judge_8b_correctness = ScoringFnDef(
-    identifier="meta-reference::llm_as_judge_8b_correctness",
-    description="Llm As Judge Scoring Function",
-    return_type=NumberType(),
-    params=LLMAsJudgeScoringFnParams(
-        prompt_template=JUDGE_PROMPT,
-        judge_model="Llama3.1-8B-Instruct",
-        judge_score_regexes=[
-            r"Total rating: (\d+)",
-            r"rating: (\d+)",
-            r"Rating: (\d+)",
-        ],
-    ),
-)
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index fdd4dcfbb..9f14c61ef 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -19,9 +19,10 @@ from llama_stack.apis.eval.eval import (
     EvalTaskDefWithProvider,
     ModelCandidate,
 )
+from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
 from llama_stack.distribution.datatypes import Api
 from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
-
+from .constants import JUDGE_PROMPT

 # How to run this test:
 #
@@ -65,7 +66,7 @@ class Testeval:
         assert len(rows.rows) == 3

         scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
+            "meta-reference::llm_as_judge_base",
             "meta-reference::equality",
         ]
         task_id = "meta-reference::app_eval"
@@ -85,11 +86,22 @@ class Testeval:
                     model="Llama3.2-3B-Instruct",
                     sampling_params=SamplingParams(),
                 ),
+                scoring_params={
+                    "meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams(
+                        judge_model="Llama3.1-8B-Instruct",
+                        prompt_template=JUDGE_PROMPT,
+                        judge_score_regexes=[
+                            r"Total rating: (\d+)",
+                            r"rating: (\d+)",
+                            r"Rating: (\d+)",
+                        ],
+                    )
+                },
             ),
         )
         assert len(response.generations) == 3
-        assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
         assert "meta-reference::equality" in response.scores
+        assert "meta-reference::llm_as_judge_base" in response.scores

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack):
@@ -109,7 +121,6 @@ class Testeval:
         )

         scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
             "meta-reference::subset_of",
         ]

@@ -138,7 +149,6 @@ class Testeval:
         assert eval_response is not None
         assert len(eval_response.generations) == 5
         assert "meta-reference::subset_of" in eval_response.scores
-        assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack):
diff --git a/llama_stack/providers/tests/scoring/conftest.py b/llama_stack/providers/tests/scoring/conftest.py
index ee578f9b3..ed56df230 100644
--- a/llama_stack/providers/tests/scoring/conftest.py
+++ b/llama_stack/providers/tests/scoring/conftest.py
@@ -16,7 +16,7 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "datasetio": "localfs",
             "inference": "fireworks",
         },
         id="meta_reference_scoring_fireworks_inference",
@@ -25,12 +25,21 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "datasetio": "localfs",
             "inference": "together",
         },
         id="meta_reference_scoring_together_inference",
         marks=pytest.mark.meta_reference_scoring_together_inference,
     ),
+    pytest.param(
+        {
+            "scoring": "braintrust",
+            "datasetio": "localfs",
+            "inference": "together",
+        },
+        id="braintrust_scoring_together_inference",
+        marks=pytest.mark.braintrust_scoring_together_inference,
+    ),
 ]


@@ -38,6 +47,7 @@ def pytest_configure(config):
     for fixture_name in [
"meta_reference_scoring_fireworks_inference", "meta_reference_scoring_together_inference", + "braintrust_scoring_together_inference", ]: config.addinivalue_line( "markers", diff --git a/llama_stack/providers/tests/scoring/fixtures.py b/llama_stack/providers/tests/scoring/fixtures.py index 925f98779..648d35859 100644 --- a/llama_stack/providers/tests/scoring/fixtures.py +++ b/llama_stack/providers/tests/scoring/fixtures.py @@ -31,7 +31,20 @@ def scoring_meta_reference() -> ProviderFixture: ) -SCORING_FIXTURES = ["meta_reference", "remote"] +@pytest.fixture(scope="session") +def scoring_braintrust() -> ProviderFixture: + return ProviderFixture( + providers=[ + Provider( + provider_id="braintrust", + provider_type="braintrust", + config={}, + ) + ], + ) + + +SCORING_FIXTURES = ["meta_reference", "remote", "braintrust"] @pytest_asyncio.fixture(scope="session") @@ -52,9 +65,4 @@ async def scoring_stack(request): provider_data, ) - return ( - impls[Api.scoring], - impls[Api.scoring_functions], - impls[Api.datasetio], - impls[Api.datasets], - ) + return impls diff --git a/llama_stack/providers/tests/scoring/test_scoring.py b/llama_stack/providers/tests/scoring/test_scoring.py index 3c1b6554f..f3c925048 100644 --- a/llama_stack/providers/tests/scoring/test_scoring.py +++ b/llama_stack/providers/tests/scoring/test_scoring.py @@ -8,7 +8,7 @@ import pytest from llama_stack.apis.scoring_functions import * # noqa: F403 - +from llama_stack.distribution.datatypes import Api from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset # How to run this test: @@ -23,20 +23,36 @@ class TestScoring: async def test_scoring_functions_list(self, scoring_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - _, scoring_functions_impl, _, _ = scoring_stack + scoring_functions_impl = scoring_stack[Api.scoring_functions] response = await scoring_functions_impl.list_scoring_functions() assert isinstance(response, list) assert len(response) > 0 @pytest.mark.asyncio async def test_scoring_score(self, scoring_stack): - scoring_impl, scoring_functions_impl, datasetio_impl, datasets_impl = ( - scoring_stack + ( + scoring_impl, + scoring_functions_impl, + datasetio_impl, + datasets_impl, + models_impl, + ) = ( + scoring_stack[Api.scoring], + scoring_stack[Api.scoring_functions], + scoring_stack[Api.datasetio], + scoring_stack[Api.datasets], + scoring_stack[Api.models], ) await register_dataset(datasets_impl) response = await datasets_impl.list_datasets() assert len(response) == 1 + for model_id in ["Llama3.2-3B-Instruct", "Llama3.1-8B-Instruct"]: + await models_impl.register_model( + model_id=model_id, + provider_id="", + ) + # scoring individual rows rows = await datasetio_impl.get_rows_paginated( dataset_id="test_dataset", @@ -44,10 +60,11 @@ class TestScoring: ) assert len(rows.rows) == 3 + scoring_fns_list = await scoring_functions_impl.list_scoring_functions() scoring_functions = { - "meta-reference::llm_as_judge_8b_correctness": None, - "meta-reference::equality": None, + scoring_fns_list[0].identifier: None, } + response = await scoring_impl.score( input_rows=rows.rows, scoring_functions=scoring_functions, @@ -69,13 +86,34 @@ class TestScoring: @pytest.mark.asyncio async def test_scoring_score_with_params(self, scoring_stack): - scoring_impl, scoring_functions_impl, datasetio_impl, datasets_impl = ( - scoring_stack + ( + scoring_impl, + scoring_functions_impl, + datasetio_impl, + 
datasets_impl, + models_impl, + ) = ( + scoring_stack[Api.scoring], + scoring_stack[Api.scoring_functions], + scoring_stack[Api.datasetio], + scoring_stack[Api.datasets], + scoring_stack[Api.models], ) await register_dataset(datasets_impl) response = await datasets_impl.list_datasets() assert len(response) == 1 + for model_id in ["Llama3.1-405B-Instruct"]: + await models_impl.register_model( + model_id=model_id, + provider_id="", + ) + + scoring_fns_list = await scoring_functions_impl.list_scoring_functions() + provider_id = scoring_fns_list[0].provider_id + if provider_id == "braintrust": + pytest.skip("Braintrust provider does not support scoring with params") + # scoring individual rows rows = await datasetio_impl.get_rows_paginated( dataset_id="test_dataset", @@ -84,7 +122,7 @@ class TestScoring: assert len(rows.rows) == 3 scoring_functions = { - "meta-reference::llm_as_judge_8b_correctness": LLMAsJudgeScoringFnParams( + "meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams( judge_model="Llama3.1-405B-Instruct", prompt_template="Output a number response in the following format: Score: , where is the number between 0 and 9.", judge_score_regexes=[r"Score: (\d+)"], diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py b/llama_stack/providers/utils/scoring/aggregation_utils.py similarity index 92% rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py rename to llama_stack/providers/utils/scoring/aggregation_utils.py index 25bac5edc..1ca0c7fb3 100644 --- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py +++ b/llama_stack/providers/utils/scoring/aggregation_utils.py @@ -3,13 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from pathlib import Path from typing import Any, Dict, List from llama_stack.apis.scoring import ScoringResultRow -FN_DEFS_PATH = Path(__file__).parent / "fn_defs" - def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: num_correct = sum(result["score"] for result in scoring_results)
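
For readers tracing the import changes, the scoring functions now pull aggregate_accuracy and aggregate_average from llama_stack.providers.utils.scoring.aggregation_utils instead of the old scoring_fn.common module. The last hunk above is cut off, so the sketch below is only a hedged reconstruction: the first line of aggregate_accuracy is taken from the hunk, while the return keys, the zero-division guard, and the whole aggregate_average body are assumptions, not the repository's code.

# Hedged sketch of the relocated aggregation helpers (not an excerpt from the patch).
from typing import Any, Dict, List

# Assumption: ScoringResultRow behaves like a dict with a numeric "score" field,
# as implied by the sum(...) expression visible in the hunk above.
ScoringResultRow = Dict[str, Any]


def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # First line mirrors the hunk; everything after it is a plausible completion.
    num_correct = sum(result["score"] for result in scoring_results)
    num_total = len(scoring_results)
    return {
        "accuracy": num_correct / num_total if num_total else 0.0,
        "num_correct": num_correct,
        "num_total": num_total,
    }


def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # Mirrors aggregate_accuracy but reports a mean score instead of a ratio.
    scores = [result["score"] for result in scoring_results]
    return {"average": sum(scores) / len(scores) if scores else 0.0}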
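The other behavioral change worth illustrating is that the hard-coded llm_as_judge_8b_correctness definition is gone: the generic meta-reference::llm_as_judge_base function carries no baked-in judge, so callers supply LLMAsJudgeScoringFnParams per request. The sketch below is a minimal usage example assembled from the test changes above, not an excerpt from the repository; the judge model, prompt, and regexes simply mirror the values the updated tests pass, and score_with_judge is a hypothetical helper name.

# Hedged usage sketch based on test_eval.py / test_scoring.py changes in this patch.
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
from llama_stack.providers.tests.eval.constants import JUDGE_PROMPT


async def score_with_judge(scoring_impl, rows):
    # scoring_impl is the resolved Api.scoring implementation and rows the
    # paginated dataset rows, obtained the same way as in test_scoring.py above.
    params = LLMAsJudgeScoringFnParams(
        judge_model="Llama3.1-8B-Instruct",
        prompt_template=JUDGE_PROMPT,
        judge_score_regexes=[
            r"Total rating: (\d+)",
            r"rating: (\d+)",
            r"Rating: (\d+)",
        ],
    )
    # Parameters travel with the request rather than living in the fn_def.
    return await scoring_impl.score(
        input_rows=rows.rows,
        scoring_functions={"meta-reference::llm_as_judge_base": params},
    )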