From b4416b72fd4e7728e53d38069d810a7c6487322c Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 11 Nov 2024 17:35:40 -0500
Subject: [PATCH] Folder restructure for evals/datasets/scoring (#419)

* rename evals related stuff
* fix datasetio
* fix scoring test
* localfs -> LocalFS
* refactor scoring
* refactor scoring
* remove 8b_correctness scoring_fn from tests
* tests w/ eval params
* scoring fn braintrust fixture
* import
---
 .../localfs}/__init__.py                       |  8 +--
 .../datasetio => datasetio/localfs}/config.py  |  2 +-
 .../localfs}/datasetio.py                      |  6 +-
 .../eval => eval/meta_reference}/__init__.py   |  0
 .../eval => eval/meta_reference}/config.py     |  0
 .../eval => eval/meta_reference}/eval.py       |  3 +-
 .../braintrust}/__init__.py                    |  0
 .../braintrust}/braintrust.py                  |  5 +-
 .../scoring => scoring/braintrust}/config.py   |  0
 .../braintrust}/scoring_fn/__init__.py         |  0
 .../scoring_fn/fn_defs/__init__.py             |  0
 .../scoring_fn/fn_defs/answer_correctness.py   |  0
 .../scoring_fn/fn_defs/factuality.py           |  0
 .../meta_reference}/__init__.py                |  0
 .../meta_reference}/config.py                  |  0
 .../meta_reference}/scoring.py                 |  0
 .../meta_reference}/scoring_fn/__init__.py     |  0
 .../scoring_fn/base_scoring_fn.py              |  0
 .../scoring_fn/equality_scoring_fn.py          | 12 +---
 .../scoring_fn/fn_defs/__init__.py             |  0
 .../scoring_fn/fn_defs/equality.py             |  0
 .../scoring_fn/fn_defs/llm_as_judge_base.py    | 15 +++++
 .../regex_parser_multiple_choice_answer.py     |  0
 .../scoring_fn/fn_defs/subset_of.py            |  0
 .../scoring_fn/llm_as_judge_scoring_fn.py      | 16 ++----
 .../scoring_fn/regex_parser_scoring_fn.py      |  2 +-
 .../scoring_fn/subset_of_scoring_fn.py         | 12 +---
 llama_stack/providers/registry/datasetio.py    |  6 +-
 llama_stack/providers/registry/eval.py         |  4 +-
 llama_stack/providers/registry/scoring.py      |  8 +--
 .../providers/tests/datasetio/fixtures.py      |  8 +--
 .../eval/constants.py}                         | 19 -------
 llama_stack/providers/tests/eval/test_eval.py  | 20 +++++--
 .../providers/tests/scoring/conftest.py        | 14 ++++-
 .../providers/tests/scoring/fixtures.py        | 22 +++++---
 .../providers/tests/scoring/test_scoring.py    | 56 ++++++++++++++++---
 .../scoring/aggregation_utils.py}              |  3 -
 37 files changed, 141 insertions(+), 100 deletions(-)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/__init__.py (60%)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/config.py (83%)
 rename llama_stack/providers/inline/{meta_reference/datasetio => datasetio/localfs}/datasetio.py (95%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/config.py (100%)
 rename llama_stack/providers/inline/{meta_reference/eval => eval/meta_reference}/eval.py (99%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/braintrust.py (98%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/config.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/__init__.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/answer_correctness.py (100%)
 rename llama_stack/providers/inline/{braintrust/scoring => scoring/braintrust}/scoring_fn/fn_defs/factuality.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/config.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/base_scoring_fn.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/equality_scoring_fn.py (82%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/__init__.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/equality.py (100%)
 create mode 100644 llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/fn_defs/subset_of.py (100%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/llm_as_judge_scoring_fn.py (86%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/regex_parser_scoring_fn.py (96%)
 rename llama_stack/providers/inline/{meta_reference/scoring => scoring/meta_reference}/scoring_fn/subset_of_scoring_fn.py (80%)
 rename llama_stack/providers/{inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py => tests/eval/constants.py} (60%)
 rename llama_stack/providers/{inline/meta_reference/scoring/scoring_fn/common.py => utils/scoring/aggregation_utils.py} (92%)

diff --git a/llama_stack/providers/inline/meta_reference/datasetio/__init__.py b/llama_stack/providers/inline/datasetio/localfs/__init__.py
similarity index 60%
rename from llama_stack/providers/inline/meta_reference/datasetio/__init__.py
rename to llama_stack/providers/inline/datasetio/localfs/__init__.py
index 9a65f5c3e..db8aa555c 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/__init__.py
+++ b/llama_stack/providers/inline/datasetio/localfs/__init__.py
@@ -4,15 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .config import MetaReferenceDatasetIOConfig
+from .config import LocalFSDatasetIOConfig


 async def get_provider_impl(
-    config: MetaReferenceDatasetIOConfig,
+    config: LocalFSDatasetIOConfig,
     _deps,
 ):
-    from .datasetio import MetaReferenceDatasetIOImpl
+    from .datasetio import LocalFSDatasetIOImpl

-    impl = MetaReferenceDatasetIOImpl(config)
+    impl = LocalFSDatasetIOImpl(config)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/inline/meta_reference/datasetio/config.py b/llama_stack/providers/inline/datasetio/localfs/config.py
similarity index 83%
rename from llama_stack/providers/inline/meta_reference/datasetio/config.py
rename to llama_stack/providers/inline/datasetio/localfs/config.py
index e667e3252..58d563c99 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/config.py
+++ b/llama_stack/providers/inline/datasetio/localfs/config.py
@@ -6,4 +6,4 @@
 from llama_stack.apis.datasetio import * # noqa: F401, F403

-class MetaReferenceDatasetIOConfig(BaseModel): ...
+class LocalFSDatasetIOConfig(BaseModel): ...
diff --git a/llama_stack/providers/inline/meta_reference/datasetio/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
similarity index 95%
rename from llama_stack/providers/inline/meta_reference/datasetio/datasetio.py
rename to llama_stack/providers/inline/datasetio/localfs/datasetio.py
index a6fe4feb3..d8c100684 100644
--- a/llama_stack/providers/inline/meta_reference/datasetio/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -15,7 +15,7 @@ from dataclasses import dataclass
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url

-from .config import MetaReferenceDatasetIOConfig
+from .config import LocalFSDatasetIOConfig


 class BaseDataset(ABC):
@@ -77,8 +77,8 @@ class PandasDataframeDataset(BaseDataset):
         self.df = self._validate_dataset_schema(df)


-class MetaReferenceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    def __init__(self, config: MetaReferenceDatasetIOConfig) -> None:
+class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
+    def __init__(self, config: LocalFSDatasetIOConfig) -> None:
         self.config = config
         # local registry for keeping track of datasets within the provider
         self.dataset_infos = {}
diff --git a/llama_stack/providers/inline/meta_reference/eval/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/eval/__init__.py
rename to llama_stack/providers/inline/eval/meta_reference/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/eval/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/eval/config.py
rename to llama_stack/providers/inline/eval/meta_reference/config.py
diff --git a/llama_stack/providers/inline/meta_reference/eval/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
similarity index 99%
rename from llama_stack/providers/inline/meta_reference/eval/eval.py
rename to llama_stack/providers/inline/eval/meta_reference/eval.py
index 48d8e2b04..df642f33b 100644
--- a/llama_stack/providers/inline/meta_reference/eval/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -9,14 +9,13 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
 from .....apis.common.job_types import Job
 from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
 from llama_stack.apis.common.type_system import * # noqa: F403
-from tqdm import tqdm
-
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval_tasks import EvalTaskDef
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.scoring import Scoring
 from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from tqdm import tqdm

 from .config import MetaReferenceEvalConfig
diff --git a/llama_stack/providers/inline/braintrust/scoring/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
similarity index 98%
rename from llama_stack/providers/inline/braintrust/scoring/braintrust.py
rename to llama_stack/providers/inline/scoring/braintrust/braintrust.py
index 6488a63eb..57723bb47 100644
--- a/llama_stack/providers/inline/braintrust/scoring/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -16,9 +16,8 @@ from llama_stack.apis.datasets import * # noqa: F403
 from autoevals.llm import Factuality
 from autoevals.ragas import AnswerCorrectness
 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)
+
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average

 from .config import BraintrustScoringConfig
 from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
diff --git a/llama_stack/providers/inline/braintrust/scoring/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/config.py
rename to llama_stack/providers/inline/scoring/braintrust/config.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/__init__.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/__init__.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/__init__.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/answer_correctness.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/answer_correctness.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
diff --git a/llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/factuality.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
similarity index 100%
rename from llama_stack/providers/inline/braintrust/scoring/scoring_fn/fn_defs/factuality.py
rename to llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/config.py b/llama_stack/providers/inline/scoring/meta_reference/config.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/config.py
rename to llama_stack/providers/inline/scoring/meta_reference/config.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring.py b/llama_stack/providers/inline/scoring/meta_reference/scoring.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/base_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/base_scoring_fn.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/base_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/base_scoring_fn.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
similarity index 82%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
index 07405d56c..877b64e4e 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/equality_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/equality_scoring_fn.py
@@ -4,20 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.equality import (
-    equality,
-)
+from .fn_defs.equality import equality


 class EqualityScoringFn(BaseScoringFn):
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/__init__.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/__init__.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/__init__.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/__init__.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/equality.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/equality.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/equality.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/equality.py
diff --git a/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
new file mode 100644
index 000000000..69d96e1bf
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/llm_as_judge_base.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import ScoringFnDef
+
+
+llm_as_judge_base = ScoringFnDef(
+    identifier="meta-reference::llm_as_judge_base",
+    description="Llm As Judge Scoring Function",
+    return_type=NumberType(),
+)
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/subset_of.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/subset_of.py
similarity index 100%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/subset_of.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/fn_defs/subset_of.py
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
similarity index 86%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
index f98f7fb5e..e1f19e640 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/llm_as_judge_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/llm_as_judge_scoring_fn.py
@@ -4,20 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack.apis.inference.inference import Inference
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
 import re
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.llm_as_judge_8b_correctness import (
-    llm_as_judge_8b_correctness,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average
+
+from .fn_defs.llm_as_judge_base import llm_as_judge_base


 class LlmAsJudgeScoringFn(BaseScoringFn):
@@ -29,7 +25,7 @@ class LlmAsJudgeScoringFn(BaseScoringFn):
         super().__init__(*arg, **kwargs)
         self.inference_api = inference_api
         self.supported_fn_defs_registry = {
-            llm_as_judge_8b_correctness.identifier: llm_as_judge_8b_correctness,
+            llm_as_judge_base.identifier: llm_as_judge_base,
         }

     async def score_row(
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
similarity index 96%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
index 0aff2f535..3cbc6cbe4 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/regex_parser_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/regex_parser_scoring_fn.py
@@ -9,7 +9,7 @@ from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from .common import aggregate_accuracy
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

 from .fn_defs.regex_parser_multiple_choice_answer import (
     regex_parser_multiple_choice_answer,
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
similarity index 80%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py
rename to llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
index 289c63dd7..fe5988160 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/meta_reference/scoring_fn/subset_of_scoring_fn.py
@@ -4,19 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.subset_of import (
-    subset_of,
-)
+from .fn_defs.subset_of import subset_of


 class SubsetOfScoringFn(BaseScoringFn):
diff --git a/llama_stack/providers/registry/datasetio.py b/llama_stack/providers/registry/datasetio.py
index 3fdeac997..2d1c722f0 100644
--- a/llama_stack/providers/registry/datasetio.py
+++ b/llama_stack/providers/registry/datasetio.py
@@ -13,10 +13,10 @@ def available_providers() -> List[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.datasetio,
-            provider_type="meta-reference",
+            provider_type="localfs",
             pip_packages=["pandas"],
-            module="llama_stack.providers.inline.meta_reference.datasetio",
-            config_class="llama_stack.providers.inline.meta_reference.datasetio.MetaReferenceDatasetIOConfig",
+            module="llama_stack.providers.inline.datasetio.localfs",
+            config_class="llama_stack.providers.inline.datasetio.localfs.LocalFSDatasetIOConfig",
             api_dependencies=[],
         ),
         remote_provider_spec(
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
index 9b9ba6409..275cc92db 100644
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.eval,
             provider_type="meta-reference",
             pip_packages=[],
-            module="llama_stack.providers.inline.meta_reference.eval",
-            config_class="llama_stack.providers.inline.meta_reference.eval.MetaReferenceEvalConfig",
+            module="llama_stack.providers.inline.eval.meta_reference",
+            config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py
index 2586083f6..70f43ad73 100644
--- a/llama_stack/providers/registry/scoring.py
+++ b/llama_stack/providers/registry/scoring.py
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.scoring,
             provider_type="meta-reference",
             pip_packages=[],
-            module="llama_stack.providers.inline.meta_reference.scoring",
-            config_class="llama_stack.providers.inline.meta_reference.scoring.MetaReferenceScoringConfig",
+            module="llama_stack.providers.inline.scoring.meta_reference",
+            config_class="llama_stack.providers.inline.scoring.meta_reference.MetaReferenceScoringConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
@@ -27,8 +27,8 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.scoring,
             provider_type="braintrust",
             pip_packages=["autoevals", "openai"],
-            module="llama_stack.providers.inline.braintrust.scoring",
-            config_class="llama_stack.providers.inline.braintrust.scoring.BraintrustScoringConfig",
+            module="llama_stack.providers.inline.scoring.braintrust",
+            config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
             api_dependencies=[
                 Api.datasetio,
                 Api.datasets,
diff --git a/llama_stack/providers/tests/datasetio/fixtures.py b/llama_stack/providers/tests/datasetio/fixtures.py
index d810d5e02..6f20bf96a 100644
--- a/llama_stack/providers/tests/datasetio/fixtures.py
+++ b/llama_stack/providers/tests/datasetio/fixtures.py
@@ -19,12 +19,12 @@ def datasetio_remote() -> ProviderFixture:


 @pytest.fixture(scope="session")
-def datasetio_meta_reference() -> ProviderFixture:
+def datasetio_localfs() -> ProviderFixture:
     return ProviderFixture(
         providers=[
             Provider(
-                provider_id="meta-reference",
-                provider_type="meta-reference",
+                provider_id="localfs",
+                provider_type="localfs",
                 config={},
             )
         ],
@@ -44,7 +44,7 @@ def datasetio_huggingface() -> ProviderFixture:
     )


-DATASETIO_FIXTURES = ["meta_reference", "remote", "huggingface"]
+DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"]


 @pytest_asyncio.fixture(scope="session")
diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py b/llama_stack/providers/tests/eval/constants.py
similarity index 60%
rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py
rename to llama_stack/providers/tests/eval/constants.py
index 68d77b8df..0fb1a44c4 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/fn_defs/llm_as_judge_8b_correctness.py
+++ b/llama_stack/providers/tests/eval/constants.py
@@ -4,10 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.scoring_functions import * # noqa: F401, F403
-from llama_stack.apis.scoring import * # noqa: F401, F403
-from llama_stack.apis.common.type_system import NumberType
-
 JUDGE_PROMPT = """
 You will be given a question, a expected_answer, and a system_answer. Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
@@ -22,18 +18,3 @@ System Answer: {generated_answer}
 Feedback:::
 Total rating:
 """
-
-llm_as_judge_8b_correctness = ScoringFnDef(
-    identifier="meta-reference::llm_as_judge_8b_correctness",
-    description="Llm As Judge Scoring Function",
-    return_type=NumberType(),
-    params=LLMAsJudgeScoringFnParams(
-        prompt_template=JUDGE_PROMPT,
-        judge_model="Llama3.1-8B-Instruct",
-        judge_score_regexes=[
-            r"Total rating: (\d+)",
-            r"rating: (\d+)",
-            r"Rating: (\d+)",
-        ],
-    ),
-)
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index fdd4dcfbb..9f14c61ef 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -19,9 +19,10 @@ from llama_stack.apis.eval.eval import (
     EvalTaskDefWithProvider,
     ModelCandidate,
 )
+from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
 from llama_stack.distribution.datatypes import Api
 from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
-
+from .constants import JUDGE_PROMPT

 # How to run this test:
 #
@@ -65,7 +66,7 @@ class Testeval:
         assert len(rows.rows) == 3

         scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
+            "meta-reference::llm_as_judge_base",
             "meta-reference::equality",
         ]
         task_id = "meta-reference::app_eval"
@@ -85,11 +86,22 @@ class Testeval:
                     model="Llama3.2-3B-Instruct",
                     sampling_params=SamplingParams(),
                 ),
+                scoring_params={
+                    "meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams(
+                        judge_model="Llama3.1-8B-Instruct",
+                        prompt_template=JUDGE_PROMPT,
+                        judge_score_regexes=[
+                            r"Total rating: (\d+)",
+                            r"rating: (\d+)",
+                            r"Rating: (\d+)",
+                        ],
+                    )
+                },
             ),
         )
         assert len(response.generations) == 3
-        assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
         assert "meta-reference::equality" in response.scores
+        assert "meta-reference::llm_as_judge_base" in response.scores

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack):
@@ -109,7 +121,6 @@ class Testeval:
         )

         scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
             "meta-reference::subset_of",
         ]

@@ -138,7 +149,6 @@ class Testeval:
         assert eval_response is not None
         assert len(eval_response.generations) == 5
         assert "meta-reference::subset_of" in eval_response.scores
-        assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack):
diff --git a/llama_stack/providers/tests/scoring/conftest.py b/llama_stack/providers/tests/scoring/conftest.py
index ee578f9b3..ed56df230 100644
--- a/llama_stack/providers/tests/scoring/conftest.py
+++ b/llama_stack/providers/tests/scoring/conftest.py
@@ -16,7 +16,7 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "datasetio": "localfs",
             "inference": "fireworks",
         },
         id="meta_reference_scoring_fireworks_inference",
@@ -25,12 +25,21 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "datasetio": "localfs",
             "inference": "together",
         },
         id="meta_reference_scoring_together_inference",
         marks=pytest.mark.meta_reference_scoring_together_inference,
     ),
+    pytest.param(
+        {
+            "scoring": "braintrust",
+            "datasetio": "localfs",
+            "inference": "together",
+        },
+        id="braintrust_scoring_together_inference",
+        marks=pytest.mark.braintrust_scoring_together_inference,
+    ),
 ]


@@ -38,6 +47,7 @@ def pytest_configure(config):
     for fixture_name in [
"meta_reference_scoring_fireworks_inference", "meta_reference_scoring_together_inference", + "braintrust_scoring_together_inference", ]: config.addinivalue_line( "markers", diff --git a/llama_stack/providers/tests/scoring/fixtures.py b/llama_stack/providers/tests/scoring/fixtures.py index 925f98779..648d35859 100644 --- a/llama_stack/providers/tests/scoring/fixtures.py +++ b/llama_stack/providers/tests/scoring/fixtures.py @@ -31,7 +31,20 @@ def scoring_meta_reference() -> ProviderFixture: ) -SCORING_FIXTURES = ["meta_reference", "remote"] +@pytest.fixture(scope="session") +def scoring_braintrust() -> ProviderFixture: + return ProviderFixture( + providers=[ + Provider( + provider_id="braintrust", + provider_type="braintrust", + config={}, + ) + ], + ) + + +SCORING_FIXTURES = ["meta_reference", "remote", "braintrust"] @pytest_asyncio.fixture(scope="session") @@ -52,9 +65,4 @@ async def scoring_stack(request): provider_data, ) - return ( - impls[Api.scoring], - impls[Api.scoring_functions], - impls[Api.datasetio], - impls[Api.datasets], - ) + return impls diff --git a/llama_stack/providers/tests/scoring/test_scoring.py b/llama_stack/providers/tests/scoring/test_scoring.py index 3c1b6554f..f3c925048 100644 --- a/llama_stack/providers/tests/scoring/test_scoring.py +++ b/llama_stack/providers/tests/scoring/test_scoring.py @@ -8,7 +8,7 @@ import pytest from llama_stack.apis.scoring_functions import * # noqa: F403 - +from llama_stack.distribution.datatypes import Api from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset # How to run this test: @@ -23,20 +23,36 @@ class TestScoring: async def test_scoring_functions_list(self, scoring_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - _, scoring_functions_impl, _, _ = scoring_stack + scoring_functions_impl = scoring_stack[Api.scoring_functions] response = await scoring_functions_impl.list_scoring_functions() assert isinstance(response, list) assert len(response) > 0 @pytest.mark.asyncio async def test_scoring_score(self, scoring_stack): - scoring_impl, scoring_functions_impl, datasetio_impl, datasets_impl = ( - scoring_stack + ( + scoring_impl, + scoring_functions_impl, + datasetio_impl, + datasets_impl, + models_impl, + ) = ( + scoring_stack[Api.scoring], + scoring_stack[Api.scoring_functions], + scoring_stack[Api.datasetio], + scoring_stack[Api.datasets], + scoring_stack[Api.models], ) await register_dataset(datasets_impl) response = await datasets_impl.list_datasets() assert len(response) == 1 + for model_id in ["Llama3.2-3B-Instruct", "Llama3.1-8B-Instruct"]: + await models_impl.register_model( + model_id=model_id, + provider_id="", + ) + # scoring individual rows rows = await datasetio_impl.get_rows_paginated( dataset_id="test_dataset", @@ -44,10 +60,11 @@ class TestScoring: ) assert len(rows.rows) == 3 + scoring_fns_list = await scoring_functions_impl.list_scoring_functions() scoring_functions = { - "meta-reference::llm_as_judge_8b_correctness": None, - "meta-reference::equality": None, + scoring_fns_list[0].identifier: None, } + response = await scoring_impl.score( input_rows=rows.rows, scoring_functions=scoring_functions, @@ -69,13 +86,34 @@ class TestScoring: @pytest.mark.asyncio async def test_scoring_score_with_params(self, scoring_stack): - scoring_impl, scoring_functions_impl, datasetio_impl, datasets_impl = ( - scoring_stack + ( + scoring_impl, + scoring_functions_impl, + datasetio_impl, + 
datasets_impl, + models_impl, + ) = ( + scoring_stack[Api.scoring], + scoring_stack[Api.scoring_functions], + scoring_stack[Api.datasetio], + scoring_stack[Api.datasets], + scoring_stack[Api.models], ) await register_dataset(datasets_impl) response = await datasets_impl.list_datasets() assert len(response) == 1 + for model_id in ["Llama3.1-405B-Instruct"]: + await models_impl.register_model( + model_id=model_id, + provider_id="", + ) + + scoring_fns_list = await scoring_functions_impl.list_scoring_functions() + provider_id = scoring_fns_list[0].provider_id + if provider_id == "braintrust": + pytest.skip("Braintrust provider does not support scoring with params") + # scoring individual rows rows = await datasetio_impl.get_rows_paginated( dataset_id="test_dataset", @@ -84,7 +122,7 @@ class TestScoring: assert len(rows.rows) == 3 scoring_functions = { - "meta-reference::llm_as_judge_8b_correctness": LLMAsJudgeScoringFnParams( + "meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams( judge_model="Llama3.1-405B-Instruct", prompt_template="Output a number response in the following format: Score: , where is the number between 0 and 9.", judge_score_regexes=[r"Score: (\d+)"], diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py b/llama_stack/providers/utils/scoring/aggregation_utils.py similarity index 92% rename from llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py rename to llama_stack/providers/utils/scoring/aggregation_utils.py index 25bac5edc..1ca0c7fb3 100644 --- a/llama_stack/providers/inline/meta_reference/scoring/scoring_fn/common.py +++ b/llama_stack/providers/utils/scoring/aggregation_utils.py @@ -3,13 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from pathlib import Path from typing import Any, Dict, List from llama_stack.apis.scoring import ScoringResultRow -FN_DEFS_PATH = Path(__file__).parent / "fn_defs" - def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: num_correct = sum(result["score"] for result in scoring_results)
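
For readers tracing the import changes, the scoring functions now pull aggregate_accuracy and aggregate_average from llama_stack.providers.utils.scoring.aggregation_utils instead of the old scoring_fn.common module. The last hunk above is cut off, so the sketch below is only a hedged reconstruction: the first line of aggregate_accuracy is taken from the hunk, while the return keys, the zero-division guard, and the whole aggregate_average body are assumptions, not the repository's code.

# Hedged sketch of the relocated aggregation helpers (not an excerpt from the patch).
from typing import Any, Dict, List

# Assumption: ScoringResultRow behaves like a dict with a numeric "score" field,
# as implied by the sum(...) expression visible in the hunk above.
ScoringResultRow = Dict[str, Any]


def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # First line mirrors the hunk; everything after it is a plausible completion.
    num_correct = sum(result["score"] for result in scoring_results)
    num_total = len(scoring_results)
    return {
        "accuracy": num_correct / num_total if num_total else 0.0,
        "num_correct": num_correct,
        "num_total": num_total,
    }


def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    # Mirrors aggregate_accuracy but reports a mean score instead of a ratio.
    scores = [result["score"] for result in scoring_results]
    return {"average": sum(scores) / len(scores) if scores else 0.0}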
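The other behavioral change worth illustrating is that the hard-coded llm_as_judge_8b_correctness definition is gone: the generic meta-reference::llm_as_judge_base function carries no baked-in judge, so callers supply LLMAsJudgeScoringFnParams per request. The sketch below is a minimal usage example assembled from the test changes above, not an excerpt from the repository; the judge model, prompt, and regexes simply mirror the values the updated tests pass, and score_with_judge is a hypothetical helper name.

# Hedged usage sketch based on test_eval.py / test_scoring.py changes in this patch.
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
from llama_stack.providers.tests.eval.constants import JUDGE_PROMPT


async def score_with_judge(scoring_impl, rows):
    # scoring_impl is the resolved Api.scoring implementation and rows the
    # paginated dataset rows, obtained the same way as in test_scoring.py above.
    params = LLMAsJudgeScoringFnParams(
        judge_model="Llama3.1-8B-Instruct",
        prompt_template=JUDGE_PROMPT,
        judge_score_regexes=[
            r"Total rating: (\d+)",
            r"rating: (\d+)",
            r"Rating: (\d+)",
        ],
    )
    # Parameters travel with the request rather than living in the fn_def.
    return await scoring_impl.score(
        input_rows=rows.rows,
        scoring_functions={"meta-reference::llm_as_judge_base": params},
    )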