mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-07-31 16:01:46 +00:00

rename evals related stuff

parent 2b7d70ba86
commit acd055d763
31 changed files with 21 additions and 41 deletions
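The rename flips the inline provider layout from implementation-first to API-first: code under llama_stack.providers.inline.meta_reference.{datasetio,eval,scoring} and llama_stack.providers.inline.braintrust.scoring moves to llama_stack.providers.inline.{datasetio,eval,scoring}.meta_reference and llama_stack.providers.inline.scoring.braintrust. A minimal sketch of what that means for imports, using the config class named in the eval registry hunk below (whether the class is importable directly from the package root is an assumption here):

# Old module path, before this commit:
#   from llama_stack.providers.inline.meta_reference.eval import MetaReferenceEvalConfig
# New module path, after this commit:
from llama_stack.providers.inline.eval.meta_reference import MetaReferenceEvalConfig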
@@ -9,14 +9,13 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
from .....apis.common.job_types import Job
from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
from llama_stack.apis.common.type_system import * # noqa: F403
from tqdm import tqdm

from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval_tasks import EvalTaskDef
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring
from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
from tqdm import tqdm

from .config import MetaReferenceEvalConfig
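Together these imports give the meta-reference eval provider its API surface (Eval) and its dependencies (DatasetIO, Datasets, Scoring, Inference). A rough sketch of how they are typically wired, using the imports above; the class name, constructor shape, and attribute names are illustrative assumptions rather than the file's actual code:

# Hypothetical wiring of the imports above; not the repo's actual class body.
class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
    def __init__(
        self,
        config: MetaReferenceEvalConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
    ) -> None:
        # Hold on to the dependent APIs: rows come from datasetio, candidate
        # answers from inference, and grades from scoring.
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api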
@@ -16,9 +16,8 @@ from llama_stack.apis.datasets import * # noqa: F403
from autoevals.llm import Factuality
from autoevals.ragas import AnswerCorrectness
from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)

+from ..meta_reference.scoring.scoring_fn.common import aggregate_average

from .config import BraintrustScoringConfig
from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
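For context on the Braintrust provider, a minimal sketch of how the autoevals scorers imported above are usually invoked; the example strings are hypothetical and the call follows autoevals' documented pattern (an OpenAI API key is required at runtime):

from autoevals.llm import Factuality

# Factuality is an LLM-judged scorer: it grades a generated answer against a
# reference answer in the context of the original question.
evaluator = Factuality()
result = evaluator(
    output="People's Republic of China",                 # generated answer (hypothetical)
    expected="China",                                    # reference answer (hypothetical)
    input="Which country has the highest population?",   # original question (hypothetical)
)
print(result.score)  # float in [0, 1]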
@@ -4,20 +4,13 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.equality import (
-    equality,
-)
+from .common import aggregate_accuracy
+from .fn_defs.equality import equality


class EqualityScoringFn(BaseScoringFn):
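As a reference point for what this file implements, a rough sketch of an exact-match scorer and the accuracy aggregation named in the imports above; the row keys and function shapes are assumptions for illustration, not the repo's code:

from typing import Any, Dict, List


def equality_score(input_row: Dict[str, Any]) -> Dict[str, Any]:
    # 1.0 if the generated answer matches the expected answer exactly, else 0.0.
    expected = str(input_row.get("expected_answer", "")).strip()
    generated = str(input_row.get("generated_answer", "")).strip()
    return {"score": 1.0 if expected == generated else 0.0}


def aggregate_accuracy(score_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Plain accuracy: the mean of the per-row 0/1 scores.
    scores = [row["score"] for row in score_rows]
    return {"accuracy": sum(scores) / len(scores), "num_correct": sum(scores), "num_total": len(scores)}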
@@ -4,20 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference.inference import Inference
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)

+from .base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403
import re

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_average,
-)
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.llm_as_judge_8b_correctness import (
-    llm_as_judge_8b_correctness,
-)
+from .common import aggregate_average
+from .fn_defs.llm_as_judge_8b_correctness import llm_as_judge_8b_correctness


class LlmAsJudgeScoringFn(BaseScoringFn):
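The import of re alongside llm_as_judge_8b_correctness hints at the usual pattern here: prompt a judge model, then parse a numeric rating out of its free-text reply. A rough sketch of that pattern; the prompt, regex, and judge interface are illustrative assumptions, not the repo's implementation:

import re
from typing import Callable

JUDGE_PROMPT = (
    "Rate the correctness of the answer on a scale of 1 to 5.\n"
    "Question: {question}\nAnswer: {answer}\n"
    "Reply with 'Score: <number>' followed by a short justification."
)


def judge_score(question: str, answer: str, judge: Callable[[str], str]) -> float:
    # `judge` wraps a call to the judge model (e.g. an 8B instruct model via the
    # Inference API imported above).
    reply = judge(JUDGE_PROMPT.format(question=question, answer=answer))
    match = re.search(r"Score:\s*(\d+)", reply)
    # Normalize the 1-5 rating to [0, 1]; treat an unparsable reply as 0.
    return (int(match.group(1)) - 1) / 4 if match else 0.0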
@@ -4,19 +4,13 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
-    BaseScoringFn,
-)
+from .base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403
-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
-    aggregate_accuracy,
-)
+from .common import aggregate_accuracy

-from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.subset_of import (
-    subset_of,
-)
+from .fn_defs.subset_of import subset_of


class SubsetOfScoringFn(BaseScoringFn):
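Same pattern as the equality scorer, but with containment instead of exact match. A minimal sketch of what a subset-of scorer checks; key names are illustrative assumptions:

from typing import Any, Dict


def subset_of_score(input_row: Dict[str, Any]) -> Dict[str, Any]:
    # 1.0 if the expected answer appears inside the generated answer, else 0.0;
    # per-row scores are then reduced with aggregate_accuracy as above.
    expected = str(input_row.get("expected_answer", "")).strip().lower()
    generated = str(input_row.get("generated_answer", "")).strip().lower()
    return {"score": 1.0 if expected and expected in generated else 0.0}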
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
api=Api.datasetio,
provider_type="meta-reference",
pip_packages=["pandas"],
-module="llama_stack.providers.inline.meta_reference.datasetio",
-config_class="llama_stack.providers.inline.meta_reference.datasetio.MetaReferenceDatasetIOConfig",
+module="llama_stack.providers.inline.datasetio.meta_reference",
+config_class="llama_stack.providers.inline.datasetio.meta_reference.MetaReferenceDatasetIOConfig",
api_dependencies=[],
),
remote_provider_spec(
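Pulling the datasetio entry together, the updated registry entry would read roughly as below after the rename; the field values are taken from the hunk, while the InlineProviderSpec wrapper and its import are assumptions about the surrounding registry code, which this hunk does not show. The eval, scoring, and braintrust hunks that follow apply the same api-first renaming:

# Assumed surrounding structure; only the keyword arguments are shown verbatim above.
from llama_stack.providers.datatypes import Api, InlineProviderSpec

datasetio_spec = InlineProviderSpec(
    api=Api.datasetio,
    provider_type="meta-reference",
    pip_packages=["pandas"],
    module="llama_stack.providers.inline.datasetio.meta_reference",
    config_class="llama_stack.providers.inline.datasetio.meta_reference.MetaReferenceDatasetIOConfig",
    api_dependencies=[],
)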
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
api=Api.eval,
provider_type="meta-reference",
pip_packages=[],
-module="llama_stack.providers.inline.meta_reference.eval",
-config_class="llama_stack.providers.inline.meta_reference.eval.MetaReferenceEvalConfig",
+module="llama_stack.providers.inline.eval.meta_reference",
+config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
api_dependencies=[
Api.datasetio,
Api.datasets,
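The api_dependencies entries declare which other APIs the stack must resolve and hand to this provider when it is constructed. A rough sketch of the receiving side; the deps mapping and the helper name are assumptions for illustration:

from typing import Any, Dict

# Hypothetical receiving side of api_dependencies: the stack resolves each listed
# Api into a live implementation and passes them keyed by Api.
def wire_eval_dependencies(deps: Dict[Any, Any]) -> Dict[str, Any]:
    return {
        "datasetio_api": deps[Api.datasetio],  # listed in api_dependencies above
        "datasets_api": deps[Api.datasets],
    }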
@@ -15,8 +15,8 @@ def available_providers() -> List[ProviderSpec]:
api=Api.scoring,
provider_type="meta-reference",
pip_packages=[],
-module="llama_stack.providers.inline.meta_reference.scoring",
-config_class="llama_stack.providers.inline.meta_reference.scoring.MetaReferenceScoringConfig",
+module="llama_stack.providers.inline.scoring.meta_reference",
+config_class="llama_stack.providers.inline.scoring.meta_reference.MetaReferenceScoringConfig",
api_dependencies=[
Api.datasetio,
Api.datasets,
@@ -27,8 +27,8 @@ def available_providers() -> List[ProviderSpec]:
api=Api.scoring,
provider_type="braintrust",
pip_packages=["autoevals", "openai"],
-module="llama_stack.providers.inline.braintrust.scoring",
-config_class="llama_stack.providers.inline.braintrust.scoring.BraintrustScoringConfig",
+module="llama_stack.providers.inline.scoring.braintrust",
+config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
api_dependencies=[
Api.datasetio,
Api.datasets,
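Because only the module and config_class strings change, downstream resolution keeps working the same way: the registry string is imported dynamically and the provider is built from it. A minimal sketch of that resolution step; the get_provider_impl entry-point name is an assumption, not something shown in this diff:

import importlib
from typing import Any, Dict


def resolve_provider(module_path: str, config: Any, deps: Dict[Any, Any]) -> Any:
    # e.g. module_path = "llama_stack.providers.inline.eval.meta_reference"
    provider_module = importlib.import_module(module_path)
    # Hypothetical entry-point convention; the real hook may differ.
    return provider_module.get_provider_impl(config, deps)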