From 3c29108b6ed107b41e1c887f9276ebad95f267be Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 14 Oct 2024 21:17:16 -0700
Subject: [PATCH] input query optional input for braintrust scorer

---
 llama_stack/apis/evals/client.py                          | 2 +-
 llama_stack/distribution/registry/scorers/__init__.py     | 2 +-
 .../impls/meta_reference/evals/tasks/run_scoring_task.py  | 5 ++++-
 llama_stack/providers/registry/evals.py                   | 2 ++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index 1db7afac1..b79547713 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -118,7 +118,7 @@ async def run_main(host: str, port: int):
     response = await client.run_scorer(
         dataset_config=EvaluateDatasetConfig(
             dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-            # row_limit=10,
+            row_limit=10,
         ),
         eval_scoring_config=EvaluateScoringConfig(
             scorer_config_list=[
diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
index 60e03b2fe..7cbe2a426 100644
--- a/llama_stack/distribution/registry/scorers/__init__.py
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -16,7 +16,7 @@ ScorerRegistry = Registry[BaseScorer]()
 SCORER_REGISTRY = {
     "accuracy": AccuracyScorer,
     "random": RandomScorer,
-    "braintrust::factuality": BrainTrustFactualityScorer,
+    "braintrust::factuality": BraintrustFactualityScorer,
     "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer,
 }
 
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
index f856debe9..9e4821a73 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
@@ -33,11 +33,15 @@ class RunScoringTask(BaseTask):
         for x in dataset:
             expected_answer = x.data["expected_answer"]
             generated_answer = x.data["generated_answer"]
+            input_query = None
+            if "input_query" in x.data:
+                input_query = x.data["input_query"]
 
             scorer_inputs.append(
                 ScorerInputSample(
                     expected_answer=expected_answer,
                     generated_answer=generated_answer,
+                    input_query=input_query,
                 )
             )
 
@@ -74,7 +78,6 @@ class RunScoringTask(BaseTask):
         )
 
         scorer_results = scorer.score(postprocessed)
-        cprint(scorer_results, "magenta")
         eval_result = scorer.aggregate_results(scorer_results)
 
         return eval_result
diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py
index 8693ec603..6ea4c16f5 100644
--- a/llama_stack/providers/registry/evals.py
+++ b/llama_stack/providers/registry/evals.py
@@ -20,6 +20,8 @@ def available_providers() -> List[ProviderSpec]:
                 "pandas",
                 "scikit-learn",
                 "datasets",
+                "numpy",
+                "autoevals",
             ],
            module="llama_stack.providers.impls.meta_reference.evals",
            config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig",
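
Reviewer note, not part of the patch: a minimal sketch of how the now-optional input_query could flow into an autoevals-backed factuality check, which is why "autoevals" is added to the provider's pip_packages. The dataset row layout and the Factuality keyword arguments (input/output/expected) are assumptions for illustration; llama_stack's Braintrust scorer classes are not reproduced here.

    # Sketch only: needs `pip install autoevals` and an OpenAI API key to run.
    from autoevals.llm import Factuality

    row = {
        "expected_answer": "Paris is the capital of France.",
        "generated_answer": "Paris",
        # "input_query" may be missing for some eval datasets.
        "input_query": "What is the capital of France?",
    }

    # Mirrors the run_scoring_task change: fall back to None when the key is absent.
    input_query = row.get("input_query")

    evaluator = Factuality()
    result = evaluator(
        output=row["generated_answer"],
        expected=row["expected_answer"],
        input=input_query,
    )
    print(result.score)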
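
Also not part of the patch: the registry hunk fixes a class-name typo. Because SCORER_REGISTRY maps scorer-name strings directly to classes, the misspelled BrainTrustFactualityScorer reference would fail at import time, before any lookup happened. A rough stand-in of that pattern, with simplified class bodies that are assumptions rather than the repo's implementation:

    # Name-to-class scorer registry pattern, simplified.
    class BaseScorer:
        def score(self, scorer_inputs):
            raise NotImplementedError

    class BraintrustFactualityScorer(BaseScorer):
        def score(self, scorer_inputs):
            return [{"score": 1.0} for _ in scorer_inputs]  # placeholder result

    SCORER_REGISTRY = {
        "braintrust::factuality": BraintrustFactualityScorer,
    }

    def get_scorer(name: str) -> BaseScorer:
        # A misspelled class name in the dict above would have raised NameError
        # when this module was imported.
        return SCORER_REGISTRY[name]()

    scorer = get_scorer("braintrust::factuality")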