From 7b5895003ab2c6feed29e3e960e400b9cc0ab15d Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 14 Oct 2024 21:09:59 -0700
Subject: [PATCH] braintrust scorer

---
 llama_stack/apis/datasets/datasets.py          |  3 +
 .../distribution/registry/scorers/__init__.py  |  4 ++
 .../evals/scorer/braintrust_scorer.py          | 57 +++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py

diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index 2b54ac8f6..ee270b291 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -73,10 +73,13 @@ class ScorerInputSample(DatasetSample):
     A dataset is required to have the following columns to be used for scoring:
     - generated_answer: str
     - expected_answer: Union[str, List[str]]
+    - (optional) input_query: str
+    - (optional) generation_output: PostprocessedGeneration
     """
 
     generated_answer: str
     expected_answer: Union[str, List[str]]
+    input_query: Optional[str] = None
     generation_output: Optional[PostprocessedGeneration] = None
 
 
diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
index dedf32ac3..60e03b2fe 100644
--- a/llama_stack/distribution/registry/scorers/__init__.py
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -6,14 +6,18 @@
 # TODO: make these import config based
 from llama_stack.apis.evals import *  # noqa: F403
 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import *  # noqa: F403
 
 from ..registry import Registry
 
+# TODO: make these import config based
 ScorerRegistry = Registry[BaseScorer]()
 
 SCORER_REGISTRY = {
     "accuracy": AccuracyScorer,
     "random": RandomScorer,
+    "braintrust::factuality": BraintrustFactualityScorer,
+    "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer,
 }
 
 for k, v in SCORER_REGISTRY.items():
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
new file mode 100644
index 000000000..5dd4eb383
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import numpy as np
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403
+from autoevals.llm import *  # noqa: F403
+from autoevals.ragas import *  # noqa: F403
+
+
+class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        input_query = scorer_input_sample.input_query
+        extracted_answer = scorer_input_sample.generated_answer
+        expected_answer = scorer_input_sample.expected_answer
+
+        evaluator = Factuality()
+        result = evaluator(extracted_answer, expected_answer, input=input_query)
+        factuality = result.score
+        return SingleEvalResult(score_data={"factuality": factuality})
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        avg_score = np.average(
+            [result.score_data["factuality"] for result in eval_results]
+        )
+
+        return EvalResult(
+            metrics={
+                "avg_factuality_score": avg_score,
+            }
+        )
+
+
+class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        input_query = scorer_input_sample.input_query
+        extracted_answer = scorer_input_sample.generated_answer
+        expected_answer = scorer_input_sample.expected_answer
+
+        evaluator = AnswerCorrectness()
+        result = evaluator(extracted_answer, expected_answer, input=input_query)
+        correctness = result.score
+        return SingleEvalResult(score_data={"answer_correctness": correctness})
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        avg_score = np.average(
+            [result.score_data["answer_correctness"] for result in eval_results]
+        )
+
+        return EvalResult(
+            metrics={
+                "avg_correctness_score": avg_score,
+            }
+        )
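
Usage sketch (a minimal illustration only; it assumes the `Registry` class exposes a `get` accessor, that the scorer classes take no constructor arguments, and that `ScorerInputSample` can be built directly from the three fields shown -- none of which is confirmed by this diff -- and the autoevals judges require an OpenAI key in the environment):

    import os
    from llama_stack.distribution.registry.scorers import ScorerRegistry
    from llama_stack.apis.datasets.datasets import ScorerInputSample

    # Hypothetical lookup by the registry key added in this patch; the
    # accessor name ("get") is an assumption, not part of the diff.
    scorer = ScorerRegistry.get("braintrust::factuality")()

    sample = ScorerInputSample(
        input_query="What is the capital of France?",
        generated_answer="Paris is the capital of France.",
        expected_answer="Paris",
    )

    # Factuality() calls an LLM judge under the hood, so the key must be set.
    assert os.environ.get("OPENAI_API_KEY")
    result = scorer.score_sample(sample)
    print(result.score_data)  # e.g. {"factuality": 1.0}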