Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-07-29 07:14:20 +00:00
braintrust scorer
Parent: c8f6849291 · Commit: 7b5895003a
3 changed files with 64 additions and 0 deletions
@@ -73,10 +73,13 @@ class ScorerInputSample(DatasetSample):
     A dataset is required to have the following columns to be used for scoring:
     - generated_answer: str
     - expected_answer: Union[str, List[str]]
+    - (optional) input_query: str
+    - (optional) generation_output: PostprocessedGeneration
     """

     generated_answer: str
     expected_answer: Union[str, List[str]]
+    input_query: Optional[str] = None
     generation_output: Optional[PostprocessedGeneration] = None

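With these additions, a scoring row can carry the original query and the postprocessed generation alongside the answers. A minimal sketch of what a sample might look like, assuming ScorerInputSample is available via the star import used by the registry module below and that it accepts keyword arguments like a standard pydantic model; field values are purely illustrative:

```python
# Illustrative only: the import path and constructor behavior are assumptions.
from llama_stack.apis.evals import *  # noqa: F403  (brings in ScorerInputSample)

sample = ScorerInputSample(
    input_query="What is the capital of France?",   # optional, used by LLM-judged scorers
    generated_answer="Paris",
    expected_answer=["Paris", "Paris, France"],
    # generation_output stays None when the raw postprocessed output is not needed
)
```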
@@ -6,14 +6,18 @@
 # TODO: make these import config based
 from llama_stack.apis.evals import *  # noqa: F403
 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import *  # noqa: F403

 from ..registry import Registry

+# TODO: make these import config based
 ScorerRegistry = Registry[BaseScorer]()

 SCORER_REGISTRY = {
     "accuracy": AccuracyScorer,
     "random": RandomScorer,
+    "braintrust::factuality": BraintrustFactualityScorer,
+    "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer,
 }

 for k, v in SCORER_REGISTRY.items():
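The body of the registration loop sits outside this hunk, but SCORER_REGISTRY itself is a plain dict, so the new Braintrust scorers can be resolved by their namespaced keys. A small usage sketch (only the dict lookup is guaranteed by this diff; instantiating with no arguments is an assumption):

```python
# Resolve a scorer class by its registry key and instantiate it.
scorer_cls = SCORER_REGISTRY["braintrust::factuality"]  # -> BraintrustFactualityScorer
scorer = scorer_cls()  # assumes a no-argument constructor
```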
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import numpy as np
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403
+from autoevals.llm import *  # noqa: F403
+from autoevals.ragas import *  # noqa: F403
+
+
+class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        input_query = scorer_input_sample.input_query
+        extracted_answer = scorer_input_sample.generated_answer
+        expected_answer = scorer_input_sample.expected_answer
+
+        evaluator = Factuality()
+        result = evaluator(extracted_answer, expected_answer, input=input_query)
+        factuality = result.score
+        return SingleEvalResult(score_data={"factuality": factuality})
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        avg_score = np.average(
+            [result.score_data["factuality"] for result in eval_results]
+        )
+
+        return EvalResult(
+            metrics={
+                "avg_factuality_score": avg_score,
+            }
+        )
+
+
+class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        input_query = scorer_input_sample.input_query
+        extracted_answer = scorer_input_sample.generated_answer
+        expected_answer = scorer_input_sample.expected_answer
+
+        evaluator = AnswerCorrectness()
+        result = evaluator(extracted_answer, expected_answer, input=input_query)
+        correctness = result.score
+        return SingleEvalResult(score_data={"answer_correctness": correctness})
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        avg_score = np.average(
+            [result.score_data["answer_correctness"] for result in eval_results]
+        )
+
+        return EvalResult(
+            metrics={
+                "avg_correctness_score": avg_score,
+            }
+        )
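Putting the pieces together, a caller would build ScorerInputSample rows, score each one, and then aggregate. A hedged end-to-end sketch; the sample values are illustrative, and autoevals' LLM-backed evaluators are assumed to require an OpenAI-compatible API key in the environment:

```python
# Illustrative usage of the new scorers; not taken from the diff itself.
samples = [
    ScorerInputSample(
        input_query="Who wrote 'Pride and Prejudice'?",
        generated_answer="Jane Austen",
        expected_answer="Jane Austen",
    ),
]

scorer = BraintrustFactualityScorer()
per_sample = [scorer.score_sample(s) for s in samples]
print(scorer.aggregate_results(per_sample).metrics)
# e.g. {"avg_factuality_score": 1.0} if the judge agrees with the expected answer
```

The "braintrust::" prefix on the registry keys keeps third-party scorers namespaced apart from the built-in "accuracy" and "random" entries.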