diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
index 0e08cad0d..7b2b00e66 100644
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -19,6 +19,10 @@ from autoevals.ragas import (
     AnswerCorrectness,
     AnswerRelevancy,
     AnswerSimilarity,
+    ContextEntityRecall,
+    ContextPrecision,
+    ContextRecall,
+    ContextRelevancy,
     Faithfulness,
 )
 from pydantic import BaseModel
@@ -34,6 +38,10 @@ from .config import BraintrustScoringConfig
 from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
 from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def
 from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def
+from .scoring_fn.fn_defs.context_entity_recall import context_entity_recall_fn_def
+from .scoring_fn.fn_defs.context_precision import context_precision_fn_def
+from .scoring_fn.fn_defs.context_recall import context_recall_fn_def
+from .scoring_fn.fn_defs.context_relevancy import context_relevancy_fn_def
 from .scoring_fn.fn_defs.factuality import factuality_fn_def
 from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def

@@ -70,6 +78,26 @@ SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY = [
         evaluator=Faithfulness(),
         fn_def=faithfulness_fn_def,
     ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-entity-recall",
+        evaluator=ContextEntityRecall(),
+        fn_def=context_entity_recall_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-precision",
+        evaluator=ContextPrecision(),
+        fn_def=context_precision_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-recall",
+        evaluator=ContextRecall(),
+        fn_def=context_recall_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-relevancy",
+        evaluator=ContextRelevancy(),
+        fn_def=context_relevancy_fn_def,
+    ),
 ]
diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
new file mode 100644
index 000000000..ac41df000
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_entity_recall_fn_def = ScoringFn(
+    identifier="braintrust::context-entity-recall",
+    description=(
+        "Evaluates how well the context captures the named entities present in the "
+        "reference answer. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-entity-recall",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
new file mode 100644
index 000000000..ef172d82c
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_precision_fn_def = ScoringFn(
+    identifier="braintrust::context-precision",
+    description=(
+        "Measures how much of the provided context is actually relevant to answering the "
+        "question. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-precision",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
new file mode 100644
index 000000000..d4561a5d4
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_recall_fn_def = ScoringFn(
+    identifier="braintrust::context-recall",
+    description=(
+        "Evaluates how well the context covers the information needed to answer the "
+        "question. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-recall",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
new file mode 100644
index 000000000..06fc86a7b
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_relevancy_fn_def = ScoringFn(
+    identifier="braintrust::context-relevancy",
+    description=(
+        "Assesses how relevant the provided context is to the given question. "
" + "See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="context-relevancy", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +)