[rag evals][2/n] add more braintrust scoring fns for RAG eval (#666)

# What does this PR do? - add more braintrust scoring functions for RAG eval - add tests for evaluating against context ## Test Plan ``` pytest -v -s -m braintrust_scoring_together_inference scoring/test_scoring.py ``` <img width="850" alt="image" src="https://github.com/user-attachments/assets/2f8f0693-ea13-422c-a183-f798faf86433" /> **Example Output** - https://gist.github.com/yanxi0830/2acf3b8b3e8132fda2a48b1f0a49711b <img width="827" alt="image" src="https://github.com/user-attachments/assets/9014b957-107c-4c23-bbc0-812cbd0b16da" /> <img width="436" alt="image" src="https://github.com/user-attachments/assets/21e9da17-f426-49b2-9113-855cab7b3d40" /> ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests.
2025-12-23 04:32:25 +00:00 · 2025-01-02 11:19:22 -08:00 · 2025-01-02 11:19:22 -08:00 · 2da455f48e
commit 2da455f48e
parent eb92322c3c
12 changed files with 276 additions and 12 deletions
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@ -7,7 +7,16 @@ import os
 from typing import Any, Dict, List, Optional

 from autoevals.llm import Factuality
-from autoevals.ragas import AnswerCorrectness
+from autoevals.ragas import (
+    AnswerCorrectness,
+    AnswerRelevancy,
+    AnswerSimilarity,
+    ContextEntityRecall,
+    ContextPrecision,
+    ContextRecall,
+    ContextRelevancy,
+    Faithfulness,
+)
 from pydantic import BaseModel

 from llama_stack.apis.datasetio import DatasetIO
@ -19,7 +28,7 @@ from llama_stack.apis.scoring import (
    ScoringResult,
    ScoringResultRow,
 )
-from llama_stack.apis.scoring_functions import ScoringFn
+from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams

 from llama_stack.distribution.datatypes import Api

@ -33,7 +42,14 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
 from .config import BraintrustScoringConfig
 from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
+from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def
+from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def
+from .scoring_fn.fn_defs.context_entity_recall import context_entity_recall_fn_def
+from .scoring_fn.fn_defs.context_precision import context_precision_fn_def
+from .scoring_fn.fn_defs.context_recall import context_recall_fn_def
+from .scoring_fn.fn_defs.context_relevancy import context_relevancy_fn_def
 from .scoring_fn.fn_defs.factuality import factuality_fn_def
+from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def


 class BraintrustScoringFnEntry(BaseModel):
@ -53,6 +69,41 @@ SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY = [
        evaluator=AnswerCorrectness(),
        fn_def=answer_correctness_fn_def,
    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::answer-relevancy",
+        evaluator=AnswerRelevancy(),
+        fn_def=answer_relevancy_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::answer-similarity",
+        evaluator=AnswerSimilarity(),
+        fn_def=answer_similarity_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::faithfulness",
+        evaluator=Faithfulness(),
+        fn_def=faithfulness_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-entity-recall",
+        evaluator=ContextEntityRecall(),
+        fn_def=context_entity_recall_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-precision",
+        evaluator=ContextPrecision(),
+        fn_def=context_precision_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-recall",
+        evaluator=ContextRecall(),
+        fn_def=context_recall_fn_def,
+    ),
+    BraintrustScoringFnEntry(
+        identifier="braintrust::context-relevancy",
+        evaluator=ContextRelevancy(),
+        fn_def=context_relevancy_fn_def,
+    ),
 ]


@ -143,6 +194,7 @@ class BraintrustScoringImpl(
    async def score_row(
        self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None
    ) -> ScoringResultRow:
+        self.validate_row_schema(input_row, get_valid_schemas(Api.scoring.value))
        await self.set_api_key()
        assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None"
        expected_answer = input_row["expected_answer"]
@ -154,6 +206,7 @@ class BraintrustScoringImpl(
            generated_answer,
            expected_answer,
            input=input_query,
+            context=input_row["context"] if "context" in input_row else None,
        )
        score = result.score
        return {"score": score, "metadata": result.metadata}
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+answer_relevancy_fn_def = ScoringFn(
+    identifier="braintrust::answer-relevancy",
+    description=(
+        "Test output relevancy against the input query using Braintrust LLM scorer. "
+        "See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="answer-relevancy",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+answer_similarity_fn_def = ScoringFn(
+    identifier="braintrust::answer-similarity",
+    description=(
+        "Test output similarity against expected value using embedding similarity. "
+        "See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="answer-similarity",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_entity_recall_fn_def = ScoringFn(
+    identifier="braintrust::context-entity-recall",
+    description=(
+        "Evaluates how well the context captures the named entities present in the "
+        "reference answer. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-entity-recall",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_precision_fn_def = ScoringFn(
+    identifier="braintrust::context-precision",
+    description=(
+        "Measures how much of the provided context is actually relevant to answering the "
+        "question. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-precision",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_recall_fn_def = ScoringFn(
+    identifier="braintrust::context-recall",
+    description=(
+        "Evaluates how well the context covers the information needed to answer the "
+        "question. See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-recall",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+context_relevancy_fn_def = ScoringFn(
+    identifier="braintrust::context-relevancy",
+    description=(
+        "Assesses how relevant the provided context is to the given question. "
+        "See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="context-relevancy",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)
--- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
+++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+faithfulness_fn_def = ScoringFn(
+    identifier="braintrust::faithfulness",
+    description=(
+        "Test output faithfulness to the provided context using Braintrust LLM scorer. "
+        "See: github.com/braintrustdata/autoevals"
+    ),
+    provider_id="braintrust",
+    provider_resource_id="faithfulness",
+    return_type=NumberType(),
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.average]
+    ),
+)