Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-18 16:49:47 +00:00)
more scoring function for rag
Commit 9aa4a405ca (parent b94ab8d013)
8 changed files with 132 additions and 10 deletions
Changes to the Braintrust scoring provider implementation:

@@ -15,7 +15,12 @@ from llama_stack.apis.datasets import *  # noqa: F403
import os

from autoevals.llm import Factuality
-from autoevals.ragas import AnswerCorrectness
+from autoevals.ragas import (
+    AnswerCorrectness,
+    AnswerRelevancy,
+    AnswerSimilarity,
+    Faithfulness,
+)
from pydantic import BaseModel

from llama_stack.distribution.request_headers import NeedsRequestProviderData
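For reference, the autoevals.ragas scorers imported here follow the same calling convention the provider already uses for Factuality: each is a callable object that takes the generated output and the expected answer, plus optional keyword arguments such as the input query and retrieval context, and returns a result exposing .score and .metadata. A minimal sketch of direct usage, mirroring the call pattern that appears later in this diff (the example strings are invented, and an OpenAI or Braintrust API key is assumed to be configured in the environment):

# Hedged sketch: calling an autoevals.ragas scorer directly, outside llama-stack.
from autoevals.ragas import Faithfulness

result = Faithfulness()(
    "Paris is the capital of France.",          # generated answer (output)
    "Paris",                                    # expected answer
    input="What is the capital of France?",     # original query
    context="France's capital city is Paris.",  # retrieved context for RAG scoring
)
print(result.score, result.metadata)  # score in [0, 1] plus per-scorer metadata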
@@ -27,7 +32,10 @@ from llama_stack.providers.utils.common.data_schema_validator_mixin import (
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
from .config import BraintrustScoringConfig
from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def
from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def
from .scoring_fn.fn_defs.factuality import factuality_fn_def
from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def


class BraintrustScoringFnEntry(BaseModel):
@@ -47,6 +55,21 @@ SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY = [
        evaluator=AnswerCorrectness(),
        fn_def=answer_correctness_fn_def,
    ),
    BraintrustScoringFnEntry(
        identifier="braintrust::answer-relevancy",
        evaluator=AnswerRelevancy(),
        fn_def=answer_relevancy_fn_def,
    ),
    BraintrustScoringFnEntry(
        identifier="braintrust::answer-similarity",
        evaluator=AnswerSimilarity(),
        fn_def=answer_similarity_fn_def,
    ),
    BraintrustScoringFnEntry(
        identifier="braintrust::faithfulness",
        evaluator=Faithfulness(),
        fn_def=faithfulness_fn_def,
    ),
]
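Each entry pairs a llama-stack scoring-function identifier with an autoevals evaluator instance and the corresponding ScoringFn definition. A plausible way this registry is consumed at scoring time is a simple identifier-keyed lookup; the following is a sketch of that idea operating on the list above, not the provider's actual wiring:

# Hedged sketch: index the registry by identifier so score_row can resolve an evaluator.
evaluators = {
    entry.identifier: entry.evaluator for entry in SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY
}
fn_defs = {entry.identifier: entry.fn_def for entry in SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY}

evaluator = evaluators["braintrust::answer-relevancy"]  # KeyError for unregistered identifiers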
@@ -135,6 +158,7 @@ class BraintrustScoringImpl(
    async def score_row(
        self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None
    ) -> ScoringResultRow:
        self.validate_row_schema_for_scoring(input_row)
        await self.set_api_key()
        assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None"
        expected_answer = input_row["expected_answer"]
@@ -146,6 +170,7 @@ class BraintrustScoringImpl(
            generated_answer,
            expected_answer,
            input=input_query,
            context=input_row["context"] if "context" in input_row else None,
        )
        score = result.score
        return {"score": score, "metadata": result.metadata}
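Taken together, the two score_row hunks validate the row, require a scoring-function identifier, and forward the generated answer, expected answer, input query, and (when present) retrieval context to the resolved autoevals scorer. A hedged sketch of the row shape this implies; the key names are read off this diff (input_query is inferred from the variable name), and impl stands for a BraintrustScoringImpl instance:

# Hedged sketch: a single input row for the new RAG scoring functions.
sample_row = {
    "input_query": "What is the capital of France?",
    "generated_answer": "Paris is the capital of France.",
    "expected_answer": "Paris",
    "context": "France's capital city is Paris.",  # optional; rows without it score with context=None
}
# Inside an async context:
# result_row = await impl.score_row(sample_row, scoring_fn_identifier="braintrust::faithfulness")
# result_row == {"score": <float>, "metadata": {...}}  # per the return statement above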
New file (scoring_fn/fn_defs/answer_relevancy.py in the Braintrust provider):

@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)

answer_relevancy_fn_def = ScoringFn(
    identifier="braintrust::answer-relevancy",
    description=(
        "Test output relevancy against the input query using Braintrust LLM scorer. "
        "See: github.com/braintrustdata/autoevals"
    ),
    provider_id="braintrust",
    provider_resource_id="answer-relevancy",
    return_type=NumberType(),
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.average]
    ),
)
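All three new definitions attach BasicScoringFnParams with AggregationFunctionType.average, so per-row scores are rolled up into a single mean by the shared aggregation utility (aggregate_metrics, imported in the provider hunk above). A rough sketch of what average aggregation amounts to, not the actual helper in llama_stack.providers.utils.scoring.aggregation_utils:

# Hedged sketch: averaging ScoringResultRow scores, skipping rows without a score.
score_rows = [{"score": 0.9}, {"score": 0.7}, {"score": None}]
valid = [row["score"] for row in score_rows if row.get("score") is not None]
average = sum(valid) / len(valid) if valid else None  # 0.8 for this toy input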
New file (scoring_fn/fn_defs/answer_similarity.py in the Braintrust provider):

@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)

answer_similarity_fn_def = ScoringFn(
    identifier="braintrust::answer-similarity",
    description=(
        "Test output similarity against expected value using Braintrust LLM scorer. "
        "See: github.com/braintrustdata/autoevals"
    ),
    provider_id="braintrust",
    provider_resource_id="answer-similarity",
    return_type=NumberType(),
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.average]
    ),
)
New file (scoring_fn/fn_defs/faithfulness.py in the Braintrust provider):

@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)

faithfulness_fn_def = ScoringFn(
    identifier="braintrust::faithfulness",
    description=(
        "Test output faithfulness to the input query using Braintrust LLM scorer. "
        "See: github.com/braintrustdata/autoevals"
    ),
    provider_id="braintrust",
    provider_resource_id="faithfulness",
    return_type=NumberType(),
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.average]
    ),
)
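The three new files are deliberately uniform, so adding further autoevals.ragas scorers to this provider is mostly mechanical: create a fn_def module like the ones above, import it, and append a matching BraintrustScoringFnEntry. A hypothetical sketch for a context-relevancy scorer, assuming autoevals.ragas exposes a ContextRelevancy class; nothing below is part of this commit:

# Hypothetical fn_def following the same pattern; identifier and description are invented.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)

context_relevancy_fn_def = ScoringFn(
    identifier="braintrust::context-relevancy",
    description="Test how relevant the retrieved context is to the input query.",
    provider_id="braintrust",
    provider_resource_id="context-relevancy",
    return_type=NumberType(),
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.average]
    ),
)

# ...and the matching registry entry in the provider module (assumes
# ContextRelevancy is available in autoevals.ragas):
# BraintrustScoringFnEntry(
#     identifier="braintrust::context-relevancy",
#     evaluator=ContextRelevancy(),
#     fn_def=context_relevancy_fn_def,
# ),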