diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index 453e35f6d..996291dcc 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -47,7 +47,7 @@ class Scoring(Protocol): async def score_batch( self, dataset_id: str, - scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, + scoring_functions: Dict[str, Optional[ScoringFnParams]], save_results_dataset: bool = False, ) -> ScoreBatchResponse: ... @@ -55,5 +55,5 @@ class Scoring(Protocol): async def score( self, input_rows: List[Dict[str, Any]], - scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, + scoring_functions: Dict[str, Optional[ScoringFnParams]], ) -> ScoreResponse: ... diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 00630132e..b555c9f2a 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -3,23 +3,24 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from enum import Enum from typing import Any, Dict, List, Optional from tqdm import tqdm -from llama_stack.apis.agents import Agents -from llama_stack.apis.common.type_system import ( - ChatCompletionInputType, - CompletionInputType, - StringType, -) +from llama_stack.apis.agents import Agents, StepType from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring +from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import EvalTasksProtocolPrivate + +from llama_stack.providers.utils.common.data_schema_validator import ( + ColumnName, + DataSchemaValidatorMixin, + get_valid_schemas, +) from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job @@ -30,15 +31,7 @@ from .config import MetaReferenceEvalConfig EVAL_TASKS_PREFIX = "eval_tasks:" -class ColumnName(Enum): - input_query = "input_query" - expected_answer = "expected_answer" - chat_completion_input = "chat_completion_input" - completion_input = "completion_input" - generated_answer = "generated_answer" - - -class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): +class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate, DataSchemaValidatorMixin): def __init__( self, config: MetaReferenceEvalConfig, @@ -82,29 +75,6 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): ) self.eval_tasks[task_def.identifier] = task_def - async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None: - dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError(f"Dataset {dataset_id} does not have a schema defined.") - - expected_schemas = [ - { - ColumnName.input_query.value: StringType(), - ColumnName.expected_answer.value: StringType(), - ColumnName.chat_completion_input.value: ChatCompletionInputType(), - }, - { - ColumnName.input_query.value: StringType(), - ColumnName.expected_answer.value: StringType(), - ColumnName.completion_input.value: CompletionInputType(), - }, - ] - - if dataset_def.dataset_schema not in expected_schemas: - raise ValueError( - f"Dataset {dataset_id} does not have a correct input 
schema in {expected_schemas}" - ) - async def run_eval( self, task_id: str, @@ -114,8 +84,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions - - await self.validate_eval_input_dataset_schema(dataset_id=dataset_id) + dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) + self.validate_dataset_schema( + dataset_def.dataset_schema, get_valid_schemas(Api.eval.value) + ) all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, rows_in_page=( @@ -167,11 +139,21 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): ) ] final_event = turn_response[-1].event.payload - generations.append( - { - ColumnName.generated_answer.value: final_event.turn.output_message.content - } + + # check if there's a memory retrieval step and extract the context + memory_rag_context = None + for step in final_event.turn.steps: + if step.step_type == StepType.memory_retrieval.value: + memory_rag_context = " ".join(x.text for x in step.inserted_context) + + agent_generation = {} + agent_generation[ColumnName.generated_answer.value] = ( + final_event.turn.output_message.content ) + if memory_rag_context: + agent_generation[ColumnName.context.value] = memory_rag_context + + generations.append(agent_generation) return generations diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index f8b30cbcf..f612abda4 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -14,8 +14,13 @@ from llama_stack.apis.scoring import ( ScoringResult, ) from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams -from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate +from llama_stack.distribution.datatypes import Api +from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate +from llama_stack.providers.utils.common.data_schema_validator import ( + DataSchemaValidatorMixin, + get_valid_schemas, +) from .config import BasicScoringConfig from .scoring_fn.equality_scoring_fn import EqualityScoringFn from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn @@ -24,7 +29,9 @@ from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn] -class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): +class BasicScoringImpl( + Scoring, ScoringFunctionsProtocolPrivate, DataSchemaValidatorMixin +): def __init__( self, config: BasicScoringConfig, @@ -61,30 +68,17 @@ class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def register_scoring_function(self, function_def: ScoringFn) -> None: raise NotImplementedError("Register scoring function not implemented yet") - async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: - dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError( - f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." - ) - - for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.dataset_schema: - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column." 
- ) - if dataset_def.dataset_schema[required_column].type != "string": - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." - ) - async def score_batch( self, dataset_id: str, scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, save_results_dataset: bool = False, ) -> ScoreBatchResponse: - await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id) + dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) + self.validate_dataset_schema( + dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value) + ) + all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py index 9991c5502..9b0566228 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py @@ -9,12 +9,12 @@ from typing import Any, Dict, Optional from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams -from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn from .fn_defs.equality import equality -class EqualityScoringFn(BaseScoringFn): +class EqualityScoringFn(RegisteredBaseScoringFn): """ A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise. """ diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py index 552f34d46..38014ca6f 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py @@ -9,14 +9,14 @@ from typing import Any, Dict, Optional from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType -from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn from .fn_defs.regex_parser_multiple_choice_answer import ( regex_parser_multiple_choice_answer, ) -class RegexParserScoringFn(BaseScoringFn): +class RegexParserScoringFn(RegisteredBaseScoringFn): """ A scoring_fn that parses answer from generated response according to context and check match with expected_answer. 
""" diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py index 29ae12e44..71defc433 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py @@ -8,12 +8,12 @@ from typing import Any, Dict, Optional from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams -from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn from .fn_defs.subset_of import subset_of -class SubsetOfScoringFn(BaseScoringFn): +class SubsetOfScoringFn(RegisteredBaseScoringFn): """ A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. """ diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index 0c6102645..4282ef6ec 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -7,7 +7,17 @@ import os from typing import Any, Dict, List, Optional from autoevals.llm import Factuality -from autoevals.ragas import AnswerCorrectness +from autoevals.ragas import ( + AnswerCorrectness, + AnswerRelevancy, + AnswerSimilarity, + ContextEntityRecall, + ContextPrecision, + ContextRecall, + ContextRelevancy, + Faithfulness, +) +from pydantic import BaseModel from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets @@ -18,20 +28,90 @@ from llama_stack.apis.scoring import ( ScoringResult, ScoringResultRow, ) -from llama_stack.apis.scoring_functions import AggregationFunctionType, ScoringFn +from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams + +from llama_stack.distribution.datatypes import Api from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate +from llama_stack.providers.utils.common.data_schema_validator import ( + DataSchemaValidatorMixin, + get_valid_schemas, +) -from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average - +from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics from .config import BraintrustScoringConfig from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def +from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def +from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def +from .scoring_fn.fn_defs.context_entity_recall import context_entity_recall_fn_def +from .scoring_fn.fn_defs.context_precision import context_precision_fn_def +from .scoring_fn.fn_defs.context_recall import context_recall_fn_def +from .scoring_fn.fn_defs.context_relevancy import context_relevancy_fn_def from .scoring_fn.fn_defs.factuality import factuality_fn_def +from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def + + +class BraintrustScoringFnEntry(BaseModel): + identifier: str + evaluator: Any + fn_def: ScoringFn + + +SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY = [ + BraintrustScoringFnEntry( + identifier="braintrust::factuality", + evaluator=Factuality(), + fn_def=factuality_fn_def, + ), + BraintrustScoringFnEntry( + 
identifier="braintrust::answer-correctness", + evaluator=AnswerCorrectness(), + fn_def=answer_correctness_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::answer-relevancy", + evaluator=AnswerRelevancy(), + fn_def=answer_relevancy_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::answer-similarity", + evaluator=AnswerSimilarity(), + fn_def=answer_similarity_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::faithfulness", + evaluator=Faithfulness(), + fn_def=faithfulness_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::context-entity-recall", + evaluator=ContextEntityRecall(), + fn_def=context_entity_recall_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::context-precision", + evaluator=ContextPrecision(), + fn_def=context_precision_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::context-recall", + evaluator=ContextRecall(), + fn_def=context_recall_fn_def, + ), + BraintrustScoringFnEntry( + identifier="braintrust::context-relevancy", + evaluator=ContextRelevancy(), + fn_def=context_relevancy_fn_def, + ), +] class BraintrustScoringImpl( - Scoring, ScoringFunctionsProtocolPrivate, NeedsRequestProviderData + Scoring, + ScoringFunctionsProtocolPrivate, + NeedsRequestProviderData, + DataSchemaValidatorMixin, ): def __init__( self, @@ -44,12 +124,12 @@ class BraintrustScoringImpl( self.datasets_api = datasets_api self.braintrust_evaluators = { - "braintrust::factuality": Factuality(), - "braintrust::answer-correctness": AnswerCorrectness(), + entry.identifier: entry.evaluator + for entry in SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY } self.supported_fn_defs_registry = { - factuality_fn_def.identifier: factuality_fn_def, - answer_correctness_fn_def.identifier: answer_correctness_fn_def, + entry.identifier: entry.fn_def + for entry in SUPPORTED_BRAINTRUST_SCORING_FN_ENTRY } async def initialize(self) -> None: ... @@ -70,23 +150,6 @@ class BraintrustScoringImpl( "Registering scoring function not allowed for braintrust provider" ) - async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: - dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError( - f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." - ) - - for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.dataset_schema: - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column." - ) - if dataset_def.dataset_schema[required_column].type != "string": - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." 
- ) - async def set_api_key(self) -> None: # api key is in the request headers if not self.config.openai_api_key: @@ -102,11 +165,16 @@ class BraintrustScoringImpl( async def score_batch( self, dataset_id: str, - scoring_functions: List[str], + scoring_functions: Dict[str, Optional[ScoringFnParams]], save_results_dataset: bool = False, ) -> ScoreBatchResponse: await self.set_api_key() - await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id) + + dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) + self.validate_dataset_schema( + dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value) + ) + all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -126,6 +194,7 @@ class BraintrustScoringImpl( async def score_row( self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None ) -> ScoringResultRow: + self.validate_row_schema(input_row, get_valid_schemas(Api.scoring.value)) await self.set_api_key() assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None" expected_answer = input_row["expected_answer"] @@ -133,12 +202,19 @@ class BraintrustScoringImpl( input_query = input_row["input_query"] evaluator = self.braintrust_evaluators[scoring_fn_identifier] - result = evaluator(generated_answer, expected_answer, input=input_query) + result = evaluator( + generated_answer, + expected_answer, + input=input_query, + context=input_row["context"] if "context" in input_row else None, + ) score = result.score return {"score": score, "metadata": result.metadata} async def score( - self, input_rows: List[Dict[str, Any]], scoring_functions: List[str] + self, + input_rows: List[Dict[str, Any]], + scoring_functions: Dict[str, Optional[ScoringFnParams]], ) -> ScoreResponse: await self.set_api_key() res = {} @@ -150,8 +226,17 @@ class BraintrustScoringImpl( await self.score_row(input_row, scoring_fn_id) for input_row in input_rows ] - aggregation_functions = [AggregationFunctionType.average] - agg_results = aggregate_average(score_results) + aggregation_functions = self.supported_fn_defs_registry[ + scoring_fn_id + ].params.aggregation_functions + + # override scoring_fn params if provided + if scoring_functions[scoring_fn_id] is not None: + override_params = scoring_functions[scoring_fn_id] + if override_params.aggregation_functions: + aggregation_functions = override_params.aggregation_functions + + agg_results = aggregate_metrics(score_results, aggregation_functions) res[scoring_fn_id] = ScoringResult( score_rows=score_results, aggregated_results=agg_results, diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py index dc5df8e78..526ba2c37 100644 --- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py @@ -5,14 +5,23 @@ # the root directory of this source tree. from llama_stack.apis.common.type_system import NumberType -from llama_stack.apis.scoring_functions import ScoringFn +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) answer_correctness_fn_def = ScoringFn( identifier="braintrust::answer-correctness", - description="Scores the correctness of the answer based on the ground truth.. 
One of Braintrust LLM basd scorer https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/llm.py", - params=None, + description=( + "Scores the correctness of the answer based on the ground truth. " + "Uses Braintrust LLM-based scorer from autoevals library." + ), provider_id="braintrust", provider_resource_id="answer-correctness", return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), ) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py new file mode 100644 index 000000000..3e3e6ac87 --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +answer_relevancy_fn_def = ScoringFn( + identifier="braintrust::answer-relevancy", + description=( + "Test output relevancy against the input query using Braintrust LLM scorer. " + "See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="answer-relevancy", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py new file mode 100644 index 000000000..bea8dfd53 --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +answer_similarity_fn_def = ScoringFn( + identifier="braintrust::answer-similarity", + description=( + "Test output similarity against expected value using Braintrust LLM scorer. " + "See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="answer-similarity", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py new file mode 100644 index 000000000..ac41df000 --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +context_entity_recall_fn_def = ScoringFn( + identifier="braintrust::context-entity-recall", + description=( + "Evaluates how well the context captures the named entities present in the " + "reference answer. See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="context-entity-recall", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py new file mode 100644 index 000000000..ef172d82c --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +context_precision_fn_def = ScoringFn( + identifier="braintrust::context-precision", + description=( + "Measures how much of the provided context is actually relevant to answering the " + "question. See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="context-precision", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py new file mode 100644 index 000000000..d4561a5d4 --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +context_recall_fn_def = ScoringFn( + identifier="braintrust::context-recall", + description=( + "Evaluates how well the context covers the information needed to answer the " + "question. See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="context-recall", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py new file mode 100644 index 000000000..06fc86a7b --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +context_relevancy_fn_def = ScoringFn( + identifier="braintrust::context-relevancy", + description=( + "Assesses how relevant the provided context is to the given question. " + "See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="context-relevancy", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py index b733f10c8..a4d597c29 100644 --- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py @@ -5,14 +5,23 @@ # the root directory of this source tree. from llama_stack.apis.common.type_system import NumberType -from llama_stack.apis.scoring_functions import ScoringFn +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) factuality_fn_def = ScoringFn( identifier="braintrust::factuality", - description="Test whether an output is factual, compared to an original (`expected`) value. One of Braintrust LLM basd scorer https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/llm.py", - params=None, + description=( + "Test output factuality against expected value using Braintrust LLM scorer. " + "See: github.com/braintrustdata/autoevals" + ), provider_id="braintrust", provider_resource_id="factuality", return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), ) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py new file mode 100644 index 000000000..9cffff558 --- /dev/null +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +faithfulness_fn_def = ScoringFn( + identifier="braintrust::faithfulness", + description=( + "Test output faithfulness to the input query using Braintrust LLM scorer. 
" + "See: github.com/braintrustdata/autoevals" + ), + provider_id="braintrust", + provider_resource_id="faithfulness", + return_type=NumberType(), + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.average] + ), +) diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py index 09780e6fb..305c13665 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py @@ -16,7 +16,12 @@ from llama_stack.apis.scoring import ( ScoringResult, ) from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams +from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate +from llama_stack.providers.utils.common.data_schema_validator import ( + DataSchemaValidatorMixin, + get_valid_schemas, +) from .config import LlmAsJudgeScoringConfig from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn @@ -25,7 +30,9 @@ from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn LLM_JUDGE_FNS = [LlmAsJudgeScoringFn] -class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): +class LlmAsJudgeScoringImpl( + Scoring, ScoringFunctionsProtocolPrivate, DataSchemaValidatorMixin +): def __init__( self, config: LlmAsJudgeScoringConfig, @@ -65,30 +72,17 @@ class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def register_scoring_function(self, function_def: ScoringFn) -> None: raise NotImplementedError("Register scoring function not implemented yet") - async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: - dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError( - f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." - ) - - for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.dataset_schema: - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column." - ) - if dataset_def.dataset_schema[required_column].type != "string": - raise ValueError( - f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." 
- ) - async def score_batch( self, dataset_id: str, scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, save_results_dataset: bool = False, ) -> ScoreBatchResponse: - await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id) + dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) + self.validate_dataset_schema( + dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value) + ) + all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py index 00ea53c8f..027709f74 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py @@ -12,14 +12,14 @@ from llama_stack.apis.inference.inference import Inference from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams -from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn from .fn_defs.llm_as_judge_405b_simpleqa import llm_as_judge_405b_simpleqa from .fn_defs.llm_as_judge_base import llm_as_judge_base -class LlmAsJudgeScoringFn(BaseScoringFn): +class LlmAsJudgeScoringFn(RegisteredBaseScoringFn): """ A scoring_fn that assigns """ diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index 46c99f5b3..cf28045a4 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -38,9 +38,15 @@ def data_url_from_file(file_path: str) -> str: async def register_dataset( - datasets_impl: Datasets, for_generation=False, dataset_id="test_dataset" + datasets_impl: Datasets, + for_generation=False, + for_rag=False, + dataset_id="test_dataset", ): - test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv" + if for_rag: + test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv" + else: + test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv" test_url = data_url_from_file(str(test_file)) if for_generation: @@ -49,6 +55,13 @@ async def register_dataset( "input_query": StringType(), "chat_completion_input": ChatCompletionInputType(), } + elif for_rag: + dataset_schema = { + "expected_answer": StringType(), + "input_query": StringType(), + "generated_answer": StringType(), + "context": StringType(), + } else: dataset_schema = { "expected_answer": StringType(), diff --git a/llama_stack/providers/tests/datasetio/test_rag_dataset.csv b/llama_stack/providers/tests/datasetio/test_rag_dataset.csv new file mode 100644 index 000000000..a0e1fce72 --- /dev/null +++ b/llama_stack/providers/tests/datasetio/test_rag_dataset.csv @@ -0,0 +1,6 @@ +input_query,context,generated_answer,expected_answer +What is the capital of France?,"France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum.",London,Paris +Who is the CEO of Meta?,"Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. 
Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies.",Mark Zuckerberg,Mark Zuckerberg +What is the largest planet in our solar system?,"The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets.",Jupiter,Jupiter +What is the smallest country in the world?,"Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy.",China,Vatican City +What is the currency of Japan?,"Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871.",Yen,Yen diff --git a/llama_stack/providers/tests/scoring/test_scoring.py b/llama_stack/providers/tests/scoring/test_scoring.py index 2643b8fd6..00dd5d27b 100644 --- a/llama_stack/providers/tests/scoring/test_scoring.py +++ b/llama_stack/providers/tests/scoring/test_scoring.py @@ -60,7 +60,7 @@ class TestScoring: f"{provider_id} provider does not support scoring without params" ) - await register_dataset(datasets_impl) + await register_dataset(datasets_impl, for_rag=True) response = await datasets_impl.list_datasets() assert len(response) == 1 @@ -112,7 +112,7 @@ class TestScoring: scoring_stack[Api.datasets], scoring_stack[Api.models], ) - await register_dataset(datasets_impl) + await register_dataset(datasets_impl, for_rag=True) response = await datasets_impl.list_datasets() assert len(response) == 1 @@ -173,7 +173,7 @@ class TestScoring: scoring_stack[Api.datasets], scoring_stack[Api.models], ) - await register_dataset(datasets_impl) + await register_dataset(datasets_impl, for_rag=True) rows = await datasetio_impl.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, diff --git a/llama_stack/providers/utils/common/__init__.py b/llama_stack/providers/utils/common/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/utils/common/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py new file mode 100644 index 000000000..d9e6cb6b5 --- /dev/null +++ b/llama_stack/providers/utils/common/data_schema_validator.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from enum import Enum +from typing import Any, Dict, List + +from llama_stack.apis.common.type_system import ( + ChatCompletionInputType, + CompletionInputType, + StringType, +) + +from llama_stack.distribution.datatypes import Api + + +class ColumnName(Enum): + input_query = "input_query" + expected_answer = "expected_answer" + chat_completion_input = "chat_completion_input" + completion_input = "completion_input" + generated_answer = "generated_answer" + context = "context" + + +VALID_SCHEMAS_FOR_SCORING = [ + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.context.value: StringType(), + }, +] + +VALID_SCHEMAS_FOR_EVAL = [ + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.chat_completion_input.value: ChatCompletionInputType(), + }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.completion_input.value: CompletionInputType(), + }, +] + + +def get_valid_schemas(api_str: str): + if api_str == Api.scoring.value: + return VALID_SCHEMAS_FOR_SCORING + elif api_str == Api.eval.value: + return VALID_SCHEMAS_FOR_EVAL + else: + raise ValueError(f"Invalid API string: {api_str}") + + +class DataSchemaValidatorMixin: + def validate_dataset_schema( + self, + dataset_schema: Dict[str, Any], + expected_schemas: List[Dict[str, Any]], + ): + if dataset_schema not in expected_schemas: + raise ValueError( + f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}" + ) + + def validate_row_schema( + self, + input_row: Dict[str, Any], + expected_schemas: List[Dict[str, Any]], + ): + for schema in expected_schemas: + if all(key in input_row for key in schema): + return + + raise ValueError( + f"Input row {input_row} does not match any of the expected schemas in {expected_schemas}" + ) diff --git a/llama_stack/providers/utils/scoring/base_scoring_fn.py b/llama_stack/providers/utils/scoring/base_scoring_fn.py index 2db77fd2b..e0e557374 100644 --- a/llama_stack/providers/utils/scoring/base_scoring_fn.py +++ b/llama_stack/providers/utils/scoring/base_scoring_fn.py @@ -13,12 +13,51 @@ from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metr class BaseScoringFn(ABC): """ - Base interface class for all native scoring_fns. - Each scoring_fn needs to implement the following methods: + Base interface class for Scoring Functions. 
+ Each scoring function needs to implement the following methods: - score_row(self, row) - aggregate(self, scoring_fn_results) """ + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + @abstractmethod + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = None, + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + raise NotImplementedError() + + @abstractmethod + async def aggregate( + self, + scoring_results: List[ScoringResultRow], + scoring_fn_identifier: Optional[str] = None, + scoring_params: Optional[ScoringFnParams] = None, + ) -> Dict[str, Any]: + raise NotImplementedError() + + @abstractmethod + async def score( + self, + input_rows: List[Dict[str, Any]], + scoring_fn_identifier: Optional[str] = None, + scoring_params: Optional[ScoringFnParams] = None, + ) -> List[ScoringResultRow]: + raise NotImplementedError() + + +class RegisteredBaseScoringFn(BaseScoringFn): + """ + Interface for native scoring functions that are registered in LlamaStack. + """ + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.supported_fn_defs_registry = {}
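
For downstream callers, the signature change in `llama_stack/apis/scoring/scoring.py` means `scoring_functions` can no longer be omitted: it is a required `Dict[str, Optional[ScoringFnParams]]` rather than a keyword with a (previously type-inconsistent) `None` default, for both `score` and `score_batch`. A minimal usage sketch, assuming a `scoring_impl` object implementing the `Scoring` protocol and an illustrative `basic::equality` identifier:

```python
from typing import Any, Dict, List

async def score_rows(scoring_impl, rows: List[Dict[str, Any]]):
    # scoring_functions is now required: a mapping from scoring function
    # identifier to optional ScoringFnParams. A None value means "use the
    # defaults registered with that function".
    return await scoring_impl.score(
        input_rows=rows,
        scoring_functions={"basic::equality": None},  # illustrative identifier
    )
```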
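
The new `data_schema_validator.py` replaces the near-identical per-provider checks removed from the basic, braintrust, and llm_as_judge scoring providers (and the eval-specific variant in `meta_reference/eval.py`). Its core is an exact-match membership test: the dataset schema must equal one of the allowed schemas wholesale, not merely contain the required columns, which is a tightening relative to the removed column-by-column checks. A standalone sketch of that logic, substituting plain strings for the `StringType()` type objects so it runs without llama_stack installed:

```python
from typing import Any, Dict, List

VALID_SCHEMAS_FOR_SCORING: List[Dict[str, Any]] = [
    # without RAG context
    {"input_query": "string", "expected_answer": "string", "generated_answer": "string"},
    # with RAG context (the schema added in this change)
    {"input_query": "string", "expected_answer": "string",
     "generated_answer": "string", "context": "string"},
]

def validate_dataset_schema(
    dataset_schema: Dict[str, Any], expected_schemas: List[Dict[str, Any]]
) -> None:
    # Same membership test as DataSchemaValidatorMixin.validate_dataset_schema:
    # the schema must match one candidate exactly.
    if dataset_schema not in expected_schemas:
        raise ValueError(f"{dataset_schema} does not match any of {expected_schemas}")

validate_dataset_schema(
    {"input_query": "string", "expected_answer": "string", "generated_answer": "string"},
    VALID_SCHEMAS_FOR_SCORING,
)  # passes; adding or dropping a column would raise ValueError
```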
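
In `meta_reference/eval.py`, each agent generation now carries the retrieved RAG context alongside the answer whenever the turn includes a memory retrieval step, which is what lets the context-aware scorers run on agent evals. A self-contained sketch of the record the generation loop builds, with plain dicts standing in for the real turn/step objects:

```python
from typing import Any, Dict, List, Optional

def build_generation(steps: List[Dict[str, Any]], output_content: str) -> Dict[str, Any]:
    # Mirrors the loop added in the diff: concatenate the inserted context
    # of any memory_retrieval step into a single string.
    memory_rag_context: Optional[str] = None
    for step in steps:
        if step["step_type"] == "memory_retrieval":
            memory_rag_context = " ".join(x["text"] for x in step["inserted_context"])

    generation: Dict[str, Any] = {"generated_answer": output_content}
    if memory_rag_context:
        generation["context"] = memory_rag_context
    return generation
```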
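
The Braintrust provider now passes the row's context through to the evaluator, which the newly registered RAGAS-style scorers (faithfulness, context precision/recall/relevancy, context entity recall, answer relevancy/similarity) need. A usage sketch of one evaluator outside the provider; it requires the `autoevals` package and an OpenAI API key in the environment, since these are LLM-based scorers:

```python
from autoevals.ragas import ContextRelevancy

row = {
    "input_query": "What is the capital of France?",
    "context": "France is a country in Western Europe; its capital is Paris.",
    "generated_answer": "Paris",
    "expected_answer": "Paris",
}

evaluator = ContextRelevancy()
# Same call shape as BraintrustScoringImpl.score_row; row.get("context")
# is equivalent to the diff's `"context" in input_row` check.
result = evaluator(
    row["generated_answer"],
    row["expected_answer"],
    input=row["input_query"],
    context=row.get("context"),
)
print(result.score, result.metadata)
```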
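
Scoring function defs now declare their default aggregations (`AggregationFunctionType.average` via `BasicScoringFnParams`), and the braintrust `score()` resolves them per request instead of hard-coding `aggregate_average`. A small sketch of the resolution order added in the diff, with types elided; `fn_def_params` is the registered default and `override_params` the per-request entry from the `scoring_functions` mapping:

```python
def resolve_aggregation_functions(fn_def_params, override_params):
    # Default: whatever aggregations the registered fn_def declares.
    aggregation_functions = fn_def_params.aggregation_functions
    # A non-None per-request entry with aggregation_functions set wins.
    if override_params is not None and override_params.aggregation_functions:
        aggregation_functions = override_params.aggregation_functions
    return aggregation_functions
```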
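
Finally, the `base_scoring_fn.py` split separates the abstract scoring interface (`BaseScoringFn`: `score_row`, `aggregate`, `score`) from the registry plumbing that in-tree functions get by subclassing `RegisteredBaseScoringFn`, which is why the equality, subset-of, regex-parser, and llm-as-judge functions were repointed at the new base. A hypothetical subclass sketch; the class name and matching logic are illustrative, not one of the shipped functions, and a real in-tree implementation would subclass `RegisteredBaseScoringFn` and register a fn_def:

```python
from typing import Any, Dict, Optional

class WhitespaceInsensitiveMatchFn:
    # Standalone stub that only mirrors the score_row contract from
    # BaseScoringFn; swap the base class in when used inside llama_stack.
    async def score_row(
        self,
        input_row: Dict[str, Any],
        scoring_fn_identifier: Optional[str] = None,
        scoring_params=None,
    ) -> Dict[str, Any]:
        generated = input_row["generated_answer"].strip()
        expected = input_row["expected_answer"].strip()
        return {"score": 1.0 if generated == expected else 0.0}
```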