rag correctness scorer w/ custom dataset

This commit is contained in:
Xi Yan 2024-10-15 00:42:03 -07:00
parent ec6c63ba57
commit 9cc0a54f0b
4 changed files with 18 additions and 4 deletions


@@ -103,6 +103,7 @@ class HuggingfaceDatasetDef(BaseModel):
     )
     rename_columns_map: Optional[Dict[str, str]] = Field(
         description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
     )
     kwargs: Dict[str, Any] = Field(
         description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
@@ -119,6 +120,10 @@ class CustomDatasetDef(BaseModel):
     url: str = Field(
         description="The URL to the dataset",
     )
+    rename_columns_map: Optional[Dict[str, str]] = Field(
+        description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
+    )


 DatasetDef = Annotated[
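
For readers who don't have the full file handy, here is a rough sketch of how CustomDatasetDef reads after this hunk. Only url and rename_columns_map appear in the diff; the identifier field is inferred from the client example further down, and the imports are standard assumptions rather than part of this commit:

from typing import Dict, Optional

from pydantic import BaseModel, Field


class CustomDatasetDef(BaseModel):
    # identifier is assumed from the `identifier="rag-evals"` usage below;
    # any type discriminator or extra fields in the real model are omitted.
    identifier: str = Field(
        description="A unique identifier for the dataset",
    )
    url: str = Field(
        description="The URL to the dataset",
    )
    rename_columns_map: Optional[Dict[str, str]] = Field(
        description="A map of column names to rename to fit the schema of eval dataset for scoring",
        default=None,
    )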


@@ -136,6 +136,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         dataset_def=CustomDatasetDef(
             identifier="rag-evals",
             url=data_url_from_file(eval_dataset_path),
+            rename_columns_map={
+                "query": "input_query",
+            },
         )
     )
     cprint(response, "cyan")
@@ -150,6 +153,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         eval_scoring_config=EvaluateScoringConfig(
             scorer_config_list=[
                 EvaluateSingleScorerConfig(scorer_name="accuracy"),
+                EvaluateSingleScorerConfig(
+                    scorer_name="braintrust::answer-correctness"
+                ),
             ]
         ),
     )
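
To make the rename concrete, an eval dataset for this example could look like the tiny CSV produced below: its query column is what rename_columns_map maps onto input_query, while the answer columns mirror the scorer inputs in the last file. The column names other than query, and the file contents, are assumptions for illustration, not taken from this diff:

# Hypothetical eval file to pass as eval_dataset_path in the example above.
import pandas as pd

rows = [
    {
        "query": "What is the capital of France?",  # renamed to input_query at load time
        "generated_answer": "Paris is the capital of France.",
        "expected_answer": "Paris",
    }
]
pd.DataFrame(rows).to_csv("rag_eval.csv", index=False)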


@@ -70,6 +70,9 @@ class CustomDataset(BaseDataset[DictSample]):
             df = df.sample(n=n_samples)

         self.dataset = Dataset.from_pandas(df)
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)


 class HuggingfaceDataset(BaseDataset[DictSample]):
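
One detail worth noting in the loop above: datasets.Dataset.rename_column does not mutate in place, it returns a new Dataset, which is why self.dataset is reassigned on every iteration. A standalone sketch of the same behavior, outside the CustomDataset class:

from datasets import Dataset

ds = Dataset.from_dict(
    {
        "query": ["What is RAG?"],
        "expected_answer": ["Retrieval-augmented generation."],
    }
)

rename_columns_map = {"query": "input_query"}
for old_name, new_name in rename_columns_map.items():
    # rename_column returns a fresh Dataset; the original object is unchanged.
    ds = ds.rename_column(old_name, new_name)

print(ds.column_names)  # ['input_query', 'expected_answer']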


@@ -14,11 +14,11 @@ from autoevals.ragas import *  # noqa: F403

 class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer

         evaluator = Factuality()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         factuality = result.score
         return SingleEvalResult(score_data={"factuality": factuality})
@@ -37,11 +37,11 @@ class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):

 class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer

         evaluator = AnswerCorrectness()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         correctness = result.score
         return SingleEvalResult(score_data={"answer_correctness": correctness})
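
Beyond the rename from extracted_answer to generated_answer, the substantive fix here is that the old bodies passed output and expected, names that were never bound inside score_sample, so both scorers would have raised NameError when invoked. A standalone sketch of the corrected call pattern follows; the Factuality import path and the sample strings are assumptions, and both scorers call out to an LLM judge, so an API key is required:

from autoevals.llm import Factuality  # assumed location of Factuality
from autoevals.ragas import AnswerCorrectness

input_query = "What is the capital of France?"
generated_answer = "Paris is the capital of France."
expected_answer = "Paris"

# Each scorer is callable and returns a result object exposing a numeric .score,
# matching the result.score accesses in the diff above.
factuality = Factuality()(generated_answer, expected_answer, input=input_query)
correctness = AnswerCorrectness()(generated_answer, expected_answer, input=input_query)

print({"factuality": factuality.score, "answer_correctness": correctness.score})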