Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-29 07:14:20 +00:00)
rag correctness scorer w/ custom dataset

parent ec6c63ba57
commit 9cc0a54f0b

4 changed files with 18 additions and 4 deletions
@@ -103,6 +103,7 @@ class HuggingfaceDatasetDef(BaseModel):
     )
     rename_columns_map: Optional[Dict[str, str]] = Field(
         description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
     )
     kwargs: Dict[str, Any] = Field(
         description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
@@ -119,6 +120,10 @@ class CustomDatasetDef(BaseModel):
     url: str = Field(
         description="The URL to the dataset",
     )
+    rename_columns_map: Optional[Dict[str, str]] = Field(
+        description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
+    )
 
 
 DatasetDef = Annotated[
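The practical effect of the two hunks above is that `rename_columns_map` becomes an optional field on both dataset definitions. A minimal standalone sketch, assuming a reduced version of `CustomDatasetDef` (the real model in llama-stack carries more fields than shown here):

```python
from typing import Dict, Optional

from pydantic import BaseModel, Field


class CustomDatasetDef(BaseModel):
    # Reduced sketch; the real model in llama-stack has additional fields.
    identifier: str
    url: str = Field(description="The URL to the dataset")
    rename_columns_map: Optional[Dict[str, str]] = Field(
        description="A map of column names to rename to fit the schema of eval dataset for scoring",
        default=None,
    )


# Because default=None, callers that never pass the map keep working unchanged:
plain = CustomDatasetDef(identifier="rag-evals", url="https://example.com/evals.csv")
mapped = CustomDatasetDef(
    identifier="rag-evals",
    url="https://example.com/evals.csv",
    rename_columns_map={"query": "input_query"},
)
print(plain.rename_columns_map)   # None
print(mapped.rename_columns_map)  # {'query': 'input_query'}
```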
@@ -136,6 +136,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         dataset_def=CustomDatasetDef(
             identifier="rag-evals",
             url=data_url_from_file(eval_dataset_path),
+            rename_columns_map={
+                "query": "input_query",
+            },
         )
     )
     cprint(response, "cyan")
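The `rename_columns_map={"query": "input_query"}` argument implies the custom eval file stores the question under a `query` column while the scoring schema expects `input_query`. A hypothetical example of such a file; only the `query` -> `input_query` rename comes from the commit, the other columns are guessed from the scorer input fields seen further down:

```python
import pandas as pd

# Hypothetical eval dataset written to the path passed as eval_dataset_path.
# The exact column layout is an assumption made for illustration.
df = pd.DataFrame(
    {
        "query": ["What is the capital of France?"],
        "generated_answer": ["The capital of France is Paris."],
        "expected_answer": ["Paris"],
    }
)
df.to_csv("rag_evals.csv", index=False)
```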
@@ -150,6 +153,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         eval_scoring_config=EvaluateScoringConfig(
             scorer_config_list=[
                 EvaluateSingleScorerConfig(scorer_name="accuracy"),
+                EvaluateSingleScorerConfig(
+                    scorer_name="braintrust::answer-correctness"
+                ),
             ]
         ),
     )
@@ -70,6 +70,9 @@ class CustomDataset(BaseDataset[DictSample]):
             df = df.sample(n=n_samples)
 
         self.dataset = Dataset.from_pandas(df)
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)
 
 
 class HuggingfaceDataset(BaseDataset[DictSample]):
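The new rename loop leans on `datasets.Dataset.rename_column`, which returns a new `Dataset` rather than renaming in place, hence the reassignment. A minimal standalone sketch (the sample data is illustrative):

```python
import pandas as pd
from datasets import Dataset

df = pd.DataFrame({"query": ["q1", "q2"], "expected_answer": ["a1", "a2"]})
dataset = Dataset.from_pandas(df)

rename_columns_map = {"query": "input_query"}
for k, v in rename_columns_map.items():
    # rename_column returns a new Dataset, so reassign the result.
    dataset = dataset.rename_column(k, v)

print(dataset.column_names)  # ['input_query', 'expected_answer']
```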
@@ -14,11 +14,11 @@ from autoevals.ragas import *  # noqa: F403
 class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer
 
         evaluator = Factuality()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         factuality = result.score
         return SingleEvalResult(score_data={"factuality": factuality})
@@ -37,11 +37,11 @@ class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
 class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer
 
         evaluator = AnswerCorrectness()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         correctness = result.score
         return SingleEvalResult(score_data={"answer_correctness": correctness})
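The two scorer hunks fix the same bug twice: `output` and `expected` were never defined inside `score_sample`, so the evaluator calls would raise `NameError`; the fix passes the locals read from the sample instead. For reference, a minimal direct call into autoevals; the inputs are made up, and both `Factuality` and `AnswerCorrectness` call out to an LLM, so API credentials for autoevals' default client are assumed to be configured:

```python
from autoevals import Factuality

input_query = "What is the capital of France?"
generated_answer = "The capital of France is Paris."
expected_answer = "Paris"

# autoevals scorers are callables: positional output and expected, extra kwargs
# (like input=) are forwarded to the underlying prompt.
evaluator = Factuality()
result = evaluator(generated_answer, expected_answer, input=input_query)
print(result.score)  # float between 0 and 1
```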