diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index ee270b291..c0aa4d161 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -103,6 +103,7 @@ class HuggingfaceDatasetDef(BaseModel):
     )
     rename_columns_map: Optional[Dict[str, str]] = Field(
         description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
     )
     kwargs: Dict[str, Any] = Field(
         description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
@@ -119,6 +120,10 @@ class CustomDatasetDef(BaseModel):
     url: str = Field(
         description="The URL to the dataset",
     )
+    rename_columns_map: Optional[Dict[str, str]] = Field(
+        description="A map of column names to rename to fit the schema of eval dataset for scoring",
+        default=None,
+    )


 DatasetDef = Annotated[
diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index 7d812817b..07877c13e 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -136,6 +136,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         dataset_def=CustomDatasetDef(
             identifier="rag-evals",
             url=data_url_from_file(eval_dataset_path),
+            rename_columns_map={
+                "query": "input_query",
+            },
         )
     )
     cprint(response, "cyan")
@@ -150,6 +153,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         eval_scoring_config=EvaluateScoringConfig(
             scorer_config_list=[
                 EvaluateSingleScorerConfig(scorer_name="accuracy"),
+                EvaluateSingleScorerConfig(
+                    scorer_name="braintrust::answer-correctness"
+                ),
             ]
         ),
     )
diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
index 410ad394a..93cbd9ab2 100644
--- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py
+++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
@@ -70,6 +70,9 @@ class CustomDataset(BaseDataset[DictSample]):
             df = df.sample(n=n_samples)

         self.dataset = Dataset.from_pandas(df)
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)


 class HuggingfaceDataset(BaseDataset[DictSample]):
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
index 5dd4eb383..c124aaad6 100644
--- a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
@@ -14,11 +14,11 @@ from autoevals.ragas import *  # noqa: F403

 class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer

         evaluator = Factuality()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         factuality = result.score
         return SingleEvalResult(score_data={"factuality": factuality})
@@ -37,11 +37,11 @@ class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):

 class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         input_query = scorer_input_sample.input_query
-        extracted_answer = scorer_input_sample.generated_answer
+        generated_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer

         evaluator = AnswerCorrectness()
-        result = evaluator(output, expected, input=input_query)
+        result = evaluator(generated_answer, expected_answer, input=input_query)
         correctness = result.score
         return SingleEvalResult(score_data={"answer_correctness": correctness})
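For context, a minimal standalone sketch of what the new rename_columns_map option does when CustomDataset loads data. It uses the Hugging Face datasets library directly; the sample rows and the "query" -> "input_query" mapping are illustrative only, not part of the patch:

# Illustrative sketch: mirrors the renaming loop added to CustomDataset in
# dataset_wrappers.py. Sample data and mapping are hypothetical.
from datasets import Dataset

rename_columns_map = {"query": "input_query"}  # same shape as the new config field

ds = Dataset.from_dict(
    {
        "query": ["What is the capital of France?"],
        "expected_answer": ["Paris"],
    }
)

# Rename each mapped column so the dataset matches the schema the eval scorers expect.
for old_name, new_name in rename_columns_map.items():
    ds = ds.rename_column(old_name, new_name)

print(ds.column_names)  # ['input_query', 'expected_answer']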