diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index 0f4354c3f..2b54ac8f6 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -92,8 +92,14 @@ class HuggingfaceDatasetDef(BaseModel):
     identifier: str = Field(
         description="A unique name for the dataset",
     )
-    dataset_name: str = Field(
-        description="The name of the dataset into HF (e.g. hellawag)",
-    )
+    dataset_path: str = Field(
+        description="The path to the dataset on the Hugging Face Hub (e.g. meta-llama/Llama-3.1-8B-Instruct-evals)",
+    )
+    dataset_name: Optional[str] = Field(
+        description="The name of the dataset config within the Hugging Face dataset (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)",
+    )
+    rename_columns_map: Optional[Dict[str, str]] = Field(
+        description="A map from original column names to the column names expected by the eval dataset schema used for scoring",
+    )
     kwargs: Dict[str, Any] = Field(
         description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index d61de8c39..e7c5a475d 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -51,34 +51,84 @@ class EvaluationClient(Evals):
             response.raise_for_status()
             return EvaluateResponse(**response.json())
 
+    async def run_scorer(
+        self,
+        dataset_config: EvaluateDatasetConfig,
+        eval_scoring_config: EvaluateScoringConfig,
+    ) -> EvaluateResponse:
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                f"{self.base_url}/evals/run_scorer",
+                json={
+                    "dataset_config": json.loads(dataset_config.json()),
+                    "eval_scoring_config": json.loads(eval_scoring_config.json()),
+                },
+                headers={"Content-Type": "application/json"},
+                timeout=3600,
+            )
+            response.raise_for_status()
+            return EvaluateResponse(**response.json())
+
 
 async def run_main(host: str, port: int):
     client = EvaluationClient(f"http://{host}:{port}")
 
     dataset_client = DatasetsClient(f"http://{host}:{port}")
 
-    # Custom Eval Task
+    # Full Eval Task
 
-    # 1. register custom dataset
+    # # 1. register custom dataset
+    # response = await dataset_client.create_dataset(
+    #     dataset_def=CustomDatasetDef(
+    #         identifier="mmlu-simple-eval-en",
+    #         url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+    #     ),
+    # )
+    # cprint(f"datasets/create: {response}", "cyan")
+
+    # # 2. run evals on the registered dataset
+    # response = await client.run_evals(
+    #     model="Llama3.1-8B-Instruct",
+    #     dataset="mmlu-simple-eval-en",
+    #     task="mmlu",
+    # )
+
+    # if response.formatted_report:
+    #     cprint(response.formatted_report, "green")
+    # else:
+    #     cprint(f"Response: {response}", "green")
+
+    # Scoring Task
+
+    # 1. register huggingface dataset
     response = await dataset_client.create_dataset(
-        dataset_def=CustomDatasetDef(
-            identifier="mmlu-simple-eval-en",
-            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
-        ),
+        dataset_def=HuggingfaceDatasetDef(
+            identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+            dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            rename_columns_map={
+                "output_parsed_answer": "generated_answer",
+                "input_correct_responses": "expected_answer",
+            },
+            kwargs={"split": "latest"},
+        )
     )
-    cprint(f"datasets/create: {response}", "cyan")
+    cprint(response, "cyan")
 
     # 2. run evals on the registered dataset
-    response = await client.run_evals(
-        model="Llama3.1-8B-Instruct",
-        dataset="mmlu-simple-eval-en",
-        task="mmlu",
+    response = await client.run_scorer(
+        dataset_config=EvaluateDatasetConfig(
+            dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            row_limit=10,
+        ),
+        eval_scoring_config=EvaluateScoringConfig(
+            scorer_config_list=[
+                EvaluateSingleScorerConfig(scorer_name="accuracy"),
+            ]
+        ),
     )
 
-    if response.formatted_report:
-        cprint(response.formatted_report, "green")
-    else:
-        cprint(f"Response: {response}", "green")
+    cprint(response, "green")
 
     # Eleuther Eval Task
     # response = await client.run_evals(
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index ea985ad3b..6a3ed8ce2 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -66,7 +66,7 @@ class EvaluationJobCreateResponse(BaseModel):
 @json_schema_type
 class EvaluateDatasetConfig(BaseModel):
     # identifier to previously registered dataset via DatasetDef
-    dataset_name: str
+    dataset_identifier: str
     # limit number of rows to evaluate
     row_limit: Optional[int] = None
     kwargs: Optional[Dict[str, Any]] = None
diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
index e18165a11..88a487d60 100644
--- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py
+++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
@@ -72,7 +72,18 @@ class HuggingfaceDataset(BaseDataset[DictSample]):
             self.load()
         return len(self.dataset)
 
-    def load(self):
+    def load(self, n_samples: Optional[int] = None):
         if self.dataset:
             return
-        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
+
+        if self.config.dataset_name:
+            self.config.kwargs["name"] = self.config.dataset_name
+
+        self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs)
+
+        if n_samples:
+            self.dataset = self.dataset.select(range(n_samples))
+
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index 80bf2dd7a..916e40e3a 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -13,6 +13,7 @@ from llama_stack.apis.datasets import *  # noqa: F403
 
 from .config import MetaReferenceEvalsImplConfig
 from .tasks.run_eval_task import RunEvalTask
+from .tasks.run_scoring_task import RunScoringTask
 
 
 class MetaReferenceEvalsImpl(Evals):
@@ -44,7 +45,7 @@
         # construct eval task config from inputs
         eval_task_config = EvaluateTaskConfig(
             dataset_config=EvaluateDatasetConfig(
-                dataset_name=dataset,
+                dataset_identifier=dataset,
                 row_limit=3,
             ),
             processor_config=EvaluateProcessorConfig(
@@ -76,8 +77,10 @@
     ) -> EvaluateResponse:
         cprint("run_scorer")
 
-        # main logic, we need to convert the datset into List[ScorerInputSample]
+        run_task = RunScoringTask()
+        eval_result = await run_task.run(dataset_config, eval_scoring_config)
 
         return EvaluateResponse(
-            eval_result={},
+            eval_result=eval_result,
+            formatted_report=json.dumps(json.loads(eval_result.json()), indent=4),
         )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
index 6099353a8..748f9fc1f 100644
--- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
@@ -31,9 +31,14 @@ class AccuracyScorer(BaseScorer[ScorerInputSample]):
         extracted_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer
 
-        accuracy = (
-            1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
-        )
+        if isinstance(expected_answer, list):
+            accuracy = (
+                1.0 if extracted_answer and extracted_answer in expected_answer else 0.0
+            )
+        else:
+            accuracy = (
+                1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
+            )
 
         return SingleEvalResult(score_data={"accuracy": accuracy})
 
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
index 83f6264c0..bcd842c42 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
@@ -43,7 +43,9 @@
         print(f"Running eval task w/ {eval_task_config}")
 
         print(DatasetRegistry.names())
-        dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name)
+        dataset = DatasetRegistry.get(
+            eval_task_config.dataset_config.dataset_identifier
+        )
         dataset.load(n_samples=eval_task_config.dataset_config.row_limit)
         print(f"Running on {len(dataset)} samples")
 
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
new file mode 100644
index 000000000..f856debe9
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
+
+from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.apis.inference import *  # noqa: F403
+from termcolor import cprint
+
+
+class RunScoringTask(BaseTask):
+    """
+    RunScoringTask - only run scoring (F3) based on dataset and scoring config
+    """
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+    def transform_score_input_sample(
+        self, dataset: BaseDataset
+    ) -> List[ScorerInputSample]:
+        scorer_inputs = []
+        for x in dataset:
+            expected_answer = x.data["expected_answer"]
+            generated_answer = x.data["generated_answer"]
+
+            scorer_inputs.append(
+                ScorerInputSample(
+                    expected_answer=expected_answer,
+                    generated_answer=generated_answer,
+                )
+            )
+
+        return scorer_inputs
+
+    async def run(
+        self,
+        dataset_config: EvaluateDatasetConfig,
+        eval_scoring_config: EvaluateScoringConfig,
+        *args,
+        **kwargs,
+    ) -> EvalResult:
+        print(
+            f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}"
+        )
+
+        dataset = DatasetRegistry.get(dataset_config.dataset_identifier)
+        dataset.load(n_samples=dataset_config.row_limit)
+        print(f"Running on {len(dataset)} samples")
+
+        # transform dataset rows into List[ScorerInputSample] for scoring
+        postprocessed = self.transform_score_input_sample(dataset)
+        cprint(postprocessed, "blue")
+
+        # F3 - scorer
+        scorer_config_list = eval_scoring_config.scorer_config_list
+        scorer_list = []
+        for s_conf in scorer_config_list:
+            scorer = ScorerRegistry.get(s_conf.scorer_name)
+            scorer_list.append(scorer())
+
+        scorer = AggregateScorer(
+            scorers=scorer_list,
+        )
+
+        scorer_results = scorer.score(postprocessed)
+        cprint(scorer_results, "magenta")
+        eval_result = scorer.aggregate_results(scorer_results)
+
+        return eval_result
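For reviewers, the snippet below is a rough sketch (not part of the patch) of how the F3 scoring pieces touched above fit together once a dataset's columns have been renamed into the eval schema via rename_columns_map. Class names and constructor signatures are taken from this diff; the explicit import paths and the assumption that ScorerInputSample is exported from llama_stack.apis.evals are mine, not confirmed by the patch.

```python
# Illustrative only -- not part of the patch. Import paths mirror the wildcard
# imports used in run_scoring_task.py above; ScorerInputSample's module is assumed.
from llama_stack.apis.evals import ScorerInputSample
from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import (
    AggregateScorer,
)
from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import (
    AccuracyScorer,
)

# Rows as they would look after rename_columns_map has mapped
# output_parsed_answer -> generated_answer and
# input_correct_responses -> expected_answer.
samples = [
    ScorerInputSample(generated_answer="A", expected_answer=["A", "B"]),  # list match
    ScorerInputSample(generated_answer="C", expected_answer="B"),  # scalar mismatch
]

# Same wiring as RunScoringTask.run: wrap the per-sample scorers in AggregateScorer,
# score each ScorerInputSample, then aggregate into an EvalResult.
scorer = AggregateScorer(scorers=[AccuracyScorer()])
scorer_results = scorer.score(samples)
eval_result = scorer.aggregate_results(scorer_results)
print(eval_result.json())
```

With the new isinstance(expected_answer, list) branch in AccuracyScorer, the first sample scores 1.0 because "A" is contained in the expected list, while the second scores 0.0, so the aggregated accuracy for this toy input would be 0.5.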