Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-09 11:20:58 +00:00
generator + scorer API for MMLU

This commit is contained in:
  parent fb565dfb06
  commit a25aff290e

14 changed files with 618 additions and 131 deletions
@@ -3,16 +3,27 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json

from termcolor import cprint

from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import (
    AggregateScorer,
)

from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.apis.dataset import *  # noqa: F403

from termcolor import cprint

from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry
from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import (
    MMLUProcessor,
)

# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry
# from .tasks.run_eval_task import RunEvalTask
from .scorer.basic_scorers import *  # noqa: F403

from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry

from .config import MetaReferenceEvalsImplConfig
@@ -27,7 +38,7 @@ class MetaReferenceEvalsImpl(Evals):
    async def shutdown(self) -> None:
        pass

    async def run_evals(
    async def run_eval_task(
        self,
        model: str,
        task: str,
@@ -38,43 +49,142 @@
            f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
            "red",
        )

        if not dataset:
            raise ValueError("dataset must be specified for meta-reference evals")

        dataset = DatasetRegistry.get_dataset(dataset)
        dataset.load()
        if not eval_task_config:
            # construct eval task config from inputs
            eval_task_config = EvaluateTaskConfig(
                dataset_config=EvaluateDatasetConfig(
                    dataset_name=dataset,
                    row_limit=2,
                ),
                generation_config=EvaluateModelGenerationConfig(
                    model=model,
                ),
                scoring_config=EvaluateScoringConfig(
                    scorer_config_list=[
                        EvaluateSingleScorerConfig(scorer_name="accuracy"),
                    ]
                ),
            )

        task_impl = TaskRegistry.get_task(task)()
        preprocessed = task_impl.preprocess(dataset)
        # TODO: wrap inside task
        # run_task = RunEvalTask(
        #     eval_task_config=eval_task_config,
        # )
        # eval_result = run_task.run()

        # TODO: replace w/ batch inference & async return eval job
        generation_outputs = []
        if eval_task_config is None:
            eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed))
        if eval_task_config.n_samples is None or eval_task_config.n_samples > len(
            preprocessed
        ):
            eval_task_config.n_samples = len(preprocessed)

        print(
            f"Eval generation start, generate on {eval_task_config.n_samples} samples"
        dataset = DatasetRegistry.get_dataset(
            eval_task_config.dataset_config.dataset_name
        )
        dataset.load(n_samples=eval_task_config.dataset_config.row_limit)
        print(f"Running on {len(dataset)} samples")

        for sample in preprocessed[: eval_task_config.n_samples]:
        # F1
        processor = MMLUProcessor()
        preprocessed = processor.preprocess(dataset)

        # Generation
        # TODO: wrap inside BaseGenerator
        generation_outputs = []
        for sample in preprocessed:
            print("generation: ", sample)
            response = await self.inference_api.chat_completion(
                model=model,
                messages=sample.preprocessed["messages"],
                messages=sample.generation_input.messages,
                stream=False,
            )
            sample.prediction = PredictionSample(
                completion_message=response.completion_message.content
            )
            generation_outputs.append(sample)
            cprint(f"response: {response}", "cyan")

        postprocessed = task_impl.postprocess(generation_outputs)
        eval_results = task_impl.score(postprocessed)
        aggr_result = task_impl.aggregate_results(eval_results)
        return EvaluateResponse(
            eval_result=aggr_result,
            generation_outputs.append(
                GenerationResponseSample(
                    generation_output=GenerationOutput(
                        completion_message=response.completion_message.content
                    )
                )
            )
        cprint(generation_outputs, "green")

        # F2
        postprocessed = processor.postprocess(generation_outputs, dataset)
        cprint(postprocessed, "blue")

        # F3 - scorer
        scorer = AggregateScorer(
            scorers=[
                AccuracyScorer(),
                RandomScorer(),
            ]
        )

        scorer_results = scorer.score(postprocessed)
        cprint(scorer_results, "magenta")
        eval_result = scorer.aggregate_results(scorer_results)

        return EvaluateResponse(
            eval_result=eval_result,
            formatted_report=json.dumps(eval_result.json(), indent=4),
        )

    async def run_scorer(
        self,
        dataset_config: EvaluateDatasetConfig,
        eval_scoring_config: EvaluateScoringConfig,
    ) -> EvaluateResponse:
        return EvaluateResponse(
            eval_result={},
        )

    # async def run_evals(
    #     self,
    #     model: str,
    #     task: str,
    #     dataset: Optional[str] = None,
    #     eval_task_config: Optional[EvaluateTaskConfig] = None,
    # ) -> EvaluateResponse:
    #     cprint(
    #         f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
    #         "red",
    #     )
    #     if not dataset:
    #         raise ValueError("dataset must be specified for meta-reference evals")

    #     dataset = DatasetRegistry.get_dataset(dataset)
    #     dataset.load()

    #     task_impl = TaskRegistry.get_task(task)()
    #     preprocessed = task_impl.preprocess(dataset)

    #     # TODO: replace w/ batch inference & async return eval job
    #     generation_outputs = []
    #     if eval_task_config is None:
    #         eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed))
    #     if eval_task_config.n_samples is None or eval_task_config.n_samples > len(
    #         preprocessed
    #     ):
    #         eval_task_config.n_samples = len(preprocessed)

    #     print(
    #         f"Eval generation start, generate on {eval_task_config.n_samples} samples"
    #     )

    #     for sample in preprocessed[: eval_task_config.n_samples]:
    #         print("generation: ", sample)
    #         response = await self.inference_api.chat_completion(
    #             model=model,
    #             messages=sample.preprocessed["messages"],
    #             stream=False,
    #         )
    #         sample.prediction = PredictionSample(
    #             completion_message=response.completion_message.content
    #         )
    #         generation_outputs.append(sample)

    #     postprocessed = task_impl.postprocess(generation_outputs)
    #     eval_results = task_impl.score(postprocessed)
    #     aggr_result = task_impl.aggregate_results(eval_results)
    #     return EvaluateResponse(
    #         eval_result=aggr_result,
    #     )
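For orientation, the new run_eval_task() body above wires four stages together: F1 preprocess (MMLUProcessor turns dataset rows into chat messages), generation (one inference_api.chat_completion call per sample), F2 postprocess (extract the answer letter), and F3 scoring (AggregateScorer over AccuracyScorer and RandomScorer). Below is a minimal, self-contained sketch of that flow using plain dicts and a stubbed completion function; every name in it is illustrative and not part of the llama_stack API.

# Minimal sketch of the preprocess -> generate -> postprocess -> score flow.
# All types and fake_chat_completion() below are stand-ins, not llama_stack APIs.
import re
from typing import Dict, List

QUERY_TEMPLATE = "Answer with 'Answer: $LETTER'.\n\n{Question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"


def preprocess(rows: List[Dict]) -> List[Dict]:
    # F1: turn raw dataset rows into chat messages
    return [
        {"messages": [{"role": "user", "content": QUERY_TEMPLATE.format(**row)}], "row": row}
        for row in rows
    ]


def fake_chat_completion(messages: List[Dict]) -> str:
    # Stand-in for inference_api.chat_completion(); always "answers" B
    return "Answer: B"


def postprocess(samples: List[Dict]) -> List[Dict]:
    # F2: extract the answer letter from each completion
    out = []
    for s in samples:
        match = re.search(r"Answer\s*:\s*([ABCD])", s["completion"])
        out.append(
            {
                "transformed_generation": match.group(1) if match else "",
                "expected_output": s["row"]["Answer"],
            }
        )
    return out


def score(samples: List[Dict]) -> Dict[str, float]:
    # F3: aggregate a simple accuracy metric
    correct = sum(1.0 for s in samples if s["transformed_generation"] == s["expected_output"])
    return {"avg_accuracy": correct / len(samples)}


rows = [
    {"Question": "2 + 2 = ?", "A": "3", "B": "4", "C": "5", "D": "6", "Answer": "B"},
    {"Question": "Capital of France?", "A": "Rome", "B": "Berlin", "C": "Paris", "D": "Madrid", "Answer": "C"},
]
generated = [
    {**s, "completion": fake_chat_completion(s["messages"])} for s in preprocess(rows)
]
print(score(postprocess(generated)))  # {'avg_accuracy': 0.5}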
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -7,8 +7,6 @@ import re

from llama_stack.apis.evals import *  # noqa: F403

# from llama_stack.distribution.registry.tasks.task import BaseTask

QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
@@ -112,60 +110,78 @@ def normalize_extracted_answer(extracted_answer: str) -> str:
    )


class MMLUTask(BaseTask[DictSample, ProcessedDictSample]):
class MMLUProcessor(
    BaseGeneratorProcessor[
        DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample
    ]
):
    """
    MMLU Task.
    Generator processor for MMLU
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample:
    def preprocess_sample(self, sample: DictSample) -> PreprocessedSample:
        content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data)
        preprocessed = {
            "messages": [
                {
                    "role": "user",
                    "content": content,
                }
            ],
        }
        processed_sample = ProcessedDictSample(
            data=sample.data,
            preprocessed=preprocessed,
        preprocessed_msgs = [
            {
                "role": "user",
                "content": content,
            }
        ]
        processed_sample = PreprocessedSample(
            generation_input=GenerationInput(
                messages=preprocessed_msgs,
            )
        )
        return processed_sample

    def postprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample:
        if not sample.postprocessed:
            sample.postprocessed = {}
        sample.postprocessed["postprocessed"] = normalize_response(
            sample.prediction.completion_message
        )
        return sample
    def postprocess_sample(
        self, generation_sample: GenerationResponseSample, dataset_sample: DictSample
    ) -> ScorerInputSample:
        response_text = generation_sample.generation_output.completion_message
        normalized_response = normalize_response(response_text)

    def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult:
        postprocessed_output = sample.postprocessed["postprocessed"]
        expected_answer = sample.data["Answer"]

        extracted_answer = None
        # extract answer
        extracted_answer = ""
        for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
            regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
            match = re.search(regex, postprocessed_output)
            match = re.search(regex, normalized_response)
            if match:
                extracted_answer = normalize_extracted_answer(match.group(1))
                break

        score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0

        return SingleEvalResult(
            score_data={
                "score": score,
            },
        return ScorerInputSample(
            generation_output=PostprocessedGeneration(
                completion_message=response_text,
                transformed_generation=extracted_answer,
            ),
            expected_output=dataset_sample.data["Answer"],
        )

    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
        print("aggregate_results", eval_results)
        sum_score = sum([result.score_data["score"] for result in eval_results])
    # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult:
    #     postprocessed_output = sample.postprocessed["postprocessed"]
    #     expected_answer = sample.data["Answer"]

        return EvalResult(metrics={"score": str(sum_score / len(eval_results))})
    #     extracted_answer = None
    #     for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
    #         regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
    #         match = re.search(regex, postprocessed_output)
    #         if match:
    #             extracted_answer = normalize_extracted_answer(match.group(1))
    #             break

    #     score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0

    #     return SingleEvalResult(
    #         score_data={
    #             "score": score,
    #         },
    #     )

    # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
    #     print("aggregate_results", eval_results)
    #     sum_score = sum([result.score_data["score"] for result in eval_results])

    #     return EvalResult(metrics={"score": str(sum_score / len(eval_results))})
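To make the processor contract concrete, here is a worked example of the two transformations above applied to a single row; the prompt template and answer regex are simplified stand-ins for QUERY_TEMPLATE_MULTICHOICE and MULTILINGUAL_ANSWER_REGEXES, and plain dicts stand in for the DictSample / PreprocessedSample / ScorerInputSample types.

# Worked example of what preprocess_sample / postprocess_sample do to one MMLU row.
# Template, regex, and dict shapes are illustrative stand-ins, not the repo's classes.
import re

row = {
    "Question": "Which planet is known as the Red Planet?",
    "A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Saturn",
    "Answer": "B",
}

# preprocess_sample: format the row into a single user message
prompt = (
    "Answer the following multiple choice question. The last line of your response "
    "should be 'Answer: $LETTER' where LETTER is one of ABCD.\n\n"
    f"{row['Question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
)
generation_input = {"messages": [{"role": "user", "content": prompt}]}

# postprocess_sample: normalize the completion and pull out the answer letter
completion = "The Red Planet is Mars.\nAnswer: B"
match = re.search(r"(?i)Answer\s*:\s*([ABCD])", completion)
scorer_input = {
    "generation_output": {
        "completion_message": completion,
        "transformed_generation": match.group(1).upper() if match else "",
    },
    "expected_output": row["Answer"],
}
print(scorer_input["generation_output"]["transformed_generation"])  # B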
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import random

from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
from llama_stack.apis.dataset.dataset import *  # noqa: F401 F403


class AggregateScorer(BaseScorer[ScorerInputSample]):
    def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]):
        self.scorers = scorers

    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
        all_score_data = {}
        for scorer in self.scorers:
            score_data = scorer.score_sample(scorer_input_sample).score_data
            for k, v in score_data.items():
                all_score_data[k] = v

        return SingleEvalResult(
            score_data=all_score_data,
        )

    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
        all_metrics = {}

        for scorer in self.scorers:
            metrics = scorer.aggregate_results(eval_results).metrics
            for k, v in metrics.items():
                all_metrics[f"{scorer.__class__.__name__}:{k}"] = v

        return EvalResult(
            metrics=all_metrics,
        )


class RandomScorer(BaseScorer[ScorerInputSample]):
    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
        return SingleEvalResult(score_data={"random": random.random()})

    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
        avg_random = sum(
            [result.score_data["random"] for result in eval_results]
        ) / len(eval_results)
        max_random = max([result.score_data["random"] for result in eval_results])
        return EvalResult(
            metrics={
                "avg_random": avg_random,
                "max_random": max_random,
            }
        )


class AccuracyScorer(BaseScorer[ScorerInputSample]):
    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
        extracted_answer = scorer_input_sample.generation_output.transformed_generation
        expected_answer = scorer_input_sample.expected_output

        accuracy = (
            1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
        )

        return SingleEvalResult(score_data={"accuracy": accuracy})

    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
        num_correct = sum([result.score_data["accuracy"] for result in eval_results])
        num_total = len(eval_results)

        return EvalResult(
            metrics={
                "avg_accuracy": num_correct / num_total,
                "num_correct": num_correct,
                "num_total": num_total,
            }
        )
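A usage sketch for the scorers above. It assumes the repo at this commit is importable and that ScorerInputSample / PostprocessedGeneration accept the keyword arguments implied by the MMLUProcessor diff; those import paths and constructor calls are assumptions, not verified against the API definitions.

# Hedged usage sketch; import paths for ScorerInputSample / PostprocessedGeneration are
# assumed from the star import in basic_scorers.py, not confirmed.
from llama_stack.apis.dataset.dataset import PostprocessedGeneration, ScorerInputSample
from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import (
    AccuracyScorer,
    AggregateScorer,
    RandomScorer,
)

samples = [
    ScorerInputSample(
        generation_output=PostprocessedGeneration(
            completion_message="Answer: B", transformed_generation="B"
        ),
        expected_output="B",
    ),
    ScorerInputSample(
        generation_output=PostprocessedGeneration(
            completion_message="Answer: A", transformed_generation="A"
        ),
        expected_output="C",
    ),
]

scorer = AggregateScorer(scorers=[AccuracyScorer(), RandomScorer()])
per_sample = [scorer.score_sample(s) for s in samples]
result = scorer.aggregate_results(per_sample)
# Metrics are namespaced by scorer class name, e.g. "AccuracyScorer:avg_accuracy",
# "AccuracyScorer:num_correct", "RandomScorer:avg_random", "RandomScorer:max_random".
print(result.metrics)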
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry

from llama_stack.apis.evals import *  # noqa: F403


class RunEvalTask(BaseTask):
    """
    RunEvalTask for LlamaStack
    """

    def __init__(
        self,
        eval_task_config,
        generator_processor: Optional[BaseGeneratorProcessor] = None,
        generator: Optional[BaseGenerator] = None,
        scorer: Optional[BaseScorer] = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(
            generator_processor=generator_processor,
            generator=generator,
            scorer=scorer,
            *args,
            **kwargs,
        )
        self.eval_task_config = eval_task_config
        self.dataset = DatasetRegistry.get_dataset(
            eval_task_config.dataset_config.dataset_name
        )

    def run(self, *args, **kwargs) -> EvalResult:
        print(f"Running eval task on {self.dataset}")
        return EvalResult()
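run() is still a stub, but the commented-out call site in run_eval_task() above ("run_task = RunEvalTask(eval_task_config=...)") shows the intended usage. A hedged sketch, assuming the EvaluateTaskConfig family is importable by name from llama_stack.apis.evals and that a dataset named "mmlu" is registered; the dataset name and model id below are placeholders, not values taken from this diff.

# Hedged usage sketch for RunEvalTask; the import of RunEvalTask itself is omitted
# because the new file's path is not shown in this view.
from llama_stack.apis.evals import (
    EvaluateDatasetConfig,
    EvaluateModelGenerationConfig,
    EvaluateScoringConfig,
    EvaluateSingleScorerConfig,
    EvaluateTaskConfig,
)

config = EvaluateTaskConfig(
    dataset_config=EvaluateDatasetConfig(dataset_name="mmlu", row_limit=2),  # placeholder dataset name
    generation_config=EvaluateModelGenerationConfig(model="Llama3.1-8B-Instruct"),  # placeholder model id
    scoring_config=EvaluateScoringConfig(
        scorer_config_list=[EvaluateSingleScorerConfig(scorer_name="accuracy")]
    ),
)

task = RunEvalTask(eval_task_config=config)
result = task.run()  # currently just prints the dataset and returns an empty EvalResult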