diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index d4d2acad3..a52615ed8 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -35,8 +35,7 @@ class MetaReferenceEvalsImpl(Evals):
         task: str,
     ) -> EvaluateResponse:
         cprint(f"model={model}, dataset={dataset}, task={task}", "red")
-        dataset = get_dataset("mmlu-simple-eval-en")
-
+        dataset = get_dataset(dataset)
         task_impl = get_task(task, dataset)
 
         x1 = task_impl.preprocess()
@@ -52,11 +51,6 @@ class MetaReferenceEvalsImpl(Evals):
             generation_outputs.append(x.completion_message.content)
 
         x2 = task_impl.postprocess(generation_outputs)
-        scores = task_impl.score(x2)
-        print(scores)
-
-        return EvaluateResponse(
-            metrics={
-                "accuracy": 0.5,
-            }
-        )
+        eval_results = task_impl.score(x2)
+        eval_response = task_impl.aggregate_results(eval_results)
+        return eval_response
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
index 6a4b5bc0f..4f2939db1 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
@@ -6,6 +6,7 @@
 import re
 
 from .task import BaseTask
+from llama_stack.apis.evals import *  # noqa: F403
 
 QUERY_TEMPLATE_MULTICHOICE = """
 Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
@@ -119,9 +120,6 @@ class MMLUTask(BaseTask):
         super().__init__(dataset, *args, **kwargs)
 
     def preprocess_sample(self, sample):
-        """
-        F1: preprocess sample
-        """
         content = QUERY_TEMPLATE_MULTICHOICE.format(**sample)
         return {
             "role": "user",
@@ -129,16 +127,10 @@ class MMLUTask(BaseTask):
         }
 
     def postprocess_sample(self, sample):
-        """
-        F2: postprocess sample
-        """
         normalized = normalize_response(sample)
         return normalized
 
     def score_sample(self, sample, expected):
-        """
-        F3: score sample
-        """
         extracted_answer = None
         for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
             regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
@@ -149,4 +141,10 @@
         score = (
             1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0
         )
+        # TODO: generalize this into SingleEvalResult
        return score
+
+    def aggregate_results(self, eval_results):
+        return EvaluateResponse(
+            metrics={"score": sum(eval_results) / len(eval_results)}
+        )
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/task.py
index 37114b70f..a5461165b 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/task.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/task.py
@@ -21,23 +21,18 @@ class BaseTask(ABC):
 
     @abstractmethod
     def preprocess_sample(self, sample):
-        """
-        F1: preprocess sample
-        """
         raise NotImplementedError()
 
     @abstractmethod
     def postprocess_sample(self, sample):
-        """
-        F2: postprocess sample
-        """
         raise NotImplementedError()
 
     @abstractmethod
     def score_sample(self, sample, ground_truth):
-        """
-        F3: score sample
-        """
         raise NotImplementedError()
+
+    @abstractmethod
+    def aggregate_results(self, eval_results):
+        raise NotImplementedError()
 
     def preprocess(self):
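
For reviewers, a minimal sketch of the contract this diff introduces: `score` now yields per-sample results and each task's `aggregate_results` folds them into an `EvaluateResponse`, replacing the previous hard-coded `accuracy: 0.5` stub. The sketch below uses stand-in types rather than the real `llama_stack.apis.evals` classes, and `ExactMatchTask` is a hypothetical example task, not part of this PR.

```python
# Illustrative sketch only: stand-in EvaluateResponse/BaseTask instead of the
# real llama_stack classes, and a hypothetical ExactMatchTask subclass.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class EvaluateResponse:  # stand-in for llama_stack.apis.evals.EvaluateResponse
    metrics: Dict[str, float] = field(default_factory=dict)


class BaseTask(ABC):
    """Per-sample hooks plus the new aggregate_results contract."""

    @abstractmethod
    def preprocess_sample(self, sample):
        raise NotImplementedError()

    @abstractmethod
    def postprocess_sample(self, sample):
        raise NotImplementedError()

    @abstractmethod
    def score_sample(self, sample, expected):
        raise NotImplementedError()

    @abstractmethod
    def aggregate_results(self, eval_results) -> EvaluateResponse:
        raise NotImplementedError()


class ExactMatchTask(BaseTask):
    """Hypothetical task: score 1.0 when the generation matches the expected answer."""

    def preprocess_sample(self, sample):
        return {"role": "user", "content": sample["question"]}

    def postprocess_sample(self, sample):
        return sample.strip().lower()

    def score_sample(self, sample, expected):
        return 1.0 if sample == expected["Answer"].strip().lower() else 0.0

    def aggregate_results(self, eval_results: List[float]) -> EvaluateResponse:
        # Mirrors MMLUTask.aggregate_results: mean of per-sample scores.
        score = sum(eval_results) / len(eval_results) if eval_results else 0.0
        return EvaluateResponse(metrics={"score": score})


if __name__ == "__main__":
    task = ExactMatchTask()
    outputs = [task.postprocess_sample(o) for o in ["Paris ", "berlin"]]
    expected = [{"Answer": "Paris"}, {"Answer": "Munich"}]
    results = [task.score_sample(o, e) for o, e in zip(outputs, expected)]
    print(task.aggregate_results(results))  # EvaluateResponse(metrics={'score': 0.5})
```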