evals new rebase

Xi Yan 2024-10-10 11:35:26 -07:00
parent 89d24a07f0
commit 31c046dcdf
28 changed files with 1141 additions and 87 deletions

@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict

from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.distribution.datatypes import Api, ProviderSpec

from .config import MetaReferenceEvalsImplConfig  # noqa


async def get_provider_impl(
    config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec]
):
    from .evals import MetaReferenceEvalsImpl

    impl = MetaReferenceEvalsImpl(config, deps[Api.inference])
    await impl.initialize()
    return impl
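
For orientation, a minimal usage sketch of this factory, assuming an already-initialized Inference provider; `wire_evals` and `inference_impl` are hypothetical names, while `get_provider_impl`, `MetaReferenceEvalsImplConfig`, and `Api.inference` are the names defined above:

from llama_stack.distribution.datatypes import Api

# `inference_impl` is a hypothetical, already-initialized Inference provider;
# the distribution resolver would normally supply it via `deps`.
async def wire_evals(inference_impl):
    config = MetaReferenceEvalsImplConfig()  # the config model has no fields yet
    return await get_provider_impl(config, deps={Api.inference: inference_impl})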

@@ -0,0 +1,10 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel


class MetaReferenceEvalsImplConfig(BaseModel): ...
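
The config is an intentionally empty pydantic model for now; instantiating it takes no arguments:

config = MetaReferenceEvalsImplConfig()  # validates trivially; no fields defined yet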

@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional

from termcolor import cprint

from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry
from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry

from .config import MetaReferenceEvalsImplConfig


class MetaReferenceEvalsImpl(Evals):
    def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference):
        self.inference_api = inference_api

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def run_evals(
        self,
        model: str,
        task: str,
        dataset: Optional[str] = None,
        eval_task_config: Optional[EvaluateTaskConfig] = None,
    ) -> EvaluateResponse:
        cprint(
            f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
            "red",
        )
        if not dataset:
            raise ValueError("dataset must be specified for meta-reference evals")

        dataset = DatasetRegistry.get_dataset(dataset)
        dataset.load()
        task_impl = TaskRegistry.get_task(task)(dataset)
        x1 = task_impl.preprocess()

        # TODO: replace w/ batch inference & async return eval job
        generation_outputs = []
        if eval_task_config is None:
            eval_task_config = EvaluateTaskConfig(n_samples=len(x1))
        if eval_task_config.n_samples is None or eval_task_config.n_samples > len(x1):
            eval_task_config.n_samples = len(x1)

        print(
            f"Eval generation start, generate on {eval_task_config.n_samples} samples"
        )
        for msg in x1[: eval_task_config.n_samples]:
            print("generation for msg: ", msg)
            response = await self.inference_api.chat_completion(
                model=model,
                messages=[msg],
                stream=False,
            )
            generation_outputs.append(response.completion_message.content)

        x2 = task_impl.postprocess(generation_outputs)
        eval_results = task_impl.score(x2)
        eval_response = task_impl.aggregate_results(eval_results)
        return eval_response
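
run_evals drives the task object through four hooks: preprocess dataset rows into chat messages, generate with the Inference API, postprocess the raw generations, then score and aggregate. A minimal sketch of that assumed contract follows; the real BaseTask lives in llama_stack.distribution.registry.tasks.task and is not part of this diff, so everything beyond the four method names used above is a guess:

from abc import ABC, abstractmethod
from typing import Any, List


class TaskContractSketch(ABC):
    # Assumed interface: only preprocess/postprocess/score/aggregate_results
    # are grounded in run_evals above; the rest is illustrative.
    def __init__(self, dataset):
        self.dataset = dataset

    @abstractmethod
    def preprocess(self) -> List[Any]:
        """Turn dataset rows into chat messages for inference."""

    @abstractmethod
    def postprocess(self, generation_outputs: List[str]) -> List[str]:
        """Clean up raw generations before scoring."""

    @abstractmethod
    def score(self, postprocessed: List[str]) -> List[float]:
        """Produce per-sample scores."""

    @abstractmethod
    def aggregate_results(self, eval_results: List[float]) -> "EvaluateResponse":
        """Collapse per-sample scores into an EvaluateResponse."""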

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

@@ -0,0 +1,150 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re

from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.distribution.registry.tasks.task import BaseTask

QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
{Question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
MULTILINGUAL_ANSWER_REGEXES = [
    r"Answer\s*:",
    r"Answer\s*:​",  # Korean invisible character
    r"উত্তর\s*:",
    r"उत्तर\s*:",
    r"উত্তরঃ",
    r"উত্তর\s*:",
    r"Antwort\s*:",
    r"답변\s*:",
    r"정답\s*:",
    r"답\s*:",
    r"答案\s*：",
    r"答案\s*:",
    r"答\s*：",
    r"答\s*:",
    r"答复\s*：",
    r"答曰\s*：",
    r"الإجابة:",
    r"الجواب:",
    r"إجابة:",
    r"الإجابة النهائية:",
    r"الإجابة الصحيحة:",
    r"الإجابة الصحيحة هي:",
    r"الإجابة هي:",
    r"Respuesta\s*:",
    r"Risposta\s*:",
    r"答え\s*:",
    r"答え\s*：",
    r"回答\s*:",
    r"回答\s*：",
    r"解答\s*:",
    r"Jawaban\s*:",
    r"Réponse\s*:",
    r"Resposta\s*:",
    r"Jibu\s*:",
    r"Idahun\s*:",
    r"Ìdáhùn\s*:",
    r"Idáhùn\s*:",
    r"Àmọ̀nà\s*:",
    r"Àdáhùn\s*:",
    r"Ànúgọ\s*:",
    r"Àṣàyàn\s*:",
]
MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = (
    r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[Ａ]|[Ｂ]|[Ｃ]|[Ｄ])"
)
def normalize_response(response: str) -> str:
    """
    Normalize the response by removing markdown and LaTeX formatting that may prevent a match.
    """
    return (
        response.replace("**", "")
        .replace("$\\boxed{", "")
        .replace("}$", "")
        .replace("\\$", "")
        .replace("$\\text{", "")
        .replace("$", "")
        .replace("\\mathrm{", "")
        .replace("\\{", "")
        .replace("\\text", "")
        .replace("\\(", "")
        .replace("\\mathbf{", "")
        .replace("{", "")
        .replace("\\boxed", "")
    )
def normalize_extracted_answer(extracted_answer: str) -> str:
    return (
        # In arabic these are the letters used for A-D in multiple choice questions
        extracted_answer.replace("أ", " A")
        .replace("ب", " B")
        .replace("ج", " C")
        .replace("د", " D")
        # In Bengali these are the letters used for A-D in multiple choice questions
        .replace("এ", " A")
        .replace("বি", " B")
        .replace("সি", " C")
        .replace("ডি", " D")
        # In Japanese these are the letters sometimes used for A-D in multiple choice questions
        .replace("Ａ", " A")
        .replace("Ｂ", " B")
        .replace("Ｃ", " C")
        .replace("Ｄ", " D")
        .strip()
    )
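

# --- Illustration only, not part of this commit: how the regexes and
# normalizers combine. Given a model response ending "Answer: B", the
# English pattern extracts and normalizes the letter:
#
#   regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(r"Answer\s*:")
#   match = re.search(regex, normalize_response("**Answer: B**"))
#   normalize_extracted_answer(match.group(1))  # -> "B"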
class MMLUTask(BaseTask):
    """
    MMLU Task.
    """

    def __init__(self, dataset, *args, **kwargs):
        super().__init__(dataset, *args, **kwargs)

    def preprocess_sample(self, sample):
        content = QUERY_TEMPLATE_MULTICHOICE.format(**sample)
        return {
            "role": "user",
            "content": content,
        }

    def postprocess_sample(self, sample):
        normalized = normalize_response(sample)
        return normalized

    def score_sample(self, sample, expected):
        extracted_answer = None
        for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
            regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
            match = re.search(regex, sample)
            if match:
                extracted_answer = normalize_extracted_answer(match.group(1))
                break
        score = (
            1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0
        )
        # TODO: generalize this into SingleEvalResult
        return score

    def aggregate_results(self, eval_results):
        return EvaluateResponse(
            metrics={"score": str(sum(eval_results) / len(eval_results))}
        )
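
To close, a hedged end-to-end sketch of the scoring path, bypassing BaseTask's iteration and the inference step entirely. The row layout (Question, A-D, Answer) is inferred from QUERY_TEMPLATE_MULTICHOICE and score_sample; the constructor call assumes BaseTask accepts a plain list of rows, which this diff does not show:

row = {
    "Question": "2 + 2 = ?",
    "A": "3",
    "B": "4",
    "C": "5",
    "D": "22",
    "Answer": "B",
}
task = MMLUTask(dataset=[row])  # assumes BaseTask takes a list of rows

prompt = task.preprocess_sample(row)      # chat message for the model
raw_output = "The sum is 4.\nAnswer: B"   # stand-in for a model generation
score = task.score_sample(task.postprocess_sample(raw_output), row)
assert score == 1.0
print(task.aggregate_results([score]))    # EvaluateResponse(metrics={"score": "1.0"})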