diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index b1cb53607..a1f696dff 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -33,6 +33,7 @@ class EvaluationClient(Evals):
                     "task": task,
                 },
                 headers={"Content-Type": "application/json"},
+                timeout=3600,
             )
             response.raise_for_status()
             return EvaluateResponse(**response.json())
@@ -43,7 +44,7 @@ async def run_main(host: str, port: int):
     response = await client.run_evals(
         "Llama3.1-8B-Instruct",
-        "mmlu.csv",
+        "mmlu-simple-eval-en",
         "mmlu",
     )
     cprint(f"evaluate response={response}", "green")
diff --git a/llama_stack/apis/inference/client.py b/llama_stack/apis/inference/client.py
index 92acc3e14..2aae1cc55 100644
--- a/llama_stack/apis/inference/client.py
+++ b/llama_stack/apis/inference/client.py
@@ -109,7 +109,7 @@ async def run_main(host: str, port: int, stream: bool):
     cprint(f"User>{message.content}", "green")
     iterator = client.chat_completion(
         model="Llama3.1-8B-Instruct",
-        messages=[message],
+        messages=[message, UserMessage(content="write me 3 sentence about the sun.")],
         stream=stream,
     )
     async for log in EventLogger().log(iterator):
diff --git a/llama_stack/providers/impls/meta_reference/evals/datas/utils.py b/llama_stack/providers/impls/meta_reference/evals/datas/dataset_registry.py
similarity index 57%
rename from llama_stack/providers/impls/meta_reference/evals/datas/utils.py
rename to llama_stack/providers/impls/meta_reference/evals/datas/dataset_registry.py
index a1dd7849d..ad1c5f372 100644
--- a/llama_stack/providers/impls/meta_reference/evals/datas/utils.py
+++ b/llama_stack/providers/impls/meta_reference/evals/datas/dataset_registry.py
@@ -3,17 +3,22 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .datasets import CustomDataset
+from .datasets import CustomDataset, HFDataset
 
 # TODO: make this into a config based registry
 DATASETS_REGISTRY = {
-    "mmlu_eval": CustomDataset(
+    "mmlu-simple-eval-en": CustomDataset(
         name="mmlu_eval",
         url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
     ),
+    "mmmu-accounting": HFDataset(
+        name="mmlu_eval",
+        url="hf://hellaswag?split=validation&trust_remote_code=True",
+    ),
 }
 
 
 def get_dataset(dataset_id: str):
-    # get dataset concrete dataset implementation
-    return DATASETS_REGISTRY[dataset_id]
+    dataset = DATASETS_REGISTRY[dataset_id]
+    dataset.load()
+    return dataset
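Note: a minimal usage sketch of the reworked registry, assuming the datasets.py changes below. get_dataset now resolves the id against DATASETS_REGISTRY, calls load(), and returns a dataset that can be iterated directly; the column names shown are the ones the MMLU prompt template added later in this PR expects.

from llama_stack.providers.impls.meta_reference.evals.datas.dataset_registry import (
    get_dataset,
)

dataset = get_dataset("mmlu-simple-eval-en")  # CustomDataset; load() has already pulled the CSV
row = next(iter(dataset))                     # BaseDataset.__iter__ walks the underlying HF Dataset
print(row["Question"], row["Answer"])         # columns used here: Question, A, B, C, D, Answer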
diff --git a/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py b/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py
index 276c54ef6..7143583c0 100644
--- a/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py
+++ b/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py
@@ -4,23 +4,35 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from abc import ABC, abstractmethod
 from urllib.parse import parse_qs, urlparse
 
 import pandas
 from datasets import Dataset, load_dataset
 
 
-class BaseDataset:
+class BaseDataset(ABC):
     def __init__(self, name: str):
         self.dataset = None
         self.dataset_id = name
         self.type = self.__class__.__name__
 
+    def __iter__(self):
+        return iter(self.dataset)
+
+    @abstractmethod
+    def load(self):
+        pass
+
 
 class CustomDataset(BaseDataset):
     def __init__(self, name, url):
         super().__init__(name)
         self.url = url
+
+    def load(self):
+        if self.dataset:
+            return
         df = pandas.read_csv(self.url)
         self.dataset = Dataset.from_pandas(df)
 
@@ -28,13 +40,18 @@ class CustomDataset(BaseDataset):
 
 class HFDataset(BaseDataset):
     def __init__(self, name, url):
         super().__init__(name)
-        # URL following OpenAI's evals - hf://hendrycks_test?name=business_ethics&split=validation
         self.url = url
-        parsed = urlparse(url)
-        query = parse_qs(parsed.query)
-        query = {k: v[0] for k, v in query.items()}
+
+    def load(self):
+        if self.dataset:
+            return
+
+        parsed = urlparse(self.url)
         if parsed.scheme != "hf":
-            raise ValueError(f"Unknown HF dataset: {url}")
+            raise ValueError(f"Unknown HF dataset: {self.url}")
+        query = parse_qs(parsed.query)
+        query = {k: v[0] for k, v in query.items()}
+
         path = parsed.netloc
         self.dataset = load_dataset(path, **query)
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index b433f875f..d4d2acad3 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -8,10 +8,10 @@
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.evals import *  # noqa: F403
 from termcolor import cprint
 
-from llama_stack.providers.impls.meta_reference.evals.datas.utils import (  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.datas.dataset_registry import (
     get_dataset,
 )
-from llama_stack.providers.impls.meta_reference.evals.tasks.utils import (  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.tasks.task_registry import (
     get_task,
 )
@@ -35,27 +35,25 @@
         task: str,
     ) -> EvaluateResponse:
         cprint(f"model={model}, dataset={dataset}, task={task}", "red")
+        dataset = get_dataset("mmlu-simple-eval-en")
 
-        # resolve dataset
-        # - either a custom URL dataset or HF URL dataset
-        dataset = get_dataset("mmlu_eval")
-        print(dataset.dataset)
-
-        # # resolve task and execute task
         task_impl = get_task(task, dataset)
-        print(task_impl)
+        x1 = task_impl.preprocess()
 
-        # # F1: this will generate a preprocessed list of input messages for model
-        # x1 = task_impl.preprocess(dataset)
+        # TODO: replace w/ batch inference & async return eval job
+        generation_outputs = []
+        for msg in x1[:5]:
+            response = self.inference_api.chat_completion(
+                model=model,
+                messages=[msg],
+                stream=False,
+            )
+            async for x in response:
+                generation_outputs.append(x.completion_message.content)
 
-        # # call inference API w/ model
-        # generation_outputs = ["response1", "response2", "response3"]
-
-        # # F2: post process
-        # x2 = task_impl.postprocess(generation_outputs)
-
-        # # F3: score generation outputs
-        # scores = task_impl.score(x2)
+        x2 = task_impl.postprocess(generation_outputs)
+        scores = task_impl.score(x2)
+        print(scores)
 
         return EvaluateResponse(
             metrics={
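Note: the TODO above flags that the per-sample loop should eventually become batch inference returning an async eval job. A rough sketch of one possible interim shape, not part of this PR; chat_fn stands in for inference_api.chat_completion and is assumed to behave like the async generator consumed above.

import asyncio
from typing import Any, Callable, List

async def generate_all(chat_fn: Callable[..., Any], model: str, messages: List[dict]) -> List[str]:
    async def one(msg: dict) -> str:
        # consume the stream=False async generator the same way run_evals does above
        async for chunk in chat_fn(model=model, messages=[msg], stream=False):
            return chunk.completion_message.content
        return ""

    # fan the per-sample requests out concurrently instead of awaiting them one by one
    return await asyncio.gather(*(one(m) for m in messages))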
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
new file mode 100644
index 000000000..6a4b5bc0f
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
@@ -0,0 +1,152 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from .task import BaseTask
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+MULTILINGUAL_ANSWER_REGEXES = [
+    r"Answer\s*:",
+    r"Answer\s*:​",  # Korean invisible character
+    r"উত্তর\s*:",
+    r"उत्तर\s*:",
+    r"উত্তরঃ",
+    r"উত্তর\s*:",
+    r"Antwort\s*:",
+    r"답변\s*:",
+    r"정답\s*:",
+    r"답\s*:",
+    r"答案\s*：",
+    r"答案\s*:",
+    r"答\s*：",
+    r"答\s*:",
+    r"答复\s*:",
+    r"答曰\s*:",
+    r"الإجابة:",
+    r"الجواب:",
+    r"إجابة:",
+    r"الإجابة النهائية:",
+    r"الإجابة الصحيحة:",
+    r"الإجابة الصحيحة هي:",
+    r"الإجابة هي:",
+    r"Respuesta\s*:",
+    r"Risposta\s*:",
+    r"答え\s*:",
+    r"答え\s*：",
+    r"回答\s*:",
+    r"回答\s*：",
+    r"解答\s*:",
+    r"Jawaban\s*:",
+    r"Réponse\s*:",
+    r"Resposta\s*:",
+    r"Jibu\s*:",
+    r"Idahun\s*:",
+    r"Ìdáhùn\s*:",
+    r"Idáhùn\s*:",
+    r"Àmọ̀nà\s*:",
+    r"Àdáhùn\s*:",
+    r"Ànúgọ\s*:",
+    r"Àṣàyàn\s*:",
+]
+
+MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = (
+    r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[Ａ]|[Ｂ]|[Ｃ]|[Ｄ])"
+)
+
+
+def normalize_response(response: str) -> str:
+    """
+    Normalize the response by removing markdown and LaTeX formatting that may prevent a match.
+    """
+
+    return (
+        response.replace("**", "")
+        .replace("$\\boxed{", "")
+        .replace("}$", "")
+        .replace("\\$", "")
+        .replace("$\\text{", "")
+        .replace("$", "")
+        .replace("\\mathrm{", "")
+        .replace("\\{", "")
+        .replace("\\text", "")
+        .replace("\\(", "")
+        .replace("\\mathbf{", "")
+        .replace("{", "")
+        .replace("\\boxed", "")
+    )
+
+
+def normalize_extracted_answer(extracted_answer: str) -> str:
+    return (
+        # In arabic these are the letters used for A-D in multiple choice questions
+        extracted_answer.replace("أ", " A")
+        .replace("ب", " B")
+        .replace("ج", " C")
+        .replace("د", " D")
+        # In Bengali these are the letters used for A-D in multiple choice questions
+        .replace("অ", " A")
+        .replace("ব", " B")
+        .replace("ড", " C")
+        .replace("ঢ", " D")
+        # In Japanese these are the letters sometimes used for A-D in multiple choice questions
+        .replace("Ａ", " A")
+        .replace("Ｂ", " B")
+        .replace("Ｃ", " C")
+        .replace("Ｄ", " D")
+        .strip()
+    )
+
+
+class MMLUTask(BaseTask):
+    """
+    MMLU Task.
+ """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(dataset, *args, **kwargs) + + def preprocess_sample(self, sample): + """ + F1: preprocess sample + """ + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) + return { + "role": "user", + "content": content, + } + + def postprocess_sample(self, sample): + """ + F2: postprocess sample + """ + normalized = normalize_response(sample) + return normalized + + def score_sample(self, sample, expected): + """ + F3: score sample + """ + extracted_answer = None + for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, sample) + if match: + extracted_answer = normalize_extracted_answer(match.group(1)) + break + score = ( + 1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0 + ) + return score diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py b/llama_stack/providers/impls/meta_reference/evals/tasks/task.py similarity index 55% rename from llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py rename to llama_stack/providers/impls/meta_reference/evals/tasks/task.py index 430e08ea5..37114b70f 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/task.py @@ -8,8 +8,7 @@ from abc import ABC, abstractmethod class BaseTask(ABC): """ - Base class for all evaluation tasks. - Each task needs to implement the following methods: + Base class for all evaluation tasks. Each task needs to implement the following methods: - F1: preprocess_sample(self) - F2: postprocess_sample(self) - F3: score_sample(self) @@ -42,40 +41,13 @@ class BaseTask(ABC): raise NotImplementedError() def preprocess(self): - pass + return [self.preprocess_sample(sample) for sample in self.dataset] - def postprocess(self): - pass + def postprocess(self, generation): + return [self.postprocess_sample(sample) for sample in generation] - def score(self, generation): - pass - - -class MMLUTask(BaseTask): - """ - MMLU Task. Each task needs to implement the following methods: - - F1: preprocess_sample(self) - - F2: postprocess_sample(self) - - F3: score_sample(self) - """ - - def __init__(self, dataset, *args, **kwargs): - super().__init__(dataset, *args, **kwargs) - - def preprocess_sample(self, sample): - """ - F1: preprocess sample - """ - pass - - def postprocess_sample(self, sample): - """ - F2: postprocess sample - """ - pass - - def score_sample(self, sample): - """ - F3: score sample - """ - pass + def score(self, postprocessed): + return [ + self.score_sample(sample, ground_truth) + for sample, ground_truth in zip(postprocessed, self.dataset) + ] diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py b/llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py similarity index 91% rename from llama_stack/providers/impls/meta_reference/evals/tasks/utils.py rename to llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py index fb94541af..69720051d 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py b/llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py
similarity index 91%
rename from llama_stack/providers/impls/meta_reference/evals/tasks/utils.py
rename to llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py
index fb94541af..69720051d 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/task_registry.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .tasks import *  # noqa: F403
+from .mmlu_task import MMLUTask
 
 # TODO: make this into a config based registry
 TASKS_REGISTRY = {
diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml
index 129b71e34..4a3a98de2 100644
--- a/tests/examples/local-run.yaml
+++ b/tests/examples/local-run.yaml
@@ -36,14 +36,18 @@ api_providers:
     config: {}
 routing_table:
   inference:
-  - provider_type: meta-reference
+  # - provider_type: meta-reference
+  #   config:
+  #     model: Llama3.2-1B-Instruct
+  #     quantization: null
+  #     torch_seed: null
+  #     max_seq_len: 4096
+  #     max_batch_size: 1
+  #   routing_key: Llama3.2-1B-Instruct
+  - provider_type: remote::tgi
     config:
-      model: Llama3.2-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-    routing_key: Llama3.2-1B
+      url: http://127.0.0.1:5009
+    routing_key: Llama3.1-8B-Instruct
   safety:
   - provider_type: meta-reference
     config:
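Note: with the routing table above pointed at a local TGI endpoint, the new flow can be smoke-tested end to end through the evals client. A sketch, assuming the stack server listens on localhost:5000 and that EvaluationClient is constructed from a base URL like the other API clients in this repo.

import asyncio

from llama_stack.apis.evals.client import EvaluationClient

async def main() -> None:
    client = EvaluationClient("http://localhost:5000")
    response = await client.run_evals(
        "Llama3.1-8B-Instruct",  # model; must match the routing_key configured above
        "mmlu-simple-eval-en",   # dataset id registered in DATASETS_REGISTRY
        "mmlu",                  # task id resolved via TASKS_REGISTRY
    )
    print(response.metrics)

asyncio.run(main())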