From 4f07aca30954d49422e415538b62d3fa0642348d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 3 Oct 2024 17:31:46 -0700 Subject: [PATCH] get task --- .../meta_reference/evals/datas/datasets.py | 40 +++++++++ .../impls/meta_reference/evals/datas/utils.py | 19 +++++ .../impls/meta_reference/evals/evals.py | 32 +++++++- .../impls/meta_reference/evals/tasks/tasks.py | 81 +++++++++++++++++++ .../impls/meta_reference/evals/tasks/utils.py | 16 ++++ llama_stack/providers/registry/evals.py | 1 + 6 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/datas/datasets.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/datas/utils.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/utils.py diff --git a/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py b/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py new file mode 100644 index 000000000..276c54ef6 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/datas/datasets.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from urllib.parse import parse_qs, urlparse + +import pandas +from datasets import Dataset, load_dataset + + +class BaseDataset: + def __init__(self, name: str): + self.dataset = None + self.dataset_id = name + self.type = self.__class__.__name__ + + +class CustomDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + self.url = url + df = pandas.read_csv(self.url) + self.dataset = Dataset.from_pandas(df) + + +class HFDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + # URL following OpenAI's evals - hf://hendrycks_test?name=business_ethics&split=validation + self.url = url + parsed = urlparse(url) + query = parse_qs(parsed.query) + query = {k: v[0] for k, v in query.items()} + + if parsed.scheme != "hf": + raise ValueError(f"Unknown HF dataset: {url}") + + self.dataset = load_dataset(parsed.netloc, **query) diff --git a/llama_stack/providers/impls/meta_reference/evals/datas/utils.py b/llama_stack/providers/impls/meta_reference/evals/datas/utils.py new file mode 100644 index 000000000..a1dd7849d --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/datas/utils.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from .datasets import CustomDataset + +# TODO: make this into a config based registry +DATASETS_REGISTRY = { + "mmlu_eval": CustomDataset( + name="mmlu_eval", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), +} + + +def get_dataset(dataset_id: str): + # get dataset concrete dataset implementation + return DATASETS_REGISTRY[dataset_id] diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index c68414c43..b433f875f 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -6,6 +6,14 @@ from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 +from termcolor import cprint + +from llama_stack.providers.impls.meta_reference.evals.datas.utils import ( # noqa: F403 + get_dataset, +) +from llama_stack.providers.impls.meta_reference.evals.tasks.utils import ( # noqa: F403 + get_task, +) from .config import MetaReferenceEvalsImplConfig @@ -26,7 +34,29 @@ class MetaReferenceEvalsImpl(Evals): dataset: str, task: str, ) -> EvaluateResponse: - print("hi") + cprint(f"model={model}, dataset={dataset}, task={task}", "red") + + # resolve dataset + # - either a custom URL dataset or HF URL dataset + dataset = get_dataset(dataset) + print(dataset.dataset) + + # # resolve task and execute task + task_impl = get_task(task, dataset) + print(task_impl) + + # # F1: this will generate a preprocessed list of input messages for model + # x1 = task_impl.preprocess(dataset) + + # # call inference API w/ model + # generation_outputs = ["response1", "response2", "response3"] + + # # F2: post process + # x2 = task_impl.postprocess(generation_outputs) + + # # F3: score generation outputs + # scores = task_impl.score(x2) + return EvaluateResponse( metrics={ "accuracy": 0.5, diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py 
b/llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py new file mode 100644 index 000000000..430e08ea5 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/tasks.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from abc import ABC, abstractmethod + + +class BaseTask(ABC): + """ + Base class for all evaluation tasks. + Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(*args, **kwargs) + self._name = self.__class__.__name__ + self.dataset = dataset + + @abstractmethod + def preprocess_sample(self, sample): + """ + F1: preprocess sample + """ + raise NotImplementedError() + + @abstractmethod + def postprocess_sample(self, sample): + """ + F2: postprocess sample + """ + raise NotImplementedError() + + @abstractmethod + def score_sample(self, sample, ground_truth): + """ + F3: score sample + """ + raise NotImplementedError() + + def preprocess(self): + pass + + def postprocess(self): + pass + + def score(self, generation): + pass + + +class MMLUTask(BaseTask): + """ + MMLU Task. 
Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(dataset, *args, **kwargs) + + def preprocess_sample(self, sample): + """ + F1: preprocess sample + """ + pass + + def postprocess_sample(self, sample): + """ + F2: postprocess sample + """ + pass + + def score_sample(self, sample, ground_truth): + """ + F3: score sample + """ + pass diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py b/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py new file mode 100644 index 000000000..fb94541af --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/utils.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from .tasks import * # noqa: F403 + +# TODO: make this into a config based registry +TASKS_REGISTRY = { + "mmlu": MMLUTask, +} + + +def get_task(task_id: str, dataset): + task_impl = TASKS_REGISTRY[task_id] + return task_impl(dataset) diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index 8f9bacdd6..8c98cb199 100644 --- a/llama_stack/providers/registry/evals.py +++ b/llama_stack/providers/registry/evals.py @@ -19,6 +19,7 @@ def available_providers() -> List[ProviderSpec]: "pillow", "pandas", "scikit-learn", + "datasets", ], module="llama_stack.providers.impls.meta_reference.evals", config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig",