diff --git a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py index 30ab690a4..d7b596a39 100644 --- a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py +++ b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py @@ -3,4 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .mmlu import mmlu # noqa: F401 + +# # Copyright (c) Meta Platforms, Inc. and affiliates. +# # All rights reserved. +# # +# # This source code is licensed under the terms described in the LICENSE file in +# # the root directory of this source tree. +# from .mmlu import mmlu # noqa: F401 diff --git a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py index dbd14df31..671b4de1c 100644 --- a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py +++ b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py @@ -4,21 +4,27 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_models.llama3.api.datatypes import URL -from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType -from llama_stack.apis.datasetio import DatasetDef +# # Copyright (c) Meta Platforms, Inc. and affiliates. +# # All rights reserved. +# # +# # This source code is licensed under the terms described in the LICENSE file in +# # the root directory of this source tree. -mmlu = DatasetDef( - identifier="mmlu", - url=URL(uri="https://huggingface.co/datasets/llamastack/evals"), - dataset_schema={ - "input_query": StringType(), - "expected_answer": StringType(), - "chat_completion_input": ChatCompletionInputType(), - }, - metadata={ - "path": "llamastack/evals", - "name": "evals__mmlu__details", - "split": "train", - }, -) +# from llama_models.llama3.api.datatypes import URL +# from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType +# from llama_stack.apis.datasetio import DatasetDef + +# mmlu = DatasetDef( +# identifier="mmlu", +# url=URL(uri="https://huggingface.co/datasets/llamastack/evals"), +# dataset_schema={ +# "input_query": StringType(), +# "expected_answer": StringType(), +# "chat_completion_input": ChatCompletionInputType(), +# }, +# metadata={ +# "path": "llamastack/evals", +# "name": "evals__mmlu__details", +# "split": "train", +# }, +# ) diff --git a/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py b/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py index 02a3be8fb..bd6c649df 100644 --- a/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py @@ -12,8 +12,6 @@ import datasets as hf_datasets from llama_stack.providers.datatypes import DatasetsProtocolPrivate from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url -from .benchmarks import mmlu - from .config import HuggingfaceDatasetIOConfig @@ -37,9 +35,10 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): self.dataset_infos = {} async def initialize(self) -> None: + pass # pre-registered benchmark datasets - pre_registered_datasets = [mmlu] - self.dataset_infos = {x.identifier: x for x in pre_registered_datasets} + # pre_registered_datasets = [mmlu] + # self.dataset_infos = {x.identifier: x for x in pre_registered_datasets} async def shutdown(self) -> None: ... diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 55939752a..fdd4dcfbb 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -7,7 +7,11 @@ import pytest -from llama_models.llama3.api import SamplingParams +from llama_models.llama3.api import SamplingParams, URL + +from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType + +from llama_stack.apis.datasetio.datasetio import DatasetDefWithProvider from llama_stack.apis.eval.eval import ( AppEvalTaskConfig, @@ -153,8 +157,36 @@ class Testeval: assert len(response) > 0 if response[0].provider_id != "huggingface": pytest.skip( - "Only huggingface provider supports pre-registered benchmarks datasets" + "Only huggingface provider supports pre-registered remote datasets" ) + # register dataset + mmlu = DatasetDefWithProvider( + identifier="mmlu", + url=URL(uri="https://huggingface.co/datasets/llamastack/evals"), + dataset_schema={ + "input_query": StringType(), + "expected_answer": StringType(), + "chat_completion_input": ChatCompletionInputType(), + }, + metadata={ + "path": "llamastack/evals", + "name": "evals__mmlu__details", + "split": "train", + }, + provider_id="", + ) + + await datasets_impl.register_dataset(mmlu) + + # register eval task + meta_reference_mmlu = EvalTaskDefWithProvider( + identifier="meta-reference-mmlu", + dataset_id="mmlu", + scoring_functions=["meta-reference::regex_parser_multiple_choice_answer"], + provider_id="", + ) + + await eval_tasks_impl.register_eval_task(meta_reference_mmlu) # list benchmarks response = await eval_tasks_impl.list_eval_tasks()