diff --git a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py
index 30ab690a4..d7b596a39 100644
--- a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py
+++ b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/__init__.py
@@ -3,4 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from .mmlu import mmlu  # noqa: F401
+
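+# NOTE: the `mmlu` benchmark dataset is intentionally not exported from this
+# package for now: the Hugging Face datasetio provider no longer pre-registers
+# it during `initialize()`. Datasets are registered explicitly by the caller
+# instead (see llama_stack/providers/tests/eval/test_eval.py for an example).
+# The previous import is kept below, commented out, for reference.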
+# from .mmlu import mmlu  # noqa: F401
diff --git a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py
index dbd14df31..671b4de1c 100644
--- a/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py
+++ b/llama_stack/providers/adapters/datasetio/huggingface/benchmarks/mmlu.py
@@ -4,21 +4,27 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_models.llama3.api.datatypes import URL
-from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
-from llama_stack.apis.datasetio import DatasetDef
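+# NOTE: this module is intentionally disabled. The `mmlu` dataset definition
+# below is kept, commented out, for reference only; the Hugging Face datasetio
+# provider no longer imports or pre-registers it. An equivalent definition is
+# registered explicitly in llama_stack/providers/tests/eval/test_eval.py
+# (there as a `DatasetDefWithProvider`).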
 
-mmlu = DatasetDef(
-    identifier="mmlu",
-    url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
-    dataset_schema={
-        "input_query": StringType(),
-        "expected_answer": StringType(),
-        "chat_completion_input": ChatCompletionInputType(),
-    },
-    metadata={
-        "path": "llamastack/evals",
-        "name": "evals__mmlu__details",
-        "split": "train",
-    },
-)
+# from llama_models.llama3.api.datatypes import URL
+# from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
+# from llama_stack.apis.datasetio import DatasetDef
+
+# mmlu = DatasetDef(
+#     identifier="mmlu",
+#     url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
+#     dataset_schema={
+#         "input_query": StringType(),
+#         "expected_answer": StringType(),
+#         "chat_completion_input": ChatCompletionInputType(),
+#     },
+#     metadata={
+#         "path": "llamastack/evals",
+#         "name": "evals__mmlu__details",
+#         "split": "train",
+#     },
+# )
diff --git a/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py b/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py
index 02a3be8fb..bd6c649df 100644
--- a/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/adapters/datasetio/huggingface/huggingface.py
@@ -12,8 +12,6 @@ import datasets as hf_datasets
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url
 
-from .benchmarks import mmlu
-
 from .config import HuggingfaceDatasetIOConfig
 
 
@@ -37,9 +35,10 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         self.dataset_infos = {}
 
     async def initialize(self) -> None:
+        pass
         # pre-registered benchmark datasets
-        pre_registered_datasets = [mmlu]
-        self.dataset_infos = {x.identifier: x for x in pre_registered_datasets}
+        # NOTE: nothing is pre-registered any more; the former `mmlu` entry is no
+        # longer loaded into `self.dataset_infos` here (hence the no-op `pass`).
 
     async def shutdown(self) -> None: ...
 
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index 55939752a..fdd4dcfbb 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -7,7 +7,11 @@
 
 import pytest
 
-from llama_models.llama3.api import SamplingParams
+from llama_models.llama3.api import SamplingParams, URL
+
+from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
+
+from llama_stack.apis.datasetio.datasetio import DatasetDefWithProvider
 
 from llama_stack.apis.eval.eval import (
     AppEvalTaskConfig,
@@ -153,8 +157,36 @@ class Testeval:
         assert len(response) > 0
         if response[0].provider_id != "huggingface":
             pytest.skip(
-                "Only huggingface provider supports pre-registered benchmarks datasets"
+                "Only huggingface provider supports pre-registered remote datasets"
             )
+        # register dataset
+        mmlu = DatasetDefWithProvider(
+            identifier="mmlu",
+            url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
+            dataset_schema={
+                "input_query": StringType(),
+                "expected_answer": StringType(),
+                "chat_completion_input": ChatCompletionInputType(),
+            },
+            metadata={
+                "path": "llamastack/evals",
+                "name": "evals__mmlu__details",
+                "split": "train",
+            },
+            provider_id="",
+        )
+
+        await datasets_impl.register_dataset(mmlu)
+
+        # register eval task
+        meta_reference_mmlu = EvalTaskDefWithProvider(
+            identifier="meta-reference-mmlu",
+            dataset_id="mmlu",
+            scoring_functions=["meta-reference::regex_parser_multiple_choice_answer"],
+            provider_id="",
+        )
+
+        await eval_tasks_impl.register_eval_task(meta_reference_mmlu)
 
         # list benchmarks
         response = await eval_tasks_impl.list_eval_tasks()