get task

2025-12-08 19:10:56 +00:00 · 2024-10-03 17:31:46 -07:00 · 2024-10-03 17:31:46 -07:00 · 4f07aca309
commit 4f07aca309
parent 8339b2cef3
6 changed files with 188 additions and 1 deletions
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -6,6 +6,14 @@

 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.evals import *  # noqa: F403
+from termcolor import cprint
+
+from llama_stack.providers.impls.meta_reference.evals.datas.utils import (  # noqa: F403
+    get_dataset,
+)
+from llama_stack.providers.impls.meta_reference.evals.tasks.utils import (  # noqa: F403
+    get_task,
+)

 from .config import MetaReferenceEvalsImplConfig

@@ -26,7 +34,29 @@ class MetaReferenceEvalsImpl(Evals):
        dataset: str,
        task: str,
    ) -> EvaluateResponse:
-        print("hi")
+        cprint(f"model={model}, dataset={dataset}, task={task}", "red")
+
+        # resolve dataset
+        # - either a custom URL dataset or HF URL dataset
+        dataset = get_dataset("mmlu_eval")
+        print(dataset.dataset)
+
+        # # resolve task and execute task
+        task_impl = get_task(task, dataset)
+        print(task_impl)
+
+        # # F1: this will generate a preprocessed list of input messages for model
+        # x1 = task_impl.preprocess(dataset)
+
+        # # call inference API w/ model
+        # generation_outputs = ["response1", "response2", "response3"]
+
+        # # F2: post process
+        # x2 = task_impl.postprocess(generation_outputs)
+
+        # # F3: score generation outputs
+        # scores = task_impl.score(x2)
+
        return EvaluateResponse(
            metrics={
                "accuracy": 0.5,