Merge branch 'eval_task_register' into mmlu_benchmark

2025-12-18 23:09:38 +00:00 · 2024-11-07 14:41:50 -08:00 · 2024-11-07 14:41:50 -08:00 · cc6edf6287
commit cc6edf6287
parent 93995ecc4c 6b889651d6
72 changed files with 306 additions and 304 deletions
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -52,7 +52,7 @@ class Testeval:
        response = await eval_impl.evaluate_rows(
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                eval_candidate=ModelCandidate(
                    model="Llama3.2-3B-Instruct",
                    sampling_params=SamplingParams(),
@@ -76,13 +76,13 @@ class Testeval:
        ]

        response = await eval_impl.run_eval(
-            eval_task_def=EvalTaskDef(
+            task=EvalTaskDef(
                # NOTE: this is needed to make the router work for all app evals
                identifier="meta-reference::app_eval",
                dataset_id="test_dataset_for_eval",
                scoring_functions=scoring_functions,
            ),
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                eval_candidate=ModelCandidate(
                    model="Llama3.2-3B-Instruct",
                    sampling_params=SamplingParams(),
@@ -90,9 +90,13 @@ class Testeval:
            ),
        )
        assert response.job_id == "0"
-        job_status = await eval_impl.job_status(response.job_id)
+        job_status = await eval_impl.job_status(
+            response.job_id, "meta-reference::app_eval"
+        )
        assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(response.job_id)
+        eval_response = await eval_impl.job_result(
+            response.job_id, "meta-reference::app_eval"
+        )

        assert eval_response is not None
        assert len(eval_response.generations) == 5