test eval works

This commit is contained in:
Xi Yan 2024-11-06 21:40:38 -08:00
parent 413a1b6d8b
commit 3f1ac29d57
3 changed files with 126 additions and 13 deletions

View file

@@ -16,6 +16,10 @@ from llama_stack.apis.datasetio import * # noqa: F403
from llama_stack.apis.scoring import * # noqa: F403
from llama_stack.apis.eval import * # noqa: F403
from llama_stack.providers.inline.meta_reference.eval.eval import (
DEFAULT_EVAL_TASK_IDENTIFIER,
)
class MemoryRouter(Memory):
"""Routes to an provider based on the memory bank identifier"""
@@ -280,7 +284,13 @@ class EvalRouter(Eval):
eval_task_def: EvalTaskDef,
eval_task_config: EvalTaskConfig,
) -> Job:
pass
# NOTE: We need to use DEFAULT_EVAL_TASK_IDENTIFIER to make the router work for all app evals
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).run_eval(
eval_task_def=eval_task_def,
eval_task_config=eval_task_config,
)
@webmethod(route="/eval/evaluate_rows", method="POST")
async def evaluate_rows(
@@ -289,13 +299,27 @@ class EvalRouter(Eval):
scoring_functions: List[str],
eval_task_config: EvalTaskConfig, # type: ignore
) -> EvaluateResponse:
pass
# NOTE: This handles the case where an eval benchmark_task is not pre-registered;
# we fall back to DEFAULT_EVAL_TASK_IDENTIFIER as the identifier
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).evaluate_rows(
input_rows=input_rows,
scoring_functions=scoring_functions,
eval_task_config=eval_task_config,
)
async def job_status(self, job_id: str) -> Optional[JobStatus]:
pass
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_status(job_id)
async def job_cancel(self, job_id: str) -> None:
pass
await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_cancel(job_id)
async def job_result(self, job_id: str) -> EvaluateResponse:
pass
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_result(job_id)
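All of the EvalRouter methods above resolve a provider through the routing table with DEFAULT_EVAL_TASK_IDENTIFIER, so app evals that were never pre-registered still land on the meta-reference provider. Below is a minimal sketch of that lookup pattern, assuming a plain dict-backed table; SimpleRoutingTable is illustrative only and not the actual llama_stack routing table.

# Hypothetical sketch of the routing fallback used by EvalRouter above.
from typing import Any, Dict

DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"  # value added in this commit

class SimpleRoutingTable:
    """Illustrative table mapping eval task identifiers to provider implementations."""

    def __init__(self) -> None:
        self._impls: Dict[str, Any] = {}

    def register(self, identifier: str, impl: Any) -> None:
        self._impls[identifier] = impl

    def get_provider_impl(self, identifier: str) -> Any:
        # App evals without a pre-registered benchmark_id are routed with
        # DEFAULT_EVAL_TASK_IDENTIFIER, which the provider registers on startup.
        return self._impls[identifier]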

View file

@@ -11,7 +11,7 @@ from .....apis.eval.eval import BenchmarkEvalTaskConfig
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import * # noqa: F403
from llama_stack.apis.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
from llama_stack.apis.eval_tasks import EvalTaskDef
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring
@@ -19,6 +19,8 @@ from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
from .config import MetaReferenceEvalConfig
DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"
class ColumnName(Enum):
input_query = "input_query"
@@ -50,9 +52,18 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
async def shutdown(self) -> None: ...
async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]:
print("HHHH")
return []
async def list_eval_tasks(self) -> List[EvalTaskDef]:
# NOTE: In order to be routed to this provider, an eval task must be registered
# with an EvalTaskDef whose identifier is DEFAULT_EVAL_TASK_IDENTIFIER; this covers
# app evals where the eval task benchmark_id is not pre-registered
eval_tasks = [
EvalTaskDef(
identifier=DEFAULT_EVAL_TASK_IDENTIFIER,
dataset_id="",
scoring_functions=[],
)
]
return eval_tasks
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
@@ -98,10 +109,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
dataset_id=dataset_id,
rows_in_page=-1,
)
res = await self.evaluate(
res = await self.evaluate_rows(
input_rows=all_rows.rows,
candidate=candidate,
scoring_functions=scoring_functions,
eval_task_config=eval_task_config,
)
# TODO: currently needs to wait for generation before returning
@@ -140,7 +151,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
}
)
elif ColumnName.chat_completion_input.value in x:
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
chat_completion_input_str = str(
x[ColumnName.chat_completion_input.value]
)
input_messages = eval(chat_completion_input_str)
input_messages = [UserMessage(**x) for x in input_messages]
messages = []
if candidate.system_message:
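The chat_completion_input column carries a stringified list of message dicts, which the code above reconstructs with eval(). As a hedged alternative, assuming the column really is a Python-literal list of dicts, ast.literal_eval can do the same reconstruction without executing arbitrary code; parse_chat_completion_input is a hypothetical helper, not part of the diff.

# Hypothetical, safer parsing of the chat_completion_input column.
import ast
from typing import Any, Dict, List

def parse_chat_completion_input(raw: str) -> List[Dict[str, Any]]:
    """Parse a stringified list of message dicts without executing arbitrary code."""
    parsed = ast.literal_eval(raw)  # raises ValueError/SyntaxError on non-literal input
    if not isinstance(parsed, list):
        raise ValueError(f"expected a list of messages, got {type(parsed).__name__}")
    return parsed

# Example input shape, matching what the eval() call above expects:
#   "[{'role': 'user', 'content': 'What is the capital of France?'}]"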

View file

@@ -7,6 +7,12 @@
import pytest
from llama_models.llama3.api import SamplingParams
from llama_stack.apis.eval.eval import AppEvalTaskConfig, EvalTaskDef, ModelCandidate
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
# How to run this test:
#
# pytest llama_stack/providers/tests/eval/test_eval.py
@@ -22,4 +28,73 @@ class Testeval:
_, eval_tasks_impl, _, _, _, _ = eval_stack
response = await eval_tasks_impl.list_eval_tasks()
assert isinstance(response, list)
print(response)
assert len(response) >= 1
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack):
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
await register_dataset(
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
)
response = await datasets_impl.list_datasets()
assert len(response) == 1
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset_for_eval",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_functions = [
"meta-reference::llm_as_judge_8b_correctness",
"meta-reference::equality",
]
response = await eval_impl.evaluate_rows(
input_rows=rows.rows,
scoring_functions=scoring_functions,
eval_task_config=AppEvalTaskConfig(
eval_candidate=ModelCandidate(
model="Llama3.2-3B-Instruct",
sampling_params=SamplingParams(),
),
),
)
assert len(response.generations) == 3
assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
assert "meta-reference::equality" in response.scores
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack):
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
await register_dataset(
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
)
scoring_functions = [
"meta-reference::llm_as_judge_8b_correctness",
"meta-reference::subset_of",
]
response = await eval_impl.run_eval(
eval_task_def=EvalTaskDef(
# NOTE: this is needed to make the router work for all app evals
identifier="meta-reference::app_eval",
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
),
eval_task_config=AppEvalTaskConfig(
eval_candidate=ModelCandidate(
model="Llama3.2-3B-Instruct",
sampling_params=SamplingParams(),
),
),
)
assert response.job_id == "0"
job_status = await eval_impl.job_status(response.job_id)
assert job_status and job_status.value == "completed"
eval_response = await eval_impl.job_result(response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
assert "meta-reference::subset_of" in eval_response.scores
assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores