diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index f77e03928..18c78b06c 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -16,6 +16,10 @@
 from llama_stack.apis.datasetio import *  # noqa: F403
 from llama_stack.apis.scoring import *  # noqa: F403
 from llama_stack.apis.eval import *  # noqa: F403
+from llama_stack.providers.inline.meta_reference.eval.eval import (
+    DEFAULT_EVAL_TASK_IDENTIFIER,
+)
+
 
 class MemoryRouter(Memory):
     """Routes to an provider based on the memory bank identifier"""
@@ -280,7 +284,13 @@ class EvalRouter(Eval):
         eval_task_def: EvalTaskDef,
         eval_task_config: EvalTaskConfig,
     ) -> Job:
-        pass
+        # NOTE: route via DEFAULT_EVAL_TASK_IDENTIFIER so the router works for all app evals
+        return await self.routing_table.get_provider_impl(
+            DEFAULT_EVAL_TASK_IDENTIFIER
+        ).run_eval(
+            eval_task_def=eval_task_def,
+            eval_task_config=eval_task_config,
+        )
 
     @webmethod(route="/eval/evaluate_rows", method="POST")
     async def evaluate_rows(
@@ -289,13 +299,27 @@ class EvalRouter(Eval):
         scoring_functions: List[str],
         eval_task_config: EvalTaskConfig,  # type: ignore
     ) -> EvaluateResponse:
-        pass
+        # NOTE: This handles the case where an eval benchmark task is not pre-registered;
+        # we fall back to DEFAULT_EVAL_TASK_IDENTIFIER as the routing identifier
+        return await self.routing_table.get_provider_impl(
+            DEFAULT_EVAL_TASK_IDENTIFIER
+        ).evaluate_rows(
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            eval_task_config=eval_task_config,
+        )
 
     async def job_status(self, job_id: str) -> Optional[JobStatus]:
-        pass
+        return await self.routing_table.get_provider_impl(
+            DEFAULT_EVAL_TASK_IDENTIFIER
+        ).job_status(job_id)
 
     async def job_cancel(self, job_id: str) -> None:
-        pass
+        await self.routing_table.get_provider_impl(
+            DEFAULT_EVAL_TASK_IDENTIFIER
+        ).job_cancel(job_id)
 
     async def job_result(self, job_id: str) -> EvaluateResponse:
-        pass
+        return await self.routing_table.get_provider_impl(
+            DEFAULT_EVAL_TASK_IDENTIFIER
+        ).job_result(job_id)
diff --git a/llama_stack/providers/inline/meta_reference/eval/eval.py b/llama_stack/providers/inline/meta_reference/eval/eval.py
index 28420ee35..38c5869c2 100644
--- a/llama_stack/providers/inline/meta_reference/eval/eval.py
+++ b/llama_stack/providers/inline/meta_reference/eval/eval.py
@@ -11,7 +11,7 @@ from .....apis.eval.eval import BenchmarkEvalTaskConfig
 from llama_stack.apis.common.type_system import *  # noqa: F403
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import *  # noqa: F403
+from llama_stack.apis.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
 from llama_stack.apis.eval_tasks import EvalTaskDef
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.scoring import Scoring
@@ -19,6 +19,8 @@ from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
 
 from .config import MetaReferenceEvalConfig
 
+DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"
+
 
 class ColumnName(Enum):
     input_query = "input_query"
@@ -50,9 +52,18 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
 
     async def shutdown(self) -> None: ...
 
-    async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]:
-        print("HHHH")
-        return []
+    async def list_eval_tasks(self) -> List[EvalTaskDef]:
+        # NOTE: To be routed to this provider, app evals (where the eval task
+        # benchmark_id is not pre-registered) must use an EvalTaskDef whose
+        # identifier is DEFAULT_EVAL_TASK_IDENTIFIER
+        eval_tasks = [
+            EvalTaskDef(
+                identifier=DEFAULT_EVAL_TASK_IDENTIFIER,
+                dataset_id="",
+                scoring_functions=[],
+            )
+        ]
+        return eval_tasks
 
     async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
         dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
@@ -98,10 +109,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
             dataset_id=dataset_id,
             rows_in_page=-1,
         )
-        res = await self.evaluate(
+        res = await self.evaluate_rows(
             input_rows=all_rows.rows,
-            candidate=candidate,
             scoring_functions=scoring_functions,
+            eval_task_config=eval_task_config,
         )
 
         # TODO: currently needs to wait for generation before returning
@@ -140,7 +151,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
                     }
                 )
             elif ColumnName.chat_completion_input.value in x:
-                input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
+                chat_completion_input_str = str(
+                    x[ColumnName.chat_completion_input.value]
+                )
+                input_messages = eval(chat_completion_input_str)
                 input_messages = [UserMessage(**x) for x in input_messages]
                 messages = []
                 if candidate.system_message:
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index cc14ccd1d..6fe6c07fb 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -7,6 +7,12 @@
 
 import pytest
 
+from llama_models.llama3.api import SamplingParams
+
+from llama_stack.apis.eval.eval import AppEvalTaskConfig, EvalTaskDef, ModelCandidate
+from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
+
+
 # How to run this test:
 #
 # pytest llama_stack/providers/tests/eval/test_eval.py
@@ -22,4 +28,73 @@ class Testeval:
         _, eval_tasks_impl, _, _, _, _ = eval_stack
         response = await eval_tasks_impl.list_eval_tasks()
         assert isinstance(response, list)
-        print(response)
+        assert len(response) >= 1
+
+    @pytest.mark.asyncio
+    async def test_eval_evaluate_rows(self, eval_stack):
+        eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
+        await register_dataset(
+            datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
+        )
+        response = await datasets_impl.list_datasets()
+        assert len(response) == 1
+        rows = await datasetio_impl.get_rows_paginated(
+            dataset_id="test_dataset_for_eval",
+            rows_in_page=3,
+        )
+        assert len(rows.rows) == 3
+
+        scoring_functions = [
+            "meta-reference::llm_as_judge_8b_correctness",
+            "meta-reference::equality",
+        ]
+
+        response = await eval_impl.evaluate_rows(
+            input_rows=rows.rows,
+            scoring_functions=scoring_functions,
+            eval_task_config=AppEvalTaskConfig(
+                eval_candidate=ModelCandidate(
+                    model="Llama3.2-3B-Instruct",
+                    sampling_params=SamplingParams(),
+                ),
+            ),
+        )
+        assert len(response.generations) == 3
+        assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
+        assert "meta-reference::equality" in response.scores
+
+    @pytest.mark.asyncio
+    async def test_eval_run_eval(self, eval_stack):
+        eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
+        await register_dataset(
+            datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
+        )
+
+        scoring_functions = [
+            "meta-reference::llm_as_judge_8b_correctness",
+            "meta-reference::subset_of",
+        ]
+
+        response = await eval_impl.run_eval(
+            eval_task_def=EvalTaskDef(
+                # NOTE: this is needed to make the router work for all app evals
+                identifier="meta-reference::app_eval",
+                dataset_id="test_dataset_for_eval",
+                scoring_functions=scoring_functions,
+            ),
+            eval_task_config=AppEvalTaskConfig(
+                eval_candidate=ModelCandidate(
+                    model="Llama3.2-3B-Instruct",
+                    sampling_params=SamplingParams(),
+                ),
+            ),
+        )
+        assert response.job_id == "0"
+        job_status = await eval_impl.job_status(response.job_id)
+        assert job_status and job_status.value == "completed"
+        eval_response = await eval_impl.job_result(response.job_id)
+
+        assert eval_response is not None
+        assert len(eval_response.generations) == 5
+        assert "meta-reference::subset_of" in eval_response.scores
+        assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores