[Agentic Eval] add ability to run agents generation (#469)

# What does this PR do? - add ability to run agents generation for full eval (generate + scoring) - pre-register SimpleQA benchmark llm-as-judge scoring function in code ## Test Plan ![image](https://github.com/user-attachments/assets/b4b6f086-1be4-4c2a-8ab0-6839f0067c0a) ![image](https://github.com/user-attachments/assets/05bb7a09-2d7a-4031-8eb6-e1ca670ee439) #### Simple QA w/ Search ![image](https://github.com/user-attachments/assets/0a51e3f3-9fc7-479b-8295-89aed63496e0) - eval_task_config_simpleqa_search.json ```json { "type": "benchmark", "eval_candidate": { "type": "agent", "config": { "model": "Llama3.1-405B-Instruct", "instructions": "Please use the search tool to answer the question.", "sampling_params": { "strategy": "greedy", "temperature": 1.0, "top_p": 0.9 }, "tools": [ { "type": "brave_search", "engine": "brave", "api_key": "API_KEY" } ], "tool_choice": "auto", "tool_prompt_format": "json", "input_shields": [], "output_shields": [], "enable_session_persistence": false } } } ``` #### SimpleQA w/o Search ![image](https://github.com/user-attachments/assets/6301feef-2abb-4bee-b50c-97da1c90482b) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests.
2025-12-04 02:03:44 +00:00 · 2024-11-18 11:43:03 -08:00 · 2024-11-18 11:43:03 -08:00 · 0784284ab5
commit 0784284ab5
parent f1b9578f8d
6 changed files with 159 additions and 11 deletions
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -9,6 +9,7 @@ from llama_models.llama3.api.datatypes import *  # noqa: F403
 from .....apis.common.job_types import Job
 from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
 from llama_stack.apis.common.type_system import *  # noqa: F403
+from llama_stack.apis.agents import Agents
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval_tasks import EvalTask
@ -39,12 +40,14 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
+        agents_api: Agents,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api
+        self.agents_api = agents_api

        # TODO: assume sync job, will need jobs API for async scheduling
        self.jobs = {}
@ -126,18 +129,50 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
        self.jobs[job_id] = res
        return Job(job_id=job_id)

-    async def evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: EvalTaskConfig,
-    ) -> EvaluateResponse:
+    async def _run_agent_generation(
+        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+    ) -> List[Dict[str, Any]]:
        candidate = task_config.eval_candidate
-        if candidate.type == "agent":
-            raise NotImplementedError(
-                "Evaluation with generation has not been implemented for agents"
+        create_response = await self.agents_api.create_agent(candidate.config)
+        agent_id = create_response.agent_id
+
+        generations = []
+        for i, x in tqdm(enumerate(input_rows)):
+            assert ColumnName.chat_completion_input.value in x, "Invalid input row"
+            input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
+            input_messages = [UserMessage(**x) for x in input_messages]
+
+            # NOTE: only single-turn agent generation is supported. Create a new session for each input row
+            session_create_response = await self.agents_api.create_agent_session(
+                agent_id, f"session-{i}"
            )
+            session_id = session_create_response.session_id
+
+            turn_request = dict(
+                agent_id=agent_id,
+                session_id=session_id,
+                messages=input_messages,
+                stream=True,
+            )
+            turn_response = [
+                chunk
+                async for chunk in await self.agents_api.create_agent_turn(
+                    **turn_request
+                )
+            ]
+            final_event = turn_response[-1].event.payload
+            generations.append(
+                {
+                    ColumnName.generated_answer.value: final_event.turn.output_message.content
+                }
+            )
+
+        return generations
+
+    async def _run_model_generation(
+        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+    ) -> List[Dict[str, Any]]:
+        candidate = task_config.eval_candidate
        assert (
            candidate.sampling_params.max_tokens is not None
        ), "SamplingParams.max_tokens must be provided"
@ -179,6 +214,23 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
            else:
                raise ValueError("Invalid input row")

+        return generations
+
+    async def evaluate_rows(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: EvalTaskConfig,
+    ) -> EvaluateResponse:
+        candidate = task_config.eval_candidate
+        if candidate.type == "agent":
+            generations = await self._run_agent_generation(input_rows, task_config)
+        elif candidate.type == "model":
+            generations = await self._run_model_generation(input_rows, task_config)
+        else:
+            raise ValueError(f"Invalid candidate type: {candidate.type}")
+
        # scoring with generated_answer
        score_input_rows = [
            input_r | generated_r