mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
[Agentic Eval] add ability to run agents generation (#469)
# What does this PR do? - add ability to run agents generation for full eval (generate + scoring) - pre-register SimpleQA benchmark llm-as-judge scoring function in code ## Test Plan   #### Simple QA w/ Search  - eval_task_config_simpleqa_search.json ```json { "type": "benchmark", "eval_candidate": { "type": "agent", "config": { "model": "Llama3.1-405B-Instruct", "instructions": "Please use the search tool to answer the question.", "sampling_params": { "strategy": "greedy", "temperature": 1.0, "top_p": 0.9 }, "tools": [ { "type": "brave_search", "engine": "brave", "api_key": "API_KEY" } ], "tool_choice": "auto", "tool_prompt_format": "json", "input_shields": [], "output_shields": [], "enable_session_persistence": false } } } ``` #### SimpleQA w/o Search  ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests.
This commit is contained in:
parent
f1b9578f8d
commit
0784284ab5
6 changed files with 159 additions and 11 deletions
|
@ -9,6 +9,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
|
|||
from .....apis.common.job_types import Job
|
||||
from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
|
||||
from llama_stack.apis.common.type_system import * # noqa: F403
|
||||
from llama_stack.apis.agents import Agents
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
from llama_stack.apis.datasets import Datasets
|
||||
from llama_stack.apis.eval_tasks import EvalTask
|
||||
|
@ -39,12 +40,14 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
datasets_api: Datasets,
|
||||
scoring_api: Scoring,
|
||||
inference_api: Inference,
|
||||
agents_api: Agents,
|
||||
) -> None:
|
||||
self.config = config
|
||||
self.datasetio_api = datasetio_api
|
||||
self.datasets_api = datasets_api
|
||||
self.scoring_api = scoring_api
|
||||
self.inference_api = inference_api
|
||||
self.agents_api = agents_api
|
||||
|
||||
# TODO: assume sync job, will need jobs API for async scheduling
|
||||
self.jobs = {}
|
||||
|
@ -126,18 +129,50 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
self.jobs[job_id] = res
|
||||
return Job(job_id=job_id)
|
||||
|
||||
async def evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: EvalTaskConfig,
|
||||
) -> EvaluateResponse:
|
||||
async def _run_agent_generation(
|
||||
self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
|
||||
) -> List[Dict[str, Any]]:
|
||||
candidate = task_config.eval_candidate
|
||||
if candidate.type == "agent":
|
||||
raise NotImplementedError(
|
||||
"Evaluation with generation has not been implemented for agents"
|
||||
create_response = await self.agents_api.create_agent(candidate.config)
|
||||
agent_id = create_response.agent_id
|
||||
|
||||
generations = []
|
||||
for i, x in tqdm(enumerate(input_rows)):
|
||||
assert ColumnName.chat_completion_input.value in x, "Invalid input row"
|
||||
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
|
||||
input_messages = [UserMessage(**x) for x in input_messages]
|
||||
|
||||
# NOTE: only single-turn agent generation is supported. Create a new session for each input row
|
||||
session_create_response = await self.agents_api.create_agent_session(
|
||||
agent_id, f"session-{i}"
|
||||
)
|
||||
session_id = session_create_response.session_id
|
||||
|
||||
turn_request = dict(
|
||||
agent_id=agent_id,
|
||||
session_id=session_id,
|
||||
messages=input_messages,
|
||||
stream=True,
|
||||
)
|
||||
turn_response = [
|
||||
chunk
|
||||
async for chunk in await self.agents_api.create_agent_turn(
|
||||
**turn_request
|
||||
)
|
||||
]
|
||||
final_event = turn_response[-1].event.payload
|
||||
generations.append(
|
||||
{
|
||||
ColumnName.generated_answer.value: final_event.turn.output_message.content
|
||||
}
|
||||
)
|
||||
|
||||
return generations
|
||||
|
||||
async def _run_model_generation(
|
||||
self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
|
||||
) -> List[Dict[str, Any]]:
|
||||
candidate = task_config.eval_candidate
|
||||
assert (
|
||||
candidate.sampling_params.max_tokens is not None
|
||||
), "SamplingParams.max_tokens must be provided"
|
||||
|
@ -179,6 +214,23 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
else:
|
||||
raise ValueError("Invalid input row")
|
||||
|
||||
return generations
|
||||
|
||||
async def evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: EvalTaskConfig,
|
||||
) -> EvaluateResponse:
|
||||
candidate = task_config.eval_candidate
|
||||
if candidate.type == "agent":
|
||||
generations = await self._run_agent_generation(input_rows, task_config)
|
||||
elif candidate.type == "model":
|
||||
generations = await self._run_model_generation(input_rows, task_config)
|
||||
else:
|
||||
raise ValueError(f"Invalid candidate type: {candidate.type}")
|
||||
|
||||
# scoring with generated_answer
|
||||
score_input_rows = [
|
||||
input_r | generated_r
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue