Xi Yan 2024-11-18 11:42:02 -08:00
parent 0d8de1c768
commit ba5d755848
3 changed files with 5 additions and 5 deletions

@@ -133,7 +133,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
         self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
     ) -> List[Dict[str, Any]]:
         candidate = task_config.eval_candidate
-        create_response = await self.agent_api.create_agent(candidate.config)
+        create_response = await self.agents_api.create_agent(candidate.config)
         agent_id = create_response.agent_id
         generations = []
@@ -143,7 +143,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
             input_messages = [UserMessage(**x) for x in input_messages]
             # NOTE: only single-turn agent generation is supported. Create a new session for each input row
-            session_create_response = await self.agent_api.create_agent_session(
+            session_create_response = await self.agents_api.create_agent_session(
                 agent_id, f"session-{i}"
             )
             session_id = session_create_response.session_id
@@ -156,7 +156,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
             )
             turn_response = [
                 chunk
-                async for chunk in await self.agent_api.create_agent_turn(
+                async for chunk in await self.agents_api.create_agent_turn(
                    **turn_request
                )
            ]
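
The three hunks above all touch the same single-turn agent generation path. The sketch below is a minimal reconstruction of that flow, not the repository's code: it assumes only the Agents API surface visible in the diff (create_agent, create_agent_session, and a streaming create_agent_turn); the function name generate_single_turn, the generic typing, and the keyword arguments passed to create_agent_turn are illustrative assumptions.

# Minimal sketch of the flow above; assumes an Agents API object exposing the
# three coroutines used in the diff. Names and the create_agent_turn keyword
# arguments are illustrative assumptions, not the repository's exact code.
from typing import Any, Dict, List


async def generate_single_turn(
    agents_api: Any, agent_config: Any, input_rows: List[List[Dict[str, Any]]]
) -> List[List[Any]]:
    create_response = await agents_api.create_agent(agent_config)
    agent_id = create_response.agent_id

    generations = []
    for i, input_messages in enumerate(input_rows):
        # Only single-turn generation is supported, so each row gets a fresh session.
        session = await agents_api.create_agent_session(agent_id, f"session-{i}")
        chunks = [
            chunk
            async for chunk in await agents_api.create_agent_turn(
                agent_id=agent_id,
                session_id=session.session_id,
                messages=input_messages,
                stream=True,
            )
        ]
        generations.append(chunks)
    return generations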

@@ -78,7 +78,7 @@ Just return the letters "A", "B", or "C", with no text around it.
 llm_as_judge_405b_simpleqa = ScoringFn(
-    identifier="llm-as-judge::llm_as_judge_405b_simpleqa",
+    identifier="llm-as-judge::405b-simpleqa",
     description="Llm As Judge Scoring Function for SimpleQA Benchmark (https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py)",
     return_type=NumberType(),
     provider_id="llm-as-judge",
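
The rename above keeps the "<provider_id>::<name>" identifier convention visible in the diff and simply drops the redundant llm_as_judge_ prefix from the name part. A tiny illustrative check (not part of the commit):

# Illustrative only: scoring-function identifiers are "<provider_id>::<name>".
provider_id, name = "llm-as-judge::405b-simpleqa".split("::")
assert provider_id == "llm-as-judge"
assert name == "405b-simpleqa"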

@@ -9,7 +9,7 @@ from llama_stack.apis.scoring_functions import ScoringFn
 llm_as_judge_base = ScoringFn(
-    identifier="llm-as-judge::llm_as_judge_base",
+    identifier="llm-as-judge::base",
     description="Llm As Judge Scoring Function",
     return_type=NumberType(),
     provider_id="llm-as-judge",
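
Scoring functions are referenced by identifier, so any caller that used the old names needs to switch to the shortened ones. A hypothetical caller-side helper (not from this commit) summarizing both renames:

# Hypothetical migration map covering the two renames in this commit.
RENAMED_SCORING_FN_IDS = {
    "llm-as-judge::llm_as_judge_405b_simpleqa": "llm-as-judge::405b-simpleqa",
    "llm-as-judge::llm_as_judge_base": "llm-as-judge::base",
}


def migrate_id(identifier: str) -> str:
    # Return the new identifier if this one was renamed, otherwise pass it through.
    return RENAMED_SCORING_FN_IDS.get(identifier, identifier)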