test eval works

This commit is contained in:
Xi Yan 2024-11-06 21:40:38 -08:00
parent 413a1b6d8b
commit 3f1ac29d57
3 changed files with 126 additions and 13 deletions

View file

@@ -16,6 +16,10 @@ from llama_stack.apis.datasetio import * # noqa: F403
from llama_stack.apis.scoring import * # noqa: F403
from llama_stack.apis.eval import * # noqa: F403
from llama_stack.providers.inline.meta_reference.eval.eval import (
DEFAULT_EVAL_TASK_IDENTIFIER,
)
class MemoryRouter(Memory):
"""Routes to an provider based on the memory bank identifier"""
@@ -280,7 +284,13 @@ class EvalRouter(Eval):
eval_task_def: EvalTaskDef,
eval_task_config: EvalTaskConfig,
) -> Job:
pass
# NOTE: We need to use DEFAULT_EVAL_TASK_IDENTIFIER to make the router work for all app evals
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).run_eval(
eval_task_def=eval_task_def,
eval_task_config=eval_task_config,
)
@webmethod(route="/eval/evaluate_rows", method="POST")
async def evaluate_rows(
@@ -289,13 +299,27 @@ class EvalRouter(Eval):
scoring_functions: List[str],
eval_task_config: EvalTaskConfig, # type: ignore
) -> EvaluateResponse:
pass
# NOTE: This handles the case where an eval benchmark_task is not pre-registered;
# we fall back to DEFAULT_EVAL_TASK_IDENTIFIER as the identifier
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).evaluate_rows(
input_rows=input_rows,
scoring_functions=scoring_functions,
eval_task_config=eval_task_config,
)
async def job_status(self, job_id: str) -> Optional[JobStatus]:
pass
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_status(job_id)
async def job_cancel(self, job_id: str) -> None:
pass
await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_cancel(job_id)
async def job_result(self, job_id: str) -> EvaluateResponse:
pass
return await self.routing_table.get_provider_impl(
DEFAULT_EVAL_TASK_IDENTIFIER
).job_result(job_id)
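All of the EvalRouter methods above resolve a provider through the routing table with DEFAULT_EVAL_TASK_IDENTIFIER, so app evals that were never pre-registered still land on the meta-reference provider. Below is a minimal sketch of that lookup pattern, assuming a plain dict-backed table; SimpleRoutingTable is illustrative only and not the actual llama_stack routing table.

# Hypothetical sketch of the routing fallback used by EvalRouter above.
from typing import Any, Dict

DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"  # value added in this commit

class SimpleRoutingTable:
    """Illustrative table mapping eval task identifiers to provider implementations."""

    def __init__(self) -> None:
        self._impls: Dict[str, Any] = {}

    def register(self, identifier: str, impl: Any) -> None:
        self._impls[identifier] = impl

    def get_provider_impl(self, identifier: str) -> Any:
        # App evals without a pre-registered benchmark_id are routed with
        # DEFAULT_EVAL_TASK_IDENTIFIER, which the provider registers on startup.
        return self._impls[identifier]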

View file

@@ -11,7 +11,7 @@ from .....apis.eval.eval import BenchmarkEvalTaskConfig
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import * # noqa: F403
from llama_stack.apis.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
from llama_stack.apis.eval_tasks import EvalTaskDef
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring
@@ -19,6 +19,8 @@ from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
from .config import MetaReferenceEvalConfig
DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"
class ColumnName(Enum):
input_query = "input_query"
@@ -50,9 +52,18 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
async def shutdown(self) -> None: ...
async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]:
print("HHHH")
return []
async def list_eval_tasks(self) -> List[EvalTaskDef]:
# NOTE: In order to be routed to this provider, an eval task must be registered
# with an EvalTaskDef whose identifier is DEFAULT_EVAL_TASK_IDENTIFIER; this covers
# app evals where the eval task benchmark_id is not pre-registered
eval_tasks = [
EvalTaskDef(
identifier=DEFAULT_EVAL_TASK_IDENTIFIER,
dataset_id="",
scoring_functions=[],
)
]
return eval_tasks
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
@@ -98,10 +109,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
dataset_id=dataset_id,
rows_in_page=-1,
)
res = await self.evaluate(
res = await self.evaluate_rows(
input_rows=all_rows.rows,
candidate=candidate,
scoring_functions=scoring_functions,
eval_task_config=eval_task_config,
)
# TODO: currently needs to wait for generation before returning
@@ -140,7 +151,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
}
)
elif ColumnName.chat_completion_input.value in x:
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
chat_completion_input_str = str(
x[ColumnName.chat_completion_input.value]
)
input_messages = eval(chat_completion_input_str)
input_messages = [UserMessage(**x) for x in input_messages]
messages = []
if candidate.system_message:
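The chat_completion_input column carries a stringified list of message dicts, which the code above reconstructs with eval(). As a hedged alternative, assuming the column really is a Python-literal list of dicts, ast.literal_eval can do the same reconstruction without executing arbitrary code; parse_chat_completion_input is a hypothetical helper, not part of the diff.

# Hypothetical, safer parsing of the chat_completion_input column.
import ast
from typing import Any, Dict, List

def parse_chat_completion_input(raw: str) -> List[Dict[str, Any]]:
    """Parse a stringified list of message dicts without executing arbitrary code."""
    parsed = ast.literal_eval(raw)  # raises ValueError/SyntaxError on non-literal input
    if not isinstance(parsed, list):
        raise ValueError(f"expected a list of messages, got {type(parsed).__name__}")
    return parsed

# Example input shape, matching what the eval() call above expects:
#   "[{'role': 'user', 'content': 'What is the capital of France?'}]"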

View file

@@ -7,6 +7,12 @@
import pytest
from llama_models.llama3.api import SamplingParams
from llama_stack.apis.eval.eval import AppEvalTaskConfig, EvalTaskDef, ModelCandidate
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
# How to run this test:
#
# pytest llama_stack/providers/tests/eval/test_eval.py
@@ -22,4 +28,73 @@ class Testeval:
_, eval_tasks_impl, _, _, _, _ = eval_stack
response = await eval_tasks_impl.list_eval_tasks()
assert isinstance(response, list)
print(response)
assert len(response) >= 1
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack):
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
await register_dataset(
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
)
response = await datasets_impl.list_datasets()
assert len(response) == 1
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset_for_eval",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_functions = [
"meta-reference::llm_as_judge_8b_correctness",
"meta-reference::equality",
]
response = await eval_impl.evaluate_rows(
input_rows=rows.rows,
scoring_functions=scoring_functions,
eval_task_config=AppEvalTaskConfig(
eval_candidate=ModelCandidate(
model="Llama3.2-3B-Instruct",
sampling_params=SamplingParams(),
),
),
)
assert len(response.generations) == 3
assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
assert "meta-reference::equality" in response.scores
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack):
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
await register_dataset(
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
)
scoring_functions = [
"meta-reference::llm_as_judge_8b_correctness",
"meta-reference::subset_of",
]
response = await eval_impl.run_eval(
eval_task_def=EvalTaskDef(
# NOTE: this is needed to make the router work for all app evals
identifier="meta-reference::app_eval",
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
),
eval_task_config=AppEvalTaskConfig(
eval_candidate=ModelCandidate(
model="Llama3.2-3B-Instruct",
sampling_params=SamplingParams(),
),
),
)
assert response.job_id == "0"
job_status = await eval_impl.job_status(response.job_id)
assert job_status and job_status.value == "completed"
eval_response = await eval_impl.job_result(response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
assert "meta-reference::subset_of" in eval_response.scores
assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores