mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 00:34:44 +00:00
test eval works
This commit is contained in:
parent
413a1b6d8b
commit
3f1ac29d57
3 changed files with 126 additions and 13 deletions
|
@ -16,6 +16,10 @@ from llama_stack.apis.datasetio import * # noqa: F403
|
|||
from llama_stack.apis.scoring import * # noqa: F403
|
||||
from llama_stack.apis.eval import * # noqa: F403
|
||||
|
||||
from llama_stack.providers.inline.meta_reference.eval.eval import (
|
||||
DEFAULT_EVAL_TASK_IDENTIFIER,
|
||||
)
|
||||
|
||||
|
||||
class MemoryRouter(Memory):
|
||||
"""Routes to an provider based on the memory bank identifier"""
|
||||
|
@ -280,7 +284,13 @@ class EvalRouter(Eval):
|
|||
eval_task_def: EvalTaskDef,
|
||||
eval_task_config: EvalTaskConfig,
|
||||
) -> Job:
|
||||
pass
|
||||
# NOTE: We need to use DEFAULT_EVAL_TASK_IDENTIFIER to make the router work for all app evals
|
||||
return await self.routing_table.get_provider_impl(
|
||||
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||
).run_eval(
|
||||
eval_task_def=eval_task_def,
|
||||
eval_task_config=eval_task_config,
|
||||
)
|
||||
|
||||
@webmethod(route="/eval/evaluate_rows", method="POST")
|
||||
async def evaluate_rows(
|
||||
|
@ -289,13 +299,27 @@ class EvalRouter(Eval):
|
|||
scoring_functions: List[str],
|
||||
eval_task_config: EvalTaskConfig, # type: ignore
|
||||
) -> EvaluateResponse:
|
||||
pass
|
||||
# NOTE: This is to deal with the case where we do not pre-register an eval benchmark_task
|
||||
# We use DEFAULT_EVAL_TASK_IDENTIFIER as the identifier
|
||||
return await self.routing_table.get_provider_impl(
|
||||
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||
).evaluate_rows(
|
||||
input_rows=input_rows,
|
||||
scoring_functions=scoring_functions,
|
||||
eval_task_config=eval_task_config,
|
||||
)
|
||||
|
||||
async def job_status(self, job_id: str) -> Optional[JobStatus]:
    """Return the status of eval job ``job_id``.

    The request is forwarded to the provider that the routing table resolves
    for DEFAULT_EVAL_TASK_IDENTIFIER.
    """
    provider = self.routing_table.get_provider_impl(DEFAULT_EVAL_TASK_IDENTIFIER)
    return await provider.job_status(job_id)
|
||||
|
||||
async def job_cancel(self, job_id: str) -> None:
    """Cancel eval job ``job_id``.

    The request is forwarded to the provider that the routing table resolves
    for DEFAULT_EVAL_TASK_IDENTIFIER; nothing is returned.
    """
    provider = self.routing_table.get_provider_impl(DEFAULT_EVAL_TASK_IDENTIFIER)
    await provider.job_cancel(job_id)
|
||||
|
||||
async def job_result(self, job_id: str) -> EvaluateResponse:
    """Return the result of eval job ``job_id``.

    The request is forwarded to the provider that the routing table resolves
    for DEFAULT_EVAL_TASK_IDENTIFIER.
    """
    provider = self.routing_table.get_provider_impl(DEFAULT_EVAL_TASK_IDENTIFIER)
    return await provider.job_result(job_id)
|
||||
|
|
|
@ -11,7 +11,7 @@ from .....apis.eval.eval import BenchmarkEvalTaskConfig
|
|||
from llama_stack.apis.common.type_system import * # noqa: F403
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
from llama_stack.apis.datasets import Datasets
|
||||
from llama_stack.apis.eval import * # noqa: F403
|
||||
from llama_stack.apis.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
|
||||
from llama_stack.apis.eval_tasks import EvalTaskDef
|
||||
from llama_stack.apis.inference import Inference
|
||||
from llama_stack.apis.scoring import Scoring
|
||||
|
@ -19,6 +19,8 @@ from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
|
|||
|
||||
from .config import MetaReferenceEvalConfig
|
||||
|
||||
# Identifier of the eval task that this provider reports from list_eval_tasks().
# NOTE(review): the eval router appears to import this constant to look up the
# provider for app evals that have no pre-registered task — confirm against the
# router module before renaming or changing the value.
DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"
|
||||
|
||||
|
||||
class ColumnName(Enum):
|
||||
input_query = "input_query"
|
||||
|
@ -50,9 +52,18 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
|
||||
async def shutdown(self) -> None: ...
|
||||
|
||||
async def list_eval_tasks(self) -> List[EvalTaskDef]:
    """Return the eval tasks this provider exposes by default.

    The list holds a single task whose identifier is
    DEFAULT_EVAL_TASK_IDENTIFIER, with an empty dataset id and no scoring
    functions.
    """
    # NOTE: To be routed to this provider, an eval task must be registered
    # under DEFAULT_EVAL_TASK_IDENTIFIER. This covers app evals, where the
    # eval task benchmark_id is not pre-registered.
    return [
        EvalTaskDef(
            identifier=DEFAULT_EVAL_TASK_IDENTIFIER,
            dataset_id="",
            scoring_functions=[],
        )
    ]
|
||||
|
||||
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
|
||||
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
|
||||
|
@ -98,10 +109,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
dataset_id=dataset_id,
|
||||
rows_in_page=-1,
|
||||
)
|
||||
res = await self.evaluate(
|
||||
res = await self.evaluate_rows(
|
||||
input_rows=all_rows.rows,
|
||||
candidate=candidate,
|
||||
scoring_functions=scoring_functions,
|
||||
eval_task_config=eval_task_config,
|
||||
)
|
||||
|
||||
# TODO: currently needs to wait for generation before returning
|
||||
|
@ -140,7 +151,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
|||
}
|
||||
)
|
||||
elif ColumnName.chat_completion_input.value in x:
|
||||
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
|
||||
chat_completion_input_str = str(
|
||||
x[ColumnName.chat_completion_input.value]
|
||||
)
|
||||
input_messages = eval(chat_completion_input_str)
|
||||
input_messages = [UserMessage(**x) for x in input_messages]
|
||||
messages = []
|
||||
if candidate.system_message:
|
||||
|
|
|
@ -7,6 +7,12 @@
|
|||
|
||||
import pytest
|
||||
|
||||
from llama_models.llama3.api import SamplingParams
|
||||
|
||||
from llama_stack.apis.eval.eval import AppEvalTaskConfig, EvalTaskDef, ModelCandidate
|
||||
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
|
||||
|
||||
|
||||
# How to run this test:
|
||||
#
|
||||
# pytest llama_stack/providers/tests/eval/test_eval.py
|
||||
|
@ -22,4 +28,73 @@ class Testeval:
|
|||
_, eval_tasks_impl, _, _, _, _ = eval_stack
|
||||
response = await eval_tasks_impl.list_eval_tasks()
|
||||
assert isinstance(response, list)
|
||||
print(response)
|
||||
assert len(response) >= 1
|
||||
|
||||
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack):
    """Evaluate three dataset rows directly and check generations and scores."""
    eval_impl, _, _, _, datasetio_impl, datasets_impl = eval_stack
    dataset_id = "test_dataset_for_eval"

    await register_dataset(
        datasets_impl, for_generation=True, dataset_id=dataset_id
    )
    registered = await datasets_impl.list_datasets()
    assert len(registered) == 1

    page = await datasetio_impl.get_rows_paginated(
        dataset_id=dataset_id,
        rows_in_page=3,
    )
    assert len(page.rows) == 3

    scoring_functions = [
        "meta-reference::llm_as_judge_8b_correctness",
        "meta-reference::equality",
    ]
    candidate = ModelCandidate(
        model="Llama3.2-3B-Instruct",
        sampling_params=SamplingParams(),
    )

    result = await eval_impl.evaluate_rows(
        input_rows=page.rows,
        scoring_functions=scoring_functions,
        eval_task_config=AppEvalTaskConfig(eval_candidate=candidate),
    )

    assert len(result.generations) == 3
    # Every requested scoring function must be present in the scores.
    for scoring_fn in (
        "meta-reference::llm_as_judge_8b_correctness",
        "meta-reference::equality",
    ):
        assert scoring_fn in result.scores
|
||||
|
||||
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack):
    """Run an eval job over the registered dataset and check its status and result."""
    eval_impl, _, _, _, _, datasets_impl = eval_stack
    dataset_id = "test_dataset_for_eval"

    await register_dataset(
        datasets_impl, for_generation=True, dataset_id=dataset_id
    )

    scoring_functions = [
        "meta-reference::llm_as_judge_8b_correctness",
        "meta-reference::subset_of",
    ]
    task_def = EvalTaskDef(
        # NOTE: this is needed to make the router work for all app evals
        identifier="meta-reference::app_eval",
        dataset_id=dataset_id,
        scoring_functions=scoring_functions,
    )
    task_config = AppEvalTaskConfig(
        eval_candidate=ModelCandidate(
            model="Llama3.2-3B-Instruct",
            sampling_params=SamplingParams(),
        ),
    )

    job = await eval_impl.run_eval(
        eval_task_def=task_def,
        eval_task_config=task_config,
    )
    assert job.job_id == "0"

    status = await eval_impl.job_status(job.job_id)
    assert status
    assert status.value == "completed"

    outcome = await eval_impl.job_result(job.job_id)
    assert outcome is not None
    assert len(outcome.generations) == 5
    # Checked in the same order as before: subset_of first, then the judge.
    for scoring_fn in (
        "meta-reference::subset_of",
        "meta-reference::llm_as_judge_8b_correctness",
    ):
        assert scoring_fn in outcome.scores
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue