mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 08:44:44 +00:00
test eval works
This commit is contained in:
parent
413a1b6d8b
commit
3f1ac29d57
3 changed files with 126 additions and 13 deletions
|
@ -16,6 +16,10 @@ from llama_stack.apis.datasetio import * # noqa: F403
|
||||||
from llama_stack.apis.scoring import * # noqa: F403
|
from llama_stack.apis.scoring import * # noqa: F403
|
||||||
from llama_stack.apis.eval import * # noqa: F403
|
from llama_stack.apis.eval import * # noqa: F403
|
||||||
|
|
||||||
|
from llama_stack.providers.inline.meta_reference.eval.eval import (
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class MemoryRouter(Memory):
|
class MemoryRouter(Memory):
|
||||||
"""Routes to an provider based on the memory bank identifier"""
|
"""Routes to an provider based on the memory bank identifier"""
|
||||||
|
@ -280,7 +284,13 @@ class EvalRouter(Eval):
|
||||||
eval_task_def: EvalTaskDef,
|
eval_task_def: EvalTaskDef,
|
||||||
eval_task_config: EvalTaskConfig,
|
eval_task_config: EvalTaskConfig,
|
||||||
) -> Job:
|
) -> Job:
|
||||||
pass
|
# NOTE: We need to use DEFAULT_EVAL_TASK_IDENTIFIER to make the router work for all app evals
|
||||||
|
return await self.routing_table.get_provider_impl(
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
).run_eval(
|
||||||
|
eval_task_def=eval_task_def,
|
||||||
|
eval_task_config=eval_task_config,
|
||||||
|
)
|
||||||
|
|
||||||
@webmethod(route="/eval/evaluate_rows", method="POST")
|
@webmethod(route="/eval/evaluate_rows", method="POST")
|
||||||
async def evaluate_rows(
|
async def evaluate_rows(
|
||||||
|
@ -289,13 +299,27 @@ class EvalRouter(Eval):
|
||||||
scoring_functions: List[str],
|
scoring_functions: List[str],
|
||||||
eval_task_config: EvalTaskConfig, # type: ignore
|
eval_task_config: EvalTaskConfig, # type: ignore
|
||||||
) -> EvaluateResponse:
|
) -> EvaluateResponse:
|
||||||
pass
|
# NOTE: This is to deal with the case where we do not pre-register an eval benchmark_task
|
||||||
|
# We use DEFAULT_EVAL_TASK_IDENTIFIER as the identifier
|
||||||
|
return await self.routing_table.get_provider_impl(
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
).evaluate_rows(
|
||||||
|
input_rows=input_rows,
|
||||||
|
scoring_functions=scoring_functions,
|
||||||
|
eval_task_config=eval_task_config,
|
||||||
|
)
|
||||||
|
|
||||||
async def job_status(self, job_id: str) -> Optional[JobStatus]:
|
async def job_status(self, job_id: str) -> Optional[JobStatus]:
|
||||||
pass
|
return await self.routing_table.get_provider_impl(
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
).job_status(job_id)
|
||||||
|
|
||||||
async def job_cancel(self, job_id: str) -> None:
|
async def job_cancel(self, job_id: str) -> None:
|
||||||
pass
|
await self.routing_table.get_provider_impl(
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
).job_cancel(job_id)
|
||||||
|
|
||||||
async def job_result(self, job_id: str) -> EvaluateResponse:
|
async def job_result(self, job_id: str) -> EvaluateResponse:
|
||||||
pass
|
return await self.routing_table.get_provider_impl(
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
).job_result(job_id)
|
||||||
|
|
|
@ -11,7 +11,7 @@ from .....apis.eval.eval import BenchmarkEvalTaskConfig
|
||||||
from llama_stack.apis.common.type_system import * # noqa: F403
|
from llama_stack.apis.common.type_system import * # noqa: F403
|
||||||
from llama_stack.apis.datasetio import DatasetIO
|
from llama_stack.apis.datasetio import DatasetIO
|
||||||
from llama_stack.apis.datasets import Datasets
|
from llama_stack.apis.datasets import Datasets
|
||||||
from llama_stack.apis.eval import * # noqa: F403
|
from llama_stack.apis.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
|
||||||
from llama_stack.apis.eval_tasks import EvalTaskDef
|
from llama_stack.apis.eval_tasks import EvalTaskDef
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import Inference
|
||||||
from llama_stack.apis.scoring import Scoring
|
from llama_stack.apis.scoring import Scoring
|
||||||
|
@ -19,6 +19,8 @@ from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
|
||||||
|
|
||||||
from .config import MetaReferenceEvalConfig
|
from .config import MetaReferenceEvalConfig
|
||||||
|
|
||||||
|
DEFAULT_EVAL_TASK_IDENTIFIER = "meta-reference::app_eval"
|
||||||
|
|
||||||
|
|
||||||
class ColumnName(Enum):
|
class ColumnName(Enum):
|
||||||
input_query = "input_query"
|
input_query = "input_query"
|
||||||
|
@ -50,9 +52,18 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
||||||
|
|
||||||
async def shutdown(self) -> None: ...
|
async def shutdown(self) -> None: ...
|
||||||
|
|
||||||
async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]:
|
async def list_eval_tasks(self) -> List[EvalTaskDef]:
|
||||||
print("HHHH")
|
# NOTE: In order to be routed to this provider, the eval task def must have
|
||||||
return []
|
# an EvalTaskDef with identifier defined as DEFAULT_EVAL_TASK_IDENTIFIER
|
||||||
|
# for app eval where eval task benchmark_id is not pre-registered
|
||||||
|
eval_tasks = [
|
||||||
|
EvalTaskDef(
|
||||||
|
identifier=DEFAULT_EVAL_TASK_IDENTIFIER,
|
||||||
|
dataset_id="",
|
||||||
|
scoring_functions=[],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
return eval_tasks
|
||||||
|
|
||||||
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
|
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
|
||||||
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
|
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
|
||||||
|
@ -98,10 +109,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
||||||
dataset_id=dataset_id,
|
dataset_id=dataset_id,
|
||||||
rows_in_page=-1,
|
rows_in_page=-1,
|
||||||
)
|
)
|
||||||
res = await self.evaluate(
|
res = await self.evaluate_rows(
|
||||||
input_rows=all_rows.rows,
|
input_rows=all_rows.rows,
|
||||||
candidate=candidate,
|
|
||||||
scoring_functions=scoring_functions,
|
scoring_functions=scoring_functions,
|
||||||
|
eval_task_config=eval_task_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: currently needs to wait for generation before returning
|
# TODO: currently needs to wait for generation before returning
|
||||||
|
@ -140,7 +151,10 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
elif ColumnName.chat_completion_input.value in x:
|
elif ColumnName.chat_completion_input.value in x:
|
||||||
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
|
chat_completion_input_str = str(
|
||||||
|
x[ColumnName.chat_completion_input.value]
|
||||||
|
)
|
||||||
|
input_messages = eval(chat_completion_input_str)
|
||||||
input_messages = [UserMessage(**x) for x in input_messages]
|
input_messages = [UserMessage(**x) for x in input_messages]
|
||||||
messages = []
|
messages = []
|
||||||
if candidate.system_message:
|
if candidate.system_message:
|
||||||
|
|
|
@ -7,6 +7,12 @@
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from llama_models.llama3.api import SamplingParams
|
||||||
|
|
||||||
|
from llama_stack.apis.eval.eval import AppEvalTaskConfig, EvalTaskDef, ModelCandidate
|
||||||
|
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
|
||||||
|
|
||||||
|
|
||||||
# How to run this test:
|
# How to run this test:
|
||||||
#
|
#
|
||||||
# pytest llama_stack/providers/tests/eval/test_eval.py
|
# pytest llama_stack/providers/tests/eval/test_eval.py
|
||||||
|
@ -22,4 +28,73 @@ class Testeval:
|
||||||
_, eval_tasks_impl, _, _, _, _ = eval_stack
|
_, eval_tasks_impl, _, _, _, _ = eval_stack
|
||||||
response = await eval_tasks_impl.list_eval_tasks()
|
response = await eval_tasks_impl.list_eval_tasks()
|
||||||
assert isinstance(response, list)
|
assert isinstance(response, list)
|
||||||
print(response)
|
assert len(response) >= 1
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_eval_evaluate_rows(self, eval_stack):
|
||||||
|
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
|
||||||
|
await register_dataset(
|
||||||
|
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
|
||||||
|
)
|
||||||
|
response = await datasets_impl.list_datasets()
|
||||||
|
assert len(response) == 1
|
||||||
|
rows = await datasetio_impl.get_rows_paginated(
|
||||||
|
dataset_id="test_dataset_for_eval",
|
||||||
|
rows_in_page=3,
|
||||||
|
)
|
||||||
|
assert len(rows.rows) == 3
|
||||||
|
|
||||||
|
scoring_functions = [
|
||||||
|
"meta-reference::llm_as_judge_8b_correctness",
|
||||||
|
"meta-reference::equality",
|
||||||
|
]
|
||||||
|
|
||||||
|
response = await eval_impl.evaluate_rows(
|
||||||
|
input_rows=rows.rows,
|
||||||
|
scoring_functions=scoring_functions,
|
||||||
|
eval_task_config=AppEvalTaskConfig(
|
||||||
|
eval_candidate=ModelCandidate(
|
||||||
|
model="Llama3.2-3B-Instruct",
|
||||||
|
sampling_params=SamplingParams(),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
assert len(response.generations) == 3
|
||||||
|
assert "meta-reference::llm_as_judge_8b_correctness" in response.scores
|
||||||
|
assert "meta-reference::equality" in response.scores
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_eval_run_eval(self, eval_stack):
|
||||||
|
eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
|
||||||
|
await register_dataset(
|
||||||
|
datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
|
||||||
|
)
|
||||||
|
|
||||||
|
scoring_functions = [
|
||||||
|
"meta-reference::llm_as_judge_8b_correctness",
|
||||||
|
"meta-reference::subset_of",
|
||||||
|
]
|
||||||
|
|
||||||
|
response = await eval_impl.run_eval(
|
||||||
|
eval_task_def=EvalTaskDef(
|
||||||
|
# NOTE: this is needed to make the router work for all app evals
|
||||||
|
identifier="meta-reference::app_eval",
|
||||||
|
dataset_id="test_dataset_for_eval",
|
||||||
|
scoring_functions=scoring_functions,
|
||||||
|
),
|
||||||
|
eval_task_config=AppEvalTaskConfig(
|
||||||
|
eval_candidate=ModelCandidate(
|
||||||
|
model="Llama3.2-3B-Instruct",
|
||||||
|
sampling_params=SamplingParams(),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
assert response.job_id == "0"
|
||||||
|
job_status = await eval_impl.job_status(response.job_id)
|
||||||
|
assert job_status and job_status.value == "completed"
|
||||||
|
eval_response = await eval_impl.job_result(response.job_id)
|
||||||
|
|
||||||
|
assert eval_response is not None
|
||||||
|
assert len(eval_response.generations) == 5
|
||||||
|
assert "meta-reference::subset_of" in eval_response.scores
|
||||||
|
assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue