Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
migrate evals to resource (#421)

* migrate evals to resource
* remove listing of providers' evals
* change the order of params in register
* fix after rebase
* linter fix

Co-authored-by: Dinesh Yeduguru <dineshyv@fb.com>
parent b95cb5308f
commit 3802edfc50

5 changed files with 63 additions and 56 deletions
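The substance of the change, distilled from the test diff below: datasets and eval tasks are no longer described with *DefWithProvider objects that are then handed to the registry; they are registered directly as resources through keyword arguments. A minimal before/after sketch using only names that appear in the diff (task_id, scoring_functions, and the eval_tasks_impl fixture are the test's own):

    # before #421: build a definition object, then hand it to the registry
    task_def = EvalTaskDefWithProvider(
        identifier=task_id,
        dataset_id="test_dataset_for_eval",
        scoring_functions=scoring_functions,
        provider_id="meta-reference",
    )
    await eval_tasks_impl.register_eval_task(task_def)

    # after #421: register the eval task as a resource via keyword arguments
    await eval_tasks_impl.register_eval_task(
        eval_task_id=task_id,
        dataset_id="test_dataset_for_eval",
        scoring_functions=scoring_functions,
        provider_id="meta-reference",
    )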
@@ -11,12 +11,9 @@ from llama_models.llama3.api import SamplingParams, URL
 from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
-from llama_stack.apis.datasetio.datasetio import DatasetDefWithProvider
 from llama_stack.apis.eval.eval import (
     AppEvalTaskConfig,
     BenchmarkEvalTaskConfig,
-    EvalTaskDefWithProvider,
     ModelCandidate,
 )
 from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
@@ -70,13 +67,11 @@ class Testeval:
             "meta-reference::equality",
         ]
         task_id = "meta-reference::app_eval"
-        task_def = EvalTaskDefWithProvider(
-            identifier=task_id,
+        await eval_tasks_impl.register_eval_task(
+            eval_task_id=task_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
             provider_id="meta-reference",
         )
-        await eval_tasks_impl.register_eval_task(task_def)
         response = await eval_impl.evaluate_rows(
             task_id=task_id,
             input_rows=rows.rows,
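The evaluate_rows call above is cut off by the hunk. A hedged sketch of how the call plausibly continues, judging from the imports this file keeps (AppEvalTaskConfig, ModelCandidate, SamplingParams); the inference_model name and any keyword names beyond those shown are assumptions, not part of this diff:

    # sketch only: completing the evaluate_rows call under assumed keyword names
    response = await eval_impl.evaluate_rows(
        task_id=task_id,
        input_rows=rows.rows,
        scoring_functions=scoring_functions,   # assumed: the same list registered above
        task_config=AppEvalTaskConfig(         # assumed shape of the inline config
            eval_candidate=ModelCandidate(
                model=inference_model,         # hypothetical fixture name
                sampling_params=SamplingParams(),
            ),
        ),
    )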
@@ -125,13 +120,11 @@ class Testeval:
         ]

         task_id = "meta-reference::app_eval-2"
-        task_def = EvalTaskDefWithProvider(
-            identifier=task_id,
+        await eval_tasks_impl.register_eval_task(
+            eval_task_id=task_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
             provider_id="meta-reference",
         )
-        await eval_tasks_impl.register_eval_task(task_def)
         response = await eval_impl.run_eval(
             task_id=task_id,
             task_config=AppEvalTaskConfig(
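Unlike evaluate_rows, run_eval starts an evaluation job rather than returning scores inline. A hedged sketch of how the test might follow up on that job; the job_status/job_result method names and their taking of task_id are assumptions about the Eval API that this hunk does not show:

    # sketch only: polling the job started by run_eval (assumed API shape)
    assert response.job_id is not None
    status = await eval_impl.job_status(task_id, response.job_id)        # assumed signature
    assert status is not None
    eval_result = await eval_impl.job_result(task_id, response.job_id)   # assumed signature
    assert len(eval_result.generations) > 0                              # illustrative check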
@@ -169,35 +162,29 @@ class Testeval:
             pytest.skip(
                 "Only huggingface provider supports pre-registered remote datasets"
             )
         # register dataset
-        mmlu = DatasetDefWithProvider(
-            identifier="mmlu",
-            url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
-            dataset_schema={
+        await datasets_impl.register_dataset(
+            dataset_id="mmlu",
+            schema={
                 "input_query": StringType(),
                 "expected_answer": StringType(),
                 "chat_completion_input": ChatCompletionInputType(),
             },
+            url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
             metadata={
                 "path": "llamastack/evals",
                 "name": "evals__mmlu__details",
                 "split": "train",
             },
             provider_id="",
         )
-
-        await datasets_impl.register_dataset(mmlu)

         # register eval task
-        meta_reference_mmlu = EvalTaskDefWithProvider(
-            identifier="meta-reference-mmlu",
+        await eval_tasks_impl.register_eval_task(
+            eval_task_id="meta-reference-mmlu",
             dataset_id="mmlu",
             scoring_functions=["meta-reference::regex_parser_multiple_choice_answer"],
             provider_id="",
         )
-
-        await eval_tasks_impl.register_eval_task(meta_reference_mmlu)

         # list benchmarks
         response = await eval_tasks_impl.list_eval_tasks()
         assert len(response) > 0
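With the mmlu dataset and the meta-reference-mmlu task registered through the resource APIs, a natural follow-on is to run the benchmark. A sketch under the same caveats as above: BenchmarkEvalTaskConfig is imported at the top of this file, but the field names, the inference_model fixture, and num_examples are illustrative assumptions:

    # sketch only: exercising the newly registered benchmark (assumed config fields)
    response = await eval_impl.run_eval(
        task_id="meta-reference-mmlu",
        task_config=BenchmarkEvalTaskConfig(
            eval_candidate=ModelCandidate(
                model=inference_model,            # hypothetical fixture
                sampling_params=SamplingParams(),
            ),
            num_examples=3,                       # assumed: keep the smoke test small
        ),
    )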