Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-08 14:54:35 +00:00)
fix eval task registration (#426)
* fix eval tasks
* fix eval tasks
* fix eval tests
parent 84c6fbbd93
commit ec4fcad5ca
4 changed files with 16 additions and 13 deletions
@@ -17,8 +17,8 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "scoring": "basic",
+            "datasetio": "localfs",
             "inference": "fireworks",
         },
         id="meta_reference_eval_fireworks_inference",
@@ -27,8 +27,8 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "scoring": "basic",
+            "datasetio": "localfs",
             "inference": "together",
         },
         id="meta_reference_eval_together_inference",
@@ -37,7 +37,7 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
+            "scoring": "basic",
             "datasetio": "huggingface",
             "inference": "together",
         },
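Read together, the three hunks above switch the scoring provider from meta_reference to basic and the datasetio provider from meta_reference to localfs (or huggingface). For reference, one DEFAULT_PROVIDER_COMBINATIONS entry as it stands after the patch, reconstructed from the first hunk; the enclosing list and the pytest import are assumed surrounding context, not part of the diff:

import pytest

DEFAULT_PROVIDER_COMBINATIONS = [
    pytest.param(
        {
            "eval": "meta_reference",
            "scoring": "basic",       # was "meta_reference" before this patch
            "datasetio": "localfs",   # was "meta_reference" before this patch
            "inference": "fireworks",
        },
        id="meta_reference_eval_fireworks_inference",
    ),
]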
@@ -24,7 +24,7 @@ def eval_meta_reference() -> ProviderFixture:
         providers=[
             Provider(
                 provider_id="meta-reference",
-                provider_type="meta-reference",
+                provider_type="inline::meta-reference",
                 config={},
             )
         ],
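The fixtures hunk only renames the provider type to its namespaced inline form. A minimal sketch of the resulting provider declaration; the import path for Provider is an assumption, since it is not shown in this diff:

from llama_stack.distribution.datatypes import Provider  # assumed import path

eval_provider = Provider(
    provider_id="meta-reference",
    provider_type="inline::meta-reference",  # was "meta-reference" before this patch
    config={},
)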
@@ -63,8 +63,7 @@ class Testeval:
         assert len(rows.rows) == 3

         scoring_functions = [
-            "meta-reference::llm_as_judge_base",
-            "meta-reference::equality",
+            "basic::equality",
         ]
         task_id = "meta-reference::app_eval"
         await eval_tasks_impl.register_eval_task(
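With the scoring functions renamed, the test registers the app eval task against the basic scoring provider. A hedged sketch of that registration call, with parameter names taken from the later benchmark hunk; the dataset_id value is not visible in this diff and is left symbolic:

scoring_functions = [
    "basic::equality",
]
task_id = "meta-reference::app_eval"
await eval_tasks_impl.register_eval_task(
    eval_task_id=task_id,
    dataset_id=dataset_id,  # registered earlier in the test; value not shown here
    scoring_functions=scoring_functions,
)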
@@ -95,8 +94,7 @@ class Testeval:
             ),
         )
         assert len(response.generations) == 3
-        assert "meta-reference::equality" in response.scores
-        assert "meta-reference::llm_as_judge_base" in response.scores
+        assert "basic::equality" in response.scores

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack):
@@ -116,7 +114,7 @@ class Testeval:
         )

         scoring_functions = [
-            "meta-reference::subset_of",
+            "basic::subset_of",
         ]

         task_id = "meta-reference::app_eval-2"
@@ -141,7 +139,7 @@ class Testeval:

         assert eval_response is not None
         assert len(eval_response.generations) == 5
-        assert "meta-reference::subset_of" in eval_response.scores
+        assert "basic::subset_of" in eval_response.scores

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack):
@@ -182,7 +180,7 @@ class Testeval:
         await eval_tasks_impl.register_eval_task(
             eval_task_id="meta-reference-mmlu",
             dataset_id="mmlu",
-            scoring_functions=["meta-reference::regex_parser_multiple_choice_answer"],
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         )

         # list benchmarks
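The final hunk ends just before the test lists the registered benchmarks. A hedged sketch of what such a check could look like; the list_eval_tasks method name and the identifier attribute are assumptions, not confirmed by this diff:

response = await eval_tasks_impl.list_eval_tasks()
assert any(task.identifier == "meta-reference-mmlu" for task in response)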