From acd0b0f3f6db25f72993a578b078a696e3ea0c56 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 12 Nov 2024 11:49:48 -0500
Subject: [PATCH] fix eval tests

---
 llama_stack/providers/tests/eval/conftest.py  | 10 +++++-----
 llama_stack/providers/tests/eval/fixtures.py  |  2 +-
 llama_stack/providers/tests/eval/test_eval.py | 12 +++++-------
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py
index 985a8bc37..caf7f0290 100644
--- a/llama_stack/providers/tests/eval/conftest.py
+++ b/llama_stack/providers/tests/eval/conftest.py
@@ -17,8 +17,8 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "scoring": "basic",
+            "datasetio": "localfs",
             "inference": "fireworks",
         },
         id="meta_reference_eval_fireworks_inference",
@@ -27,8 +27,8 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
-            "datasetio": "meta_reference",
+            "scoring": "basic",
+            "datasetio": "localfs",
             "inference": "together",
         },
         id="meta_reference_eval_together_inference",
@@ -37,7 +37,7 @@ DEFAULT_PROVIDER_COMBINATIONS = [
     pytest.param(
         {
             "eval": "meta_reference",
-            "scoring": "meta_reference",
+            "scoring": "basic",
             "datasetio": "huggingface",
             "inference": "together",
         },
diff --git a/llama_stack/providers/tests/eval/fixtures.py b/llama_stack/providers/tests/eval/fixtures.py
index 810239440..4a359213b 100644
--- a/llama_stack/providers/tests/eval/fixtures.py
+++ b/llama_stack/providers/tests/eval/fixtures.py
@@ -24,7 +24,7 @@ def eval_meta_reference() -> ProviderFixture:
         providers=[
             Provider(
                 provider_id="meta-reference",
-                provider_type="meta-reference",
+                provider_type="inline::meta-reference",
                 config={},
             )
         ],
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index 92c4d0331..2d08aabe7 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -63,8 +63,7 @@ class Testeval:
         assert len(rows.rows) == 3

         scoring_functions = [
-            "meta-reference::llm_as_judge_base",
-            "meta-reference::equality",
+            "basic::equality",
         ]
         task_id = "meta-reference::app_eval"
         await eval_tasks_impl.register_eval_task(
@@ -95,8 +94,7 @@
             ),
         )
         assert len(response.generations) == 3
-        assert "meta-reference::equality" in response.scores
-        assert "meta-reference::llm_as_judge_base" in response.scores
+        assert "basic::equality" in response.scores

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack):
@@ -116,7 +114,7 @@
         )

         scoring_functions = [
-            "meta-reference::subset_of",
+            "basic::subset_of",
         ]

         task_id = "meta-reference::app_eval-2"
@@ -141,7 +139,7 @@

         assert eval_response is not None
         assert len(eval_response.generations) == 5
-        assert "meta-reference::subset_of" in eval_response.scores
+        assert "basic::subset_of" in eval_response.scores

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack):
@@ -182,7 +180,7 @@
         await eval_tasks_impl.register_eval_task(
             eval_task_id="meta-reference-mmlu",
             dataset_id="mmlu",
-            scoring_functions=["meta-reference::regex_parser_multiple_choice_answer"],
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         )

         # list benchmarks