diff --git a/llama_stack/providers/inline/meta_reference/scoring/scoring.py b/llama_stack/providers/inline/meta_reference/scoring/scoring.py
index 3fbd219fb..1ee617f8a 100644
--- a/llama_stack/providers/inline/meta_reference/scoring/scoring.py
+++ b/llama_stack/providers/inline/meta_reference/scoring/scoring.py
@@ -58,7 +58,6 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
             for impl in self.scoring_fn_id_impls.values()
             for fn_def in impl.get_supported_scoring_fn_defs()
         ]
-        print("!!!", scoring_fn_defs_list)
 
         for f in scoring_fn_defs_list:
             assert f.identifier.startswith(
diff --git a/llama_stack/providers/tests/datasetio/fixtures.py b/llama_stack/providers/tests/datasetio/fixtures.py
index 7d7615b55..7a389e4d1 100644
--- a/llama_stack/providers/tests/datasetio/fixtures.py
+++ b/llama_stack/providers/tests/datasetio/fixtures.py
@@ -31,7 +31,20 @@ def datasetio_meta_reference() -> ProviderFixture:
     )
 
 
-DATASETIO_FIXTURES = ["meta_reference", "remote"]
+@pytest.fixture(scope="session")
+def datasetio_huggingface() -> ProviderFixture:
+    return ProviderFixture(
+        providers=[
+            Provider(
+                provider_id="huggingface",
+                provider_type="huggingface",
+                config={},
+            )
+        ],
+    )
+
+
+DATASETIO_FIXTURES = ["meta_reference", "remote", "huggingface"]
 
 
 @pytest_asyncio.fixture(scope="session")
diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py
index 064feb611..985a8bc37 100644
--- a/llama_stack/providers/tests/eval/conftest.py
+++ b/llama_stack/providers/tests/eval/conftest.py
@@ -34,6 +34,16 @@ DEFAULT_PROVIDER_COMBINATIONS = [
         id="meta_reference_eval_together_inference",
         marks=pytest.mark.meta_reference_eval_together_inference,
     ),
+    pytest.param(
+        {
+            "eval": "meta_reference",
+            "scoring": "meta_reference",
+            "datasetio": "huggingface",
+            "inference": "together",
+        },
+        id="meta_reference_eval_together_inference_huggingface_datasetio",
+        marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio,
+    ),
 ]
 
 
@@ -41,6 +51,7 @@ def pytest_configure(config):
     for fixture_name in [
         "meta_reference_eval_fireworks_inference",
         "meta_reference_eval_together_inference",
+        "meta_reference_eval_together_inference_huggingface_datasetio",
     ]:
         config.addinivalue_line(
             "markers",
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index 6fe6c07fb..80a715bd2 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -98,3 +98,32 @@ class Testeval:
         assert len(eval_response.generations) == 5
         assert "meta-reference::subset_of" in eval_response.scores
         assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores
+
+    @pytest.mark.asyncio
+    async def test_eval_run_benchmark_eval(self, eval_stack):
+        eval_impl, eval_tasks_impl, _, _, datasetio_impl, datasets_impl = eval_stack
+        response = await datasets_impl.list_datasets()
+        assert len(response) == 1
+
+        rows = await datasetio_impl.get_rows_paginated(
+            dataset_id="llamastack_mmlu",
+            rows_in_page=3,
+        )
+        assert len(rows.rows) == 3
+
+        scoring_functions = [
+            "meta-reference::regex_parser_multiple_choice_answer",
+        ]
+
+        response = await eval_impl.evaluate_rows(
+            input_rows=rows.rows,
+            scoring_functions=scoring_functions,
+            eval_task_config=AppEvalTaskConfig(
+                eval_candidate=ModelCandidate(
+                    model="Llama3.2-3B-Instruct",
+                    sampling_params=SamplingParams(),
+                ),
+            ),
+        )
+
+        print(response)
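
The new test_eval_run_benchmark_eval ends with print(response) rather than asserting on the result. A minimal sketch of a stronger check, modeled on the assertions in the existing test_eval_evaluate_rows shown above, could look like the snippet below; it assumes the benchmark response exposes the same generations/scores fields as that earlier test, which this diff does not confirm.

    # Sketch only (assumed fields): `response` is the value returned by
    # eval_impl.evaluate_rows(...) in test_eval_run_benchmark_eval.
    assert len(response.generations) == 3  # one generation per row fetched above
    assert "meta-reference::regex_parser_multiple_choice_answer" in response.scores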