[Evals API][10/n] API updates for EvalTaskDef + new test migration (#379)
* wip
* scoring fn api
* eval api
* eval task
* evaluate api update
* pre commit
* unwrap context -> config
* config field doc
* typo
* naming fix
* separate benchmark / app eval
* api name
* rename
* wip tests
* wip
* datasetio test
* delete unused
* fixture
* scoring resolve
* fix scoring register
* scoring test pass
* score batch
* scoring fix
* fix eval
* test eval works
* remove type ignore
* api refactor
* add default task_eval_id for routing
* add eval_id for jobs
* remove type ignore
* only keep 1 run_eval
* fix optional
* register task required
* register task required
* delete old tests
* delete old tests
* fixture return impl
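As a rough illustration of the flow these messages describe (task registration is now required before evaluation, a single run_eval entry point remains, and jobs carry an eval_id), a hypothetical client-side sketch follows. Every identifier in it is an assumption for illustration, not the API as merged; see the diff for the actual definitions.

# Hypothetical sketch only - all names below are assumptions, not the
# merged Eval API; they mirror the concepts named in the commit log above.

def run_app_eval(eval_tasks_api, eval_api):
    # "register task required": a task must be registered before use;
    # its id serves as the default task_eval_id used for routing.
    eval_tasks_api.register(
        eval_task_id="my-app-eval",
        dataset_id="my-dataset",
        scoring_functions=["equality"],
    )
    # "only keep 1 run_eval": one entry point starts the evaluation and
    # returns a job; "add eval_id for jobs" lets callers poll it later.
    job = eval_api.run_eval(task_id="my-app-eval")
    return eval_api.job_result(task_id="my-app-eval", job_id=job.job_id)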
parent 8350f2df4c
commit 6192bf43a4

32 changed files with 916 additions and 389 deletions
llama_stack/providers/tests/eval/conftest.py (new file, 72 lines)
@@ -0,0 +1,72 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import pytest

from ..conftest import get_provider_fixture_overrides

from ..datasetio.fixtures import DATASETIO_FIXTURES
from ..inference.fixtures import INFERENCE_FIXTURES
from ..scoring.fixtures import SCORING_FIXTURES
from .fixtures import EVAL_FIXTURES

DEFAULT_PROVIDER_COMBINATIONS = [
    pytest.param(
        {
            "eval": "meta_reference",
            "scoring": "meta_reference",
            "datasetio": "meta_reference",
            "inference": "fireworks",
        },
        id="meta_reference_eval_fireworks_inference",
        marks=pytest.mark.meta_reference_eval_fireworks_inference,
    ),
    pytest.param(
        {
            "eval": "meta_reference",
            "scoring": "meta_reference",
            "datasetio": "meta_reference",
            "inference": "together",
        },
        id="meta_reference_eval_together_inference",
        marks=pytest.mark.meta_reference_eval_together_inference,
    ),
]


def pytest_configure(config):
    for fixture_name in [
        "meta_reference_eval_fireworks_inference",
        "meta_reference_eval_together_inference",
    ]:
        config.addinivalue_line(
            "markers",
            f"{fixture_name}: marks tests as {fixture_name} specific",
        )


def pytest_addoption(parser):
    parser.addoption(
        "--inference-model",
        action="store",
        default="Llama3.2-3B-Instruct",
        help="Specify the inference model to use for testing",
    )


def pytest_generate_tests(metafunc):
    if "eval_stack" in metafunc.fixturenames:
        available_fixtures = {
            "eval": EVAL_FIXTURES,
            "scoring": SCORING_FIXTURES,
            "datasetio": DATASETIO_FIXTURES,
            "inference": INFERENCE_FIXTURES,
        }
        combinations = (
            get_provider_fixture_overrides(metafunc.config, available_fixtures)
            or DEFAULT_PROVIDER_COMBINATIONS
        )
        metafunc.parametrize("eval_stack", combinations, indirect=True)
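For context on how this conftest is consumed: pytest_generate_tests parametrizes any test requesting the eval_stack fixture, using either per-API provider overrides (resolved by get_provider_fixture_overrides from the test command line) or the two default combinations above. A minimal sketch of such a test follows; the file name, test body, and the use of pytest-asyncio are assumptions for illustration, not part of this diff.

# test_eval_smoke.py - hypothetical sketch; name and body are assumptions.
import pytest


class TestEvalStack:
    # eval_stack is parametrized indirectly by pytest_generate_tests in
    # the conftest above, so this test runs once per provider combination.
    @pytest.mark.asyncio  # assumes pytest-asyncio is in use for these tests
    async def test_stack_resolves(self, eval_stack):
        # The fixture (defined in .fixtures) resolves the eval/scoring/
        # datasetio/inference providers for the current combination.
        assert eval_stack is not None

A single combination can then be selected via its registered marker, e.g. pytest llama_stack/providers/tests/eval/ -m meta_reference_eval_together_inference --inference-model Llama3.2-3B-Instruct.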