Merge branch 'eval_task_register' into mmlu_benchmark

2025-10-16 14:57:20 +00:00 · 2024-11-07 14:41:50 -08:00 · 2024-11-07 14:41:50 -08:00 · cc6edf6287
commit cc6edf6287
parent 93995ecc4c 6b889651d6
72 changed files with 306 additions and 304 deletions
--- a/llama_stack/providers/tests/agents/fixtures.py
+++ b/llama_stack/providers/tests/agents/fixtures.py
@@ -11,7 +11,7 @@ import pytest_asyncio

 from llama_stack.distribution.datatypes import Api, Provider

-from llama_stack.providers.inline.meta_reference.agents import (
+from llama_stack.providers.inline.agents.meta_reference import (
    MetaReferenceAgentsImplConfig,
 )

--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -52,7 +52,7 @@ class Testeval:
        response = await eval_impl.evaluate_rows(
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                eval_candidate=ModelCandidate(
                    model="Llama3.2-3B-Instruct",
                    sampling_params=SamplingParams(),
@@ -76,13 +76,13 @@ class Testeval:
        ]

        response = await eval_impl.run_eval(
-            eval_task_def=EvalTaskDef(
+            task=EvalTaskDef(
                # NOTE: this is needed to make the router work for all app evals
                identifier="meta-reference::app_eval",
                dataset_id="test_dataset_for_eval",
                scoring_functions=scoring_functions,
            ),
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                eval_candidate=ModelCandidate(
                    model="Llama3.2-3B-Instruct",
                    sampling_params=SamplingParams(),
@@ -90,9 +90,13 @@ class Testeval:
            ),
        )
        assert response.job_id == "0"
-        job_status = await eval_impl.job_status(response.job_id)
+        job_status = await eval_impl.job_status(
+            response.job_id, "meta-reference::app_eval"
+        )
        assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(response.job_id)
+        eval_response = await eval_impl.job_result(
+            response.job_id, "meta-reference::app_eval"
+        )

        assert eval_response is not None
        assert len(eval_response.generations) == 5
--- a/llama_stack/providers/tests/inference/fixtures.py
+++ b/llama_stack/providers/tests/inference/fixtures.py
@@ -10,7 +10,7 @@ import pytest
 import pytest_asyncio

 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.inference import (
+from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceInferenceConfig,
 )

--- a/llama_stack/providers/tests/memory/fixtures.py
+++ b/llama_stack/providers/tests/memory/fixtures.py
@@ -11,7 +11,7 @@ import pytest
 import pytest_asyncio

 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.memory import FaissImplConfig
+from llama_stack.providers.inline.memory.faiss import FaissImplConfig
 from llama_stack.providers.remote.memory.pgvector import PGVectorConfig
 from llama_stack.providers.remote.memory.weaviate import WeaviateConfig

--- a/llama_stack/providers/tests/safety/fixtures.py
+++ b/llama_stack/providers/tests/safety/fixtures.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio

 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.safety import (
+from llama_stack.providers.inline.safety.meta_reference import (
    LlamaGuardShieldConfig,
    SafetyConfig,
 )
--- a/llama_stack/providers/tests/scoring/test_scoring.py
+++ b/llama_stack/providers/tests/scoring/test_scoring.py
@@ -44,10 +44,10 @@ class TestScoring:
        )
        assert len(rows.rows) == 3

-        scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
-            "meta-reference::equality",
-        ]
+        scoring_functions = {
+            "meta-reference::llm_as_judge_8b_correctness": None,
+            "meta-reference::equality": None,
+        }
        response = await scoring_impl.score(
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
@@ -83,7 +83,7 @@ class TestScoring:
        )
        assert len(rows.rows) == 3

-        params = {
+        scoring_functions = {
            "meta-reference::llm_as_judge_8b_correctness": LLMAsJudgeScoringFnParams(
                judge_model="Llama3.1-405B-Instruct",
                prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
@@ -91,13 +91,9 @@ class TestScoring:
            )
        }

-        scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
-        ]
        response = await scoring_impl.score(
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
-            scoring_params=params,
        )
        assert len(response.results) == len(scoring_functions)
        for x in scoring_functions:
@@ -108,7 +104,6 @@ class TestScoring:
        response = await scoring_impl.score_batch(
            dataset_id="test_dataset",
            scoring_functions=scoring_functions,
-            scoring_params=params,
        )
        assert len(response.results) == len(scoring_functions)
        for x in scoring_functions: