fix!: update eval-tasks -> benchmarks (#1032)

# What does this PR do? - Update `/eval-tasks` to `/benchmarks` - ⚠️ Remove the differentiation between `app` vs. `benchmark` eval task configs. Now we only have `BenchmarkConfig`. The overloaded `benchmark` is confusing and does not add any value. Backward compatibility is preserved because the "type" field is not used anywhere. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan - This change is backward compatible - Run the notebook tests with ``` pytest -v -s --nbval-lax ./docs/getting_started.ipynb pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb ``` <img width="846" alt="image" src="https://github.com/user-attachments/assets/d2fc06a7-593a-444f-bc1f-10ab9b0c843d" /> [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --------- Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Signed-off-by: Sébastien Han <seb@redhat.com> Signed-off-by: reidliu <reid201711@gmail.com> Co-authored-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Co-authored-by: Ben Browning <ben324@gmail.com> Co-authored-by: Sébastien Han <seb@redhat.com> Co-authored-by: Reid <61492567+reidliu41@users.noreply.github.com> Co-authored-by: reidliu <reid201711@gmail.com> Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
2025-10-08 04:54:38 +00:00 · 2025-02-13 16:40:58 -08:00 · 2025-02-13 16:40:58 -08:00 · 8b655e3cd2
commit 8b655e3cd2
parent 225dd38e5c
60 changed files with 2622 additions and 1910 deletions
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@ -10,9 +10,9 @@ from urllib.parse import urlparse
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

+from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasets import Dataset
 from llama_stack.apis.datatypes import Api
-from llama_stack.apis.eval_tasks import EvalTask
 from llama_stack.apis.models import Model
 from llama_stack.apis.scoring_functions import ScoringFn
 from llama_stack.apis.shields import Shield
@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...


-class EvalTasksProtocolPrivate(Protocol):
-    async def register_eval_task(self, eval_task: EvalTask) -> None: ...
+class BenchmarksProtocolPrivate(Protocol):
+    async def register_benchmark(self, benchmark: Benchmark) -> None: ...


 class ToolsProtocolPrivate(Protocol):
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
 from tqdm import tqdm

 from llama_stack.apis.agents import Agents, StepType
+from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval_tasks import EvalTask
 from llama_stack.apis.inference import Inference, UserMessage
 from llama_stack.apis.scoring import Scoring
 from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
 from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
    MEMORY_QUERY_TOOL,
 )
@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 from llama_stack.providers.utils.kvstore import kvstore_impl

 from .....apis.common.job_types import Job
-from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
 from .config import MetaReferenceEvalConfig

-EVAL_TASKS_PREFIX = "eval_tasks:"
+EVAL_TASKS_PREFIX = "benchmarks:"


 class MetaReferenceEvalImpl(
    Eval,
-    EvalTasksProtocolPrivate,
+    BenchmarksProtocolPrivate,
 ):
    def __init__(
        self,
@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
        # TODO: assume sync job, will need jobs API for async scheduling
        self.jobs = {}

-        self.eval_tasks = {}
+        self.benchmarks = {}

    async def initialize(self) -> None:
        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing eval_tasks from kvstore
+        # Load existing benchmarks from kvstore
        start_key = EVAL_TASKS_PREFIX
        end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_eval_tasks = await self.kvstore.range(start_key, end_key)
+        stored_benchmarks = await self.kvstore.range(start_key, end_key)

-        for eval_task in stored_eval_tasks:
-            eval_task = EvalTask.model_validate_json(eval_task)
-            self.eval_tasks[eval_task.identifier] = eval_task
+        for benchmark in stored_benchmarks:
+            benchmark = Benchmark.model_validate_json(benchmark)
+            self.benchmarks[benchmark.identifier] = benchmark

    async def shutdown(self) -> None: ...

-    async def register_eval_task(self, task_def: EvalTask) -> None:
+    async def register_benchmark(self, task_def: Benchmark) -> None:
        # Store in kvstore
        key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
        await self.kvstore.set(
            key=key,
            value=task_def.model_dump_json(),
        )
-        self.eval_tasks[task_def.identifier] = task_def
+        self.benchmarks[task_def.identifier] = task_def

    async def run_eval(
        self,
-        task_id: str,
-        task_config: EvalTaskConfig,
+        benchmark_id: str,
+        task_config: BenchmarkConfig,
    ) -> Job:
-        task_def = self.eval_tasks[task_id]
+        task_def = self.benchmarks[benchmark_id]
        dataset_id = task_def.dataset_id
        candidate = task_config.eval_candidate
        scoring_functions = task_def.scoring_functions
@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
            rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
        )
        res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
            input_rows=all_rows.rows,
            scoring_functions=scoring_functions,
            task_config=task_config,
@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
        return Job(job_id=job_id)

    async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
    ) -> List[Dict[str, Any]]:
        candidate = task_config.eval_candidate
        create_response = await self.agents_api.create_agent(candidate.config)
@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
        return generations

    async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
    ) -> List[Dict[str, Any]]:
        candidate = task_config.eval_candidate
        assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(

    async def evaluate_rows(
        self,
-        task_id: str,
+        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        candidate = task_config.eval_candidate
        if candidate.type == "agent":
@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
        # scoring with generated_answer
        score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]

-        if task_config.type == "app" and task_config.scoring_params is not None:
+        if task_config.scoring_params is not None:
            scoring_functions_dict = {
                scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
                for scoring_fn_id in scoring_functions
@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(

        return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
        if job_id in self.jobs:
            return JobStatus.completed

        return None

-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        raise NotImplementedError("Job cancel is not implemented yet")

-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        status = await self.job_status(benchmark_id, job_id)
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job is not completed, Status: {status.value}")

        return self.jobs[job_id]
+
+    async def DEPRECATED_run_eval(
+        self,
+        task_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job:
+        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+    async def DEPRECATED_evaluate_rows(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        return await self.evaluate_rows(
+            benchmark_id=task_id,
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            task_config=task_config,
+        )
+
+    async def DEPRECATED_job_status(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> Optional[JobStatus]:
+        return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_cancel(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> None:
+        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_result(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> EvaluateResponse:
+        return await self.job_result(benchmark_id=task_id, job_id=job_id)
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@ -10,8 +10,8 @@ import pytest
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
 from llama_stack.apis.eval.eval import (
-    AppEvalTaskConfig,
-    BenchmarkEvalTaskConfig,
+    AppBenchmarkConfig,
+    BenchmarkBenchmarkConfig,
    ModelCandidate,
 )
 from llama_stack.apis.inference import SamplingParams
@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT

 class Testeval:
    @pytest.mark.asyncio
-    async def test_eval_tasks_list(self, eval_stack):
+    async def test_benchmarks_list(self, eval_stack):
        # NOTE: this needs you to ensure that you are starting from a clean state
        # but so far we don't have an unregister API unfortunately, so be careful
-        eval_tasks_impl = eval_stack[Api.eval_tasks]
-        response = await eval_tasks_impl.list_eval_tasks()
+        benchmarks_impl = eval_stack[Api.benchmarks]
+        response = await benchmarks_impl.list_benchmarks()
        assert isinstance(response, list)

    @pytest.mark.asyncio
    async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
            eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
            eval_stack[Api.datasetio],
            eval_stack[Api.datasets],
            eval_stack[Api.models],
@ -59,17 +59,17 @@ class Testeval:
        scoring_functions = [
            "basic::equality",
        ]
-        task_id = "meta-reference::app_eval"
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id=task_id,
+        benchmark_id = "meta-reference::app_eval"
+        await benchmarks_impl.register_benchmark(
+            benchmark_id=benchmark_id,
            dataset_id="test_dataset_for_eval",
            scoring_functions=scoring_functions,
        )
        response = await eval_impl.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
-            task_config=AppEvalTaskConfig(
+            task_config=AppBenchmarkConfig(
                eval_candidate=ModelCandidate(
                    model=inference_model,
                    sampling_params=SamplingParams(),
@ -92,9 +92,9 @@ class Testeval:

    @pytest.mark.asyncio
    async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
            eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
            eval_stack[Api.datasets],
            eval_stack[Api.models],
        )
@ -105,15 +105,15 @@ class Testeval:
            "basic::subset_of",
        ]

-        task_id = "meta-reference::app_eval-2"
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id=task_id,
+        benchmark_id = "meta-reference::app_eval-2"
+        await benchmarks_impl.register_benchmark(
+            benchmark_id=benchmark_id,
            dataset_id="test_dataset_for_eval",
            scoring_functions=scoring_functions,
        )
        response = await eval_impl.run_eval(
-            task_id=task_id,
-            task_config=AppEvalTaskConfig(
+            benchmark_id=benchmark_id,
+            task_config=AppBenchmarkConfig(
                eval_candidate=ModelCandidate(
                    model=inference_model,
                    sampling_params=SamplingParams(),
@ -121,9 +121,9 @@ class Testeval:
            ),
        )
        assert response.job_id == "0"
-        job_status = await eval_impl.job_status(task_id, response.job_id)
+        job_status = await eval_impl.job_status(benchmark_id, response.job_id)
        assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(task_id, response.job_id)
+        eval_response = await eval_impl.job_result(benchmark_id, response.job_id)

        assert eval_response is not None
        assert len(eval_response.generations) == 5
@ -131,9 +131,9 @@ class Testeval:

    @pytest.mark.asyncio
    async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
            eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
            eval_stack[Api.datasets],
            eval_stack[Api.models],
        )
@ -159,20 +159,20 @@ class Testeval:
        )

        # register eval task
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id="meta-reference-mmlu",
+        await benchmarks_impl.register_benchmark(
+            benchmark_id="meta-reference-mmlu",
            dataset_id="mmlu",
            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
        )

        # list benchmarks
-        response = await eval_tasks_impl.list_eval_tasks()
+        response = await benchmarks_impl.list_benchmarks()
        assert len(response) > 0

        benchmark_id = "meta-reference-mmlu"
        response = await eval_impl.run_eval(
-            task_id=benchmark_id,
-            task_config=BenchmarkEvalTaskConfig(
+            benchmark_id=benchmark_id,
+            task_config=BenchmarkBenchmarkConfig(
                eval_candidate=ModelCandidate(
                    model=inference_model,
                    sampling_params=SamplingParams(),
--- a/llama_stack/providers/tests/resolver.py
+++ b/llama_stack/providers/tests/resolver.py
@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel

+from llama_stack.apis.benchmarks import BenchmarkInput
 from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.eval_tasks import EvalTaskInput
 from llama_stack.apis.models import ModelInput
 from llama_stack.apis.scoring_functions import ScoringFnInput
 from llama_stack.apis.shields import ShieldInput
@ -42,7 +42,7 @@ async def construct_stack_for_test(
    vector_dbs: Optional[List[VectorDBInput]] = None,
    datasets: Optional[List[DatasetInput]] = None,
    scoring_fns: Optional[List[ScoringFnInput]] = None,
-    eval_tasks: Optional[List[EvalTaskInput]] = None,
+    benchmarks: Optional[List[BenchmarkInput]] = None,
    tool_groups: Optional[List[ToolGroupInput]] = None,
 ) -> TestStack:
    sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
@ -56,7 +56,7 @@ async def construct_stack_for_test(
        vector_dbs=vector_dbs or [],
        datasets=datasets or [],
        scoring_fns=scoring_fns or [],
-        eval_tasks=eval_tasks or [],
+        benchmarks=benchmarks or [],
        tool_groups=tool_groups or [],
    )
    run_config = parse_and_maybe_upgrade_config(run_config)