separate benchmark / app eval

2025-12-17 11:12:36 +00:00 · 2024-11-05 16:54:31 -08:00 · 2024-11-05 16:54:31 -08:00 · 4a64f98c82
commit 4a64f98c82
parent 979cd4cd44
2 changed files with 10 additions and 18 deletions
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -38,47 +38,39 @@ EvalCandidate = Annotated[
@json_schema_type
 class BenchmarkEvalTaskConfig(BaseModel):
    type: Literal["benchmark"] = "benchmark"
    eval_candidate: EvalCandidate  # type: ignore
@json_schema_type
 class AppEvalTaskConfig(BaseModel):
    type: Literal["app"] = "app"
    eval_candidate: EvalCandidate  # type: ignore
    scoring_functions_params: Dict[str, ScoringFnParams] = Field(  # type: ignore
        description="Map between scoring function id and parameters",
        default_factory=dict,
    )
-    # we could optinally add any GenEval specific dataset config here
+    # we could optionally add any specific dataset config here
 EvalTaskConfig = Annotated[
    Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
 ]
@json_schema_type
 class EvaluateResponse(BaseModel):
    generations: List[Dict[str, Any]]
    # each key in the dict is a scoring function name
    scores: Dict[str, ScoringResult]
 class Eval(Protocol):
-    @webmethod(route="/eval/evaluate_batch", method="POST")
+    @webmethod(route="/eval/run_benchmark", method="POST")
-    async def evaluate_task(
+    async def run_benchmark(
        self,
-        eval_task_id: str,
+        benchmark_id: str,
-        eval_task_config: EvalTaskConfig,  # type: ignore
+        eval_task_config: BenchmarkEvalTaskConfig,  # type: ignore
    ) -> Job: ...
-    @webmethod(route="/eval/evaluate_batch", method="POST")
+    @webmethod(route="/eval/run_app_eval", method="POST")
-    async def evaluate_batch(
+    async def run_app_eval(
        self,
-        eval_task_def: Union[str, EvalTaskDef],  # type: ignore
+        eval_task_def: EvalTaskDef,  # type: ignore
-        eval_task_config: EvalTaskConfig,  # type: ignore
+        eval_task_config: AppEvalTaskConfig,  # type: ignore
    ) -> Job: ...
    @webmethod(route="/eval/evaluate", method="POST")
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ b/llama_stack/apis/eval_tasks/eval_tasks.py
@ -17,7 +17,7 @@ class EvalTaskDef(BaseModel):
    scoring_functions: List[str]
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
-        description="Metadata for this evaluation task (e.g. from GECO)",
+        description="Metadata for this evaluation task",
    )