[Evals API][10/n] API updates for EvalTaskDef + new test migration (#379)

* wip

* scoring fn api

* eval api

* eval task

* evaluate api update

* pre commit

* unwrap context -> config

* config field doc

* typo

* naming fix

* separate benchmark / app eval

* api name

* rename

* wip tests

* wip

* datasetio test

* delete unused

* fixture

* scoring resolve

* fix scoring register

* scoring test pass

* score batch

* scoring fix

* fix eval

* test eval works

* remove type ignore

* api refactor

* add default task_eval_id for routing

* add eval_id for jobs

* remove type ignore

* only keep 1 run_eval

* fix optional

* register task required

* register task required

* delete old tests

* delete old tests

* fixture return impl
Xi Yan 2024-11-07 21:24:12 -08:00 committed by GitHub
parent 8350f2df4c
commit 6192bf43a4
32 changed files with 916 additions and 389 deletions


@@ -4,34 +4,66 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+from enum import Enum
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    runtime_checkable,
+    Union,
+)
 
 from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
+from typing_extensions import Annotated
 
 from llama_stack.apis.common.type_system import ParamType
 
 
-@json_schema_type
-class Parameter(BaseModel):
-    name: str
-    type: ParamType
-    description: Optional[str] = None
-
-
 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
 @json_schema_type
-class LLMAsJudgeContext(BaseModel):
+class ScoringConfigType(Enum):
+    llm_as_judge = "llm_as_judge"
+    regex_parser = "regex_parser"
+
+
+@json_schema_type
+class LLMAsJudgeScoringFnParams(BaseModel):
+    type: Literal[ScoringConfigType.llm_as_judge.value] = (
+        ScoringConfigType.llm_as_judge.value
+    )
     judge_model: str
     prompt_template: Optional[str] = None
-    judge_score_regex: Optional[List[str]] = Field(
-        description="Regex to extract the score from the judge response",
-        default=None,
+    judge_score_regexes: Optional[List[str]] = Field(
+        description="Regexes to extract the answer from generated response",
+        default_factory=list,
     )
+
+
+@json_schema_type
+class RegexParserScoringFnParams(BaseModel):
+    type: Literal[ScoringConfigType.regex_parser.value] = (
+        ScoringConfigType.regex_parser.value
+    )
+    parsing_regexes: Optional[List[str]] = Field(
+        description="Regex to extract the answer from generated response",
+        default_factory=list,
+    )
+
+
+ScoringFnParams = Annotated[
+    Union[
+        LLMAsJudgeScoringFnParams,
+        RegexParserScoringFnParams,
+    ],
+    Field(discriminator="type"),
+]
 
 
 @json_schema_type
 class ScoringFnDef(BaseModel):
     identifier: str
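
The hunk above replaces the single LLMAsJudgeContext with a discriminated union of per-type params models. Below is a minimal standalone sketch of how that union behaves, assuming pydantic v2 (the TypeAdapter call, the trimmed field set, and the judge model id are illustrative, not code from this PR):

# Standalone sketch (not part of the PR): the "type" Literal doubles as the
# wire-format discriminator, so a plain dict validates into the matching class.
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field, TypeAdapter
from typing_extensions import Annotated


class LLMAsJudgeScoringFnParams(BaseModel):
    type: Literal["llm_as_judge"] = "llm_as_judge"
    judge_model: str
    prompt_template: Optional[str] = None
    judge_score_regexes: Optional[List[str]] = Field(default_factory=list)


class RegexParserScoringFnParams(BaseModel):
    type: Literal["regex_parser"] = "regex_parser"
    parsing_regexes: Optional[List[str]] = Field(default_factory=list)


ScoringFnParams = Annotated[
    Union[LLMAsJudgeScoringFnParams, RegexParserScoringFnParams],
    Field(discriminator="type"),
]

# The discriminator picks the concrete model from the "type" field at validation time.
params = TypeAdapter(ScoringFnParams).validate_python(
    {
        "type": "llm_as_judge",
        "judge_model": "Llama3.1-8B-Instruct",
        "judge_score_regexes": [r"Total rating: (\d+)"],
    }
)
assert isinstance(params, LLMAsJudgeScoringFnParams)
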
@@ -40,14 +72,13 @@ class ScoringFnDef(BaseModel):
         default_factory=dict,
         description="Any additional metadata for this definition",
     )
-    parameters: List[Parameter] = Field(
-        description="List of parameters for the deterministic function",
-        default_factory=list,
-    )
     return_type: ParamType = Field(
         description="The return type of the deterministic function",
     )
-    context: Optional[LLMAsJudgeContext] = None
+    params: Optional[ScoringFnParams] = Field(
+        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
+        default=None,
+    )
     # We can optionally add information here to support packaging of code, etc.
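
Per the new field's description, `params` carries the benchmark-eval default on the definition and can be overridden for app eval. A hedged sketch of that precedence rule, reusing the standalone models from the sketch above (the resolve_scoring_params helper, the scoring-fn id, and the model ids are hypothetical, not code from this PR):

# Hypothetical helper illustrating the documented precedence: app-eval params,
# when supplied for a scoring fn id, win over the definition's default params.
from typing import Dict, Optional


def resolve_scoring_params(
    def_params: Optional[ScoringFnParams],
    app_params: Dict[str, ScoringFnParams],
    scoring_fn_id: str,
) -> Optional[ScoringFnParams]:
    return app_params.get(scoring_fn_id, def_params)


default_params = LLMAsJudgeScoringFnParams(
    judge_model="Llama3.1-8B-Instruct",
    judge_score_regexes=[r"Total rating: (\d+)"],
)
app_override = LLMAsJudgeScoringFnParams(
    judge_model="Llama3.1-70B-Instruct",
    judge_score_regexes=[r"Score: (\d+)"],
)

assert resolve_scoring_params(default_params, {}, "llm-judge-fn") is default_params
assert (
    resolve_scoring_params(default_params, {"llm-judge-fn": app_override}, "llm-judge-fn")
    is app_override
)
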