[Evals API][3/n] scoring_functions / scoring meta-reference implementations (#296)

* wip * dataset validation * test_scoring * cleanup * clean up test * comments * error checking * dataset client * test client: * datasetio client * clean up * basic scoring function works * scorer wip * equality scorer * score batch impl * score batch * update scoring test * refactor * validate scorer input * address comments * add all rows scores to ScoringResult * bugfix * scoring function def rename
2025-06-28 02:53:30 +00:00 · 2024-10-24 14:52:30 -07:00 · 2024-10-24 14:52:30 -07:00 · cb84034567
commit cb84034567
parent e70420a06e
28 changed files with 904 additions and 51 deletions
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -4,20 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Protocol,
-    runtime_checkable,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Protocol, runtime_checkable

 from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated

 from llama_stack.apis.common.type_system import ParamType

@ -33,45 +23,37 @@ class Parameter(BaseModel):
 # with standard metrics so they can be rolled up?


+class LLMAsJudgeContext(BaseModel):
+    judge_model: str
+    prompt_template: Optional[str] = None
+
+
@json_schema_type
-class CommonDef(BaseModel):
-    name: str
+class ScoringFunctionDef(BaseModel):
+    identifier: str
    description: Optional[str] = None
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Any additional metadata for this definition",
    )
-    # Hack: same with memory_banks for union defs
-    provider_id: str = ""
-
-
-@json_schema_type
-class DeterministicFunctionDef(CommonDef):
-    type: Literal["deterministic"] = "deterministic"
    parameters: List[Parameter] = Field(
        description="List of parameters for the deterministic function",
+        default_factory=list,
    )
    return_type: ParamType = Field(
        description="The return type of the deterministic function",
    )
+    context: Optional[LLMAsJudgeContext] = None
    # We can optionally add information here to support packaging of code, etc.


@json_schema_type
-class LLMJudgeFunctionDef(CommonDef):
-    type: Literal["judge"] = "judge"
-    model: str = Field(
-        description="The LLM model to use for the judge function",
+class ScoringFunctionDefWithProvider(ScoringFunctionDef):
+    provider_id: str = Field(
+        description="ID of the provider which serves this dataset",
    )


-ScoringFunctionDef = Annotated[
-    Union[DeterministicFunctionDef, LLMJudgeFunctionDef], Field(discriminator="type")
-]
-
-ScoringFunctionDefWithProvider = ScoringFunctionDef
-
-
@runtime_checkable
 class ScoringFunctions(Protocol):
    @webmethod(route="/scoring_functions/list", method="GET")
@ -84,5 +66,5 @@ class ScoringFunctions(Protocol):

    @webmethod(route="/scoring_functions/register", method="POST")
    async def register_scoring_function(
-        self, function: ScoringFunctionDefWithProvider
+        self, function_def: ScoringFunctionDefWithProvider
    ) -> None: ...