Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-02 08:44:44 +00:00

Commit fbc3888fd7: aggregation function config
Parent: e2054d53e4
10 changed files with 189 additions and 26 deletions

@@ -31,6 +31,15 @@ from llama_stack.apis.resource import Resource, ResourceType
 class ScoringFnParamsType(Enum):
     llm_as_judge = "llm_as_judge"
     regex_parser = "regex_parser"
+    basic = "basic"
+
+
+@json_schema_type
+class AggregationFunctionType(Enum):
+    average = "average"
+    median = "median"
+    categorical_count = "categorical_count"
+    accuracy = "accuracy"
 
 
 @json_schema_type
@@ -44,6 +53,10 @@ class LLMAsJudgeScoringFnParams(BaseModel):
         description="Regexes to extract the answer from generated response",
         default_factory=list,
     )
+    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+        description="Aggregation functions to apply to the scores of each row",
+        default_factory=list,
+    )
 
 
 @json_schema_type
@@ -55,12 +68,26 @@ class RegexParserScoringFnParams(BaseModel):
         description="Regex to extract the answer from generated response",
         default_factory=list,
     )
+    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+        description="Aggregation functions to apply to the scores of each row",
+        default_factory=list,
+    )
+
+
+@json_schema_type
+class BasicScoringFnParams(BaseModel):
+    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
+    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+        description="Aggregation functions to apply to the scores of each row",
+        default_factory=list,
+    )
 
 
 ScoringFnParams = Annotated[
     Union[
         LLMAsJudgeScoringFnParams,
         RegexParserScoringFnParams,
+        BasicScoringFnParams,
     ],
     Field(discriminator="type"),
 ]

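As a quick illustration (not part of the commit), the new aggregation config is plain pydantic data, so callers construct it directly; this sketch assumes pydantic v2 and the types added above:

from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
)

# The Literal "type" discriminator defaults to "basic", so only the
# aggregation functions need to be supplied.
params = BasicScoringFnParams(
    aggregation_functions=[
        AggregationFunctionType.accuracy,
        AggregationFunctionType.median,
    ]
)
assert params.type == "basic"
print(params.model_dump())  # use .dict() on pydantic v1
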
@@ -113,7 +113,7 @@ class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
             score_results = await scoring_fn.score(
                 input_rows, scoring_fn_id, scoring_fn_params
             )
-            agg_results = await scoring_fn.aggregate(score_results)
+            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_params)
             res[scoring_fn_id] = ScoringResult(
                 score_rows=score_results,
                 aggregated_results=agg_results,

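For context, a hedged reconstruction of the loop this hunk sits in (the `scoring_fn_id_impls` registry name is an assumption, not shown in the diff): the per-function params from the request now reach both `score` and the new second argument of `aggregate`:

for scoring_fn_id in scoring_functions.keys():
    scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]  # assumed attribute name
    scoring_fn_params = scoring_functions.get(scoring_fn_id, None)
    score_results = await scoring_fn.score(
        input_rows, scoring_fn_id, scoring_fn_params
    )
    # aggregate() can now honor any configured aggregation functions
    agg_results = await scoring_fn.aggregate(score_results, scoring_fn_params)
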
@@ -4,12 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
-from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
-from llama_stack.apis.scoring import *  # noqa: F401, F403
-from llama_stack.apis.common.type_system import *  # noqa: F403
+from typing import Any, Dict, List, Optional
 
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import AggregationFunctionType, ScoringFnParams
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
+from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
+
 
 from .fn_defs.equality import equality
 
@@ -44,6 +45,15 @@ class EqualityScoringFn(BaseScoringFn):
         }
 
     async def aggregate(
-        self, scoring_results: List[ScoringResultRow]
+        self,
+        scoring_results: List[ScoringResultRow],
+        scoring_params: Optional[ScoringFnParams] = None,
     ) -> Dict[str, Any]:
-        return aggregate_accuracy(scoring_results)
+        aggregation_functions = [AggregationFunctionType.accuracy]
+        if (
+            scoring_params
+            and hasattr(scoring_params, "aggregation_functions")
+            and scoring_params.aggregation_functions
+        ):
+            aggregation_functions.extend(scoring_params.aggregation_functions)
+        return aggregate_metrics(scoring_results, aggregation_functions)

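A minimal usage sketch of the new behavior (hypothetical: it assumes EqualityScoringFn constructs with no arguments and that score rows carry a numeric "score" key). With no params only the accuracy default runs; configured functions are appended to it:

import asyncio

from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
)

rows = [{"score": 1.0}, {"score": 0.0}, {"score": 1.0}]
fn = EqualityScoringFn()  # class from the hunk above

# default: accuracy only
print(asyncio.run(fn.aggregate(rows)))
# configured: accuracy plus median, since configured functions extend the default
params = BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.median])
print(asyncio.run(fn.aggregate(rows, params)))
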
@@ -5,11 +5,16 @@
 # the root directory of this source tree.
 import re
 
+from typing import Any, Dict, List, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    ScoringFnParams,
+    ScoringFnParamsType,
+)
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
 from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
-from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
-from llama_stack.apis.scoring import *  # noqa: F401, F403
-from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy
 
 from .fn_defs.regex_parser_multiple_choice_answer import (
     regex_parser_multiple_choice_answer,
@@ -62,6 +67,15 @@ class RegexParserScoringFn(BaseScoringFn):
         }
 
     async def aggregate(
-        self, scoring_results: List[ScoringResultRow]
+        self,
+        scoring_results: List[ScoringResultRow],
+        scoring_params: Optional[ScoringFnParams] = None,
     ) -> Dict[str, Any]:
-        return aggregate_accuracy(scoring_results)
+        aggregation_functions = [AggregationFunctionType.accuracy]
+        if (
+            scoring_params
+            and hasattr(scoring_params, "aggregation_functions")
+            and scoring_params.aggregation_functions
+        ):
+            aggregation_functions.extend(scoring_params.aggregation_functions)
+        return aggregate_metrics(scoring_results, aggregation_functions)

@@ -4,11 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import Any, Dict, List, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import AggregationFunctionType, ScoringFnParams
+from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
 from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
-from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
-from llama_stack.apis.scoring import *  # noqa: F401, F403
-from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy
 
 from .fn_defs.subset_of import subset_of
 
@@ -38,6 +39,15 @@ class SubsetOfScoringFn(BaseScoringFn):
         }
 
     async def aggregate(
-        self, scoring_results: List[ScoringResultRow]
+        self,
+        scoring_results: List[ScoringResultRow],
+        scoring_params: Optional[ScoringFnParams] = None,
     ) -> Dict[str, Any]:
-        return aggregate_accuracy(scoring_results)
+        aggregation_functions = [AggregationFunctionType.accuracy]
+        if (
+            scoring_params
+            and hasattr(scoring_params, "aggregation_functions")
+            and scoring_params.aggregation_functions
+        ):
+            aggregation_functions.extend(scoring_params.aggregation_functions)
+        return aggregate_metrics(scoring_results, aggregation_functions)

@@ -120,7 +120,7 @@ class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
             score_results = await scoring_fn.score(
                 input_rows, scoring_fn_id, scoring_fn_params
             )
-            agg_results = await scoring_fn.aggregate(score_results)
+            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_params)
             res[scoring_fn_id] = ScoringResult(
                 score_rows=score_results,
                 aggregated_results=agg_results,

@@ -87,7 +87,10 @@ class LlmAsJudgeScoringFn(BaseScoringFn):
         }
 
     async def aggregate(
-        self, scoring_results: List[ScoringResultRow]
+        self,
+        scoring_results: List[ScoringResultRow],
+        scoring_params: Optional[ScoringFnParams] = None,
     ) -> Dict[str, Any]:
+        print(f"scoring_params: {scoring_params}")
         # TODO: this needs to be config based aggregation, and only useful w/ Jobs API
         return {}

@@ -7,7 +7,12 @@
 
 import pytest
 
-from llama_stack.apis.scoring_functions import *  # noqa: F403
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    LLMAsJudgeScoringFnParams,
+    RegexParserScoringFnParams,
+)
 from llama_stack.distribution.datatypes import Api
 from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
 
@@ -129,7 +134,7 @@ class TestScoring:
         assert len(rows.rows) == 3
 
         scoring_functions = {
-            "llm-as-judge::llm_as_judge_base": LLMAsJudgeScoringFnParams(
+            "llm-as-judge::base": LLMAsJudgeScoringFnParams(
                 judge_model="Llama3.1-405B-Instruct",
                 prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
                 judge_score_regexes=[r"Score: (\d+)"],

@@ -154,3 +159,59 @@ class TestScoring:
         for x in scoring_functions:
             assert x in response.results
             assert len(response.results[x].score_rows) == 5
+
+    @pytest.mark.asyncio
+    async def test_scoring_score_with_aggregation_functions(self, scoring_stack):
+        (
+            scoring_impl,
+            scoring_functions_impl,
+            datasetio_impl,
+            datasets_impl,
+            models_impl,
+        ) = (
+            scoring_stack[Api.scoring],
+            scoring_stack[Api.scoring_functions],
+            scoring_stack[Api.datasetio],
+            scoring_stack[Api.datasets],
+            scoring_stack[Api.models],
+        )
+        await register_dataset(datasets_impl)
+        rows = await datasetio_impl.get_rows_paginated(
+            dataset_id="test_dataset",
+            rows_in_page=3,
+        )
+        assert len(rows.rows) == 3
+
+        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
+        provider_id = scoring_fns_list[0].provider_id
+        scoring_functions = {}
+        aggr_fns = [
+            AggregationFunctionType.accuracy,
+            AggregationFunctionType.median,
+            AggregationFunctionType.categorical_count,
+            AggregationFunctionType.average,
+        ]
+        for x in scoring_fns_list:
+            if x.provider_id == "llm-as-judge":
+                scoring_functions[x.identifier] = LLMAsJudgeScoringFnParams(
+                    aggregation_functions=[AggregationFunctionType.categorical_count],
+                )
+            elif x.provider_id == "basic":
+                if "regex_parser" in x.identifier:
+                    scoring_functions[x.identifier] = RegexParserScoringFnParams(
+                        aggregation_functions=aggr_fns,
+                    )
+                else:
+                    scoring_functions[x.identifier] = BasicScoringFnParams(
+                        aggregation_functions=aggr_fns,
+                    )
+            else:
+                scoring_functions[x.identifier] = None
+
+        response = await scoring_impl.score(
+            input_rows=rows.rows,
+            scoring_functions=scoring_functions,
+        )
+        from rich.pretty import pprint
+
+        pprint(response)

@@ -3,9 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import statistics
 from typing import Any, Dict, List
 
-from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring import AggregationFunctionType, ScoringResultRow
 
 
 def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
@@ -26,3 +27,38 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]
         )
         / len([_ for _ in scoring_results if _["score"] is not None]),
     }
+
+
+def aggregate_categorical_count(
+    scoring_results: List[ScoringResultRow],
+) -> Dict[str, Any]:
+    scores = [str(r["score"]) for r in scoring_results]
+    unique_scores = sorted(list(set(scores)))
+    return {"categorical_count": {s: scores.count(s) for s in unique_scores}}
+
+
+def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+    scores = [r["score"] for r in scoring_results if r["score"] is not None]
+    median = statistics.median(scores) if scores else None
+    return {"median": median}
+
+
+# TODO: decide whether we want to make aggregation functions as a registerable resource
+AGGREGATION_FUNCTIONS = {
+    AggregationFunctionType.accuracy: aggregate_accuracy,
+    AggregationFunctionType.average: aggregate_average,
+    AggregationFunctionType.categorical_count: aggregate_categorical_count,
+    AggregationFunctionType.median: aggregate_median,
+}
+
+
+def aggregate_metrics(
+    scoring_results: List[ScoringResultRow], metrics: List[AggregationFunctionType]
+) -> Dict[str, Any]:
+    agg_results = {}
+    for metric in metrics:
+        if metric not in AGGREGATION_FUNCTIONS:
+            raise ValueError(f"Aggregation function {metric} not found")
+        agg_fn = AGGREGATION_FUNCTIONS[metric]
+        agg_results[metric] = agg_fn(scoring_results)
+    return agg_results

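A quick check (not in the commit) of how the new helpers compose; the sample rows are hypothetical and the accuracy sub-keys follow the repo's pre-existing aggregate_accuracy:

rows = [{"score": 1.0}, {"score": 0.0}, {"score": 1.0}, {"score": 1.0}]

result = aggregate_metrics(
    rows,
    [
        AggregationFunctionType.accuracy,
        AggregationFunctionType.median,
        AggregationFunctionType.categorical_count,
    ],
)
# result is keyed by enum member, one sub-dict per function, roughly:
#   accuracy          -> {"accuracy": 0.75, "num_correct": 3.0, "num_total": 4}
#   median            -> {"median": 1.0}
#   categorical_count -> {"categorical_count": {"0.0": 1, "1.0": 3}}
# Requesting a metric missing from AGGREGATION_FUNCTIONS raises ValueError.
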
@@ -46,7 +46,9 @@ class BaseScoringFn(ABC):
 
     @abstractmethod
     async def aggregate(
-        self, scoring_results: List[ScoringResultRow]
+        self,
+        scoring_results: List[ScoringResultRow],
+        scoring_params: Optional[ScoringFnParams] = None,
     ) -> Dict[str, Any]:
         raise NotImplementedError()

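For provider authors, a hedged sketch of a third-party scoring function written against the updated interface; the class name, row keys, and score_row signature are assumptions rather than part of this diff:

from typing import Any, Dict, List, Optional

from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import AggregationFunctionType, ScoringFnParams
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn


class ExactMatchScoringFn(BaseScoringFn):  # hypothetical subclass
    async def score_row(
        self,
        input_row: Dict[str, Any],
        scoring_fn_identifier: Optional[str] = None,
        scoring_params: Optional[ScoringFnParams] = None,
    ) -> ScoringResultRow:
        # row keys assumed to match the built-in scoring functions
        matched = input_row.get("generated_answer") == input_row.get("expected_answer")
        return {"score": 1.0 if matched else 0.0}

    async def aggregate(
        self,
        scoring_results: List[ScoringResultRow],
        scoring_params: Optional[ScoringFnParams] = None,
    ) -> Dict[str, Any]:
        # same default-plus-configured pattern as the built-in fns above
        aggregation_functions = [AggregationFunctionType.accuracy]
        if scoring_params and getattr(scoring_params, "aggregation_functions", None):
            aggregation_functions.extend(scoring_params.aggregation_functions)
        return aggregate_metrics(scoring_results, aggregation_functions)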