[/scoring] add ability to define aggregation functions for scoring functions & refactors (#597)

# What does this PR do?

- Adds the ability to define aggregation functions for scoring functions via `ScoringFnParams` (see the usage sketch below)
- Aggregation is supported by the `basic` / `regex_parser` / `llm_as_judge` scoring functions
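
As a rough usage sketch, callers now attach aggregation functions to the params they pass into the scoring API. This is a minimal illustration distilled from the tests in this PR; it assumes `scoring_impl` and `rows` are set up the way the updated `test_scoring.py` below sets them up, and that it runs inside an async context:

```
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    LLMAsJudgeScoringFnParams,
)

response = await scoring_impl.score(
    input_rows=rows.rows,
    scoring_functions={
        # basic scoring fn: reduce row scores to accuracy and median
        "basic::equality": BasicScoringFnParams(
            aggregation_functions=[
                AggregationFunctionType.accuracy,
                AggregationFunctionType.median,
            ],
        ),
        # llm-as-judge: bucket judge ratings with a categorical count
        "llm-as-judge::base": LLMAsJudgeScoringFnParams(
            judge_model="Llama3.1-405B-Instruct",
            prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
            judge_score_regexes=[r"Score: (\d+)"],
            aggregation_functions=[AggregationFunctionType.categorical_count],
        ),
    },
)
# per-function aggregates land in response.results[<fn_id>].aggregated_results
```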


## Test Plan

```
pytest -v -s -m basic_scoring_together_inference scoring/test_scoring.py
```
<img width="855" alt="image"
src="https://github.com/user-attachments/assets/12db8e6e-2ad4-462e-b9b9-70ba6c050a6c">


```
pytest -v -s -m llm_as_judge_scoring_together_inference scoring/test_scoring.py
```
<img width="858" alt="image"
src="https://github.com/user-attachments/assets/bf806676-6f5e-456d-be9f-f81a26d1df19">



**Example Response** (`basic`)
<img width="863" alt="image"
src="https://github.com/user-attachments/assets/0e57a49c-8386-45cc-8fa9-3e61aaa9a3be">

**Example Response** (`llm-as-judge`)
<img width="854" alt="image"
src="https://github.com/user-attachments/assets/38065bc2-b724-47ed-9535-79b6099c4362">


## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
Xi Yan 2024-12-11 10:03:42 -08:00 committed by GitHub
parent e128f2547a
commit a4bcfb8bba
16 changed files with 323 additions and 55 deletions

View file

@@ -4926,6 +4926,15 @@
          "config"
        ]
      },
      "AggregationFunctionType": {
        "type": "string",
        "enum": [
          "average",
          "median",
          "categorical_count",
          "accuracy"
        ]
      },
      "AppEvalTaskConfig": {
        "type": "object",
        "properties": {
@@ -4953,6 +4962,9 @@
            },
            {
              "$ref": "#/components/schemas/RegexParserScoringFnParams"
            },
            {
              "$ref": "#/components/schemas/BasicScoringFnParams"
            }
          ]
        }
@@ -4968,6 +4980,26 @@
          "scoring_params"
        ]
      },
      "BasicScoringFnParams": {
        "type": "object",
        "properties": {
          "type": {
            "type": "string",
            "const": "basic",
            "default": "basic"
          },
          "aggregation_functions": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/AggregationFunctionType"
            }
          }
        },
        "additionalProperties": false,
        "required": [
          "type"
        ]
      },
      "BenchmarkEvalTaskConfig": {
        "type": "object",
        "properties": {
@@ -5015,6 +5047,12 @@
            "items": {
              "type": "string"
            }
          },
          "aggregation_functions": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/AggregationFunctionType"
            }
          }
        },
        "additionalProperties": false,
@@ -5061,6 +5099,12 @@
            "items": {
              "type": "string"
            }
          },
          "aggregation_functions": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/AggregationFunctionType"
            }
          }
        },
        "additionalProperties": false,
@@ -6014,6 +6058,9 @@
            },
            {
              "$ref": "#/components/schemas/RegexParserScoringFnParams"
            },
            {
              "$ref": "#/components/schemas/BasicScoringFnParams"
            }
          ]
        }
@@ -7771,6 +7818,9 @@
            },
            {
              "$ref": "#/components/schemas/RegexParserScoringFnParams"
            },
            {
              "$ref": "#/components/schemas/BasicScoringFnParams"
            }
          ]
        }
@@ -7998,6 +8048,9 @@
            },
            {
              "$ref": "#/components/schemas/RegexParserScoringFnParams"
            },
            {
              "$ref": "#/components/schemas/BasicScoringFnParams"
            }
          ]
        },
@@ -8046,6 +8099,9 @@
            },
            {
              "$ref": "#/components/schemas/RegexParserScoringFnParams"
            },
            {
              "$ref": "#/components/schemas/BasicScoringFnParams"
            }
          ]
        },
@@ -8491,6 +8547,10 @@
    {
      "name": "Agents"
    },
    {
      "name": "AggregationFunctionType",
      "description": "<SchemaDefinition schemaRef=\"#/components/schemas/AggregationFunctionType\" />"
    },
    {
      "name": "AppEvalTaskConfig",
      "description": "<SchemaDefinition schemaRef=\"#/components/schemas/AppEvalTaskConfig\" />"
@@ -8503,6 +8563,10 @@
      "name": "Attachment",
      "description": "<SchemaDefinition schemaRef=\"#/components/schemas/Attachment\" />"
    },
    {
      "name": "BasicScoringFnParams",
      "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BasicScoringFnParams\" />"
    },
    {
      "name": "BatchChatCompletionRequest",
      "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BatchChatCompletionRequest\" />"
@@ -9146,9 +9210,11 @@
        "AgentTurnResponseStreamChunk",
        "AgentTurnResponseTurnCompletePayload",
        "AgentTurnResponseTurnStartPayload",
        "AggregationFunctionType",
        "AppEvalTaskConfig",
        "AppendRowsRequest",
        "Attachment",
        "BasicScoringFnParams",
        "BatchChatCompletionRequest",
        "BatchChatCompletionResponse",
        "BatchCompletionRequest",

View file

@@ -216,6 +216,13 @@ components:
      - event_type
      - turn_id
      type: object
    AggregationFunctionType:
      enum:
      - average
      - median
      - categorical_count
      - accuracy
      type: string
    AppEvalTaskConfig:
      additionalProperties: false
      properties:
@@ -230,6 +237,7 @@ components:
            oneOf:
            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
            - $ref: '#/components/schemas/RegexParserScoringFnParams'
            - $ref: '#/components/schemas/BasicScoringFnParams'
          type: object
        type:
          const: app
@@ -280,6 +288,20 @@ components:
      - content
      - mime_type
      type: object
    BasicScoringFnParams:
      additionalProperties: false
      properties:
        aggregation_functions:
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          type: array
        type:
          const: basic
          default: basic
          type: string
      required:
      - type
      type: object
    BatchChatCompletionRequest:
      additionalProperties: false
      properties:
@@ -1280,6 +1302,10 @@ components:
    LLMAsJudgeScoringFnParams:
      additionalProperties: false
      properties:
        aggregation_functions:
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          type: array
        judge_model:
          type: string
        judge_score_regexes:
@@ -1984,6 +2010,10 @@ components:
    RegexParserScoringFnParams:
      additionalProperties: false
      properties:
        aggregation_functions:
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          type: array
        parsing_regexes:
          items:
            type: string
@@ -2195,6 +2225,7 @@ components:
          oneOf:
          - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
          - $ref: '#/components/schemas/RegexParserScoringFnParams'
          - $ref: '#/components/schemas/BasicScoringFnParams'
        provider_id:
          type: string
        provider_scoring_fn_id:
@@ -2515,6 +2546,7 @@ components:
          - oneOf:
            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
            - $ref: '#/components/schemas/RegexParserScoringFnParams'
            - $ref: '#/components/schemas/BasicScoringFnParams'
          - type: 'null'
      type: object
      required:
@@ -2555,6 +2587,7 @@ components:
          - oneOf:
            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
            - $ref: '#/components/schemas/RegexParserScoringFnParams'
            - $ref: '#/components/schemas/BasicScoringFnParams'
          - type: 'null'
      type: object
      required:
@@ -2592,6 +2625,7 @@ components:
          oneOf:
          - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
          - $ref: '#/components/schemas/RegexParserScoringFnParams'
          - $ref: '#/components/schemas/BasicScoringFnParams'
        provider_id:
          type: string
        provider_resource_id:
@@ -5161,6 +5195,9 @@ tags:
    />
  name: AgentTurnResponseTurnStartPayload
- name: Agents
- description: <SchemaDefinition schemaRef="#/components/schemas/AggregationFunctionType"
    />
  name: AggregationFunctionType
- description: <SchemaDefinition schemaRef="#/components/schemas/AppEvalTaskConfig"
    />
  name: AppEvalTaskConfig
@@ -5169,6 +5206,9 @@ tags:
  name: AppendRowsRequest
- description: <SchemaDefinition schemaRef="#/components/schemas/Attachment" />
  name: Attachment
- description: <SchemaDefinition schemaRef="#/components/schemas/BasicScoringFnParams"
    />
  name: BasicScoringFnParams
- description: <SchemaDefinition schemaRef="#/components/schemas/BatchChatCompletionRequest"
    />
  name: BatchChatCompletionRequest
@@ -5636,9 +5676,11 @@ x-tagGroups:
  - AgentTurnResponseStreamChunk
  - AgentTurnResponseTurnCompletePayload
  - AgentTurnResponseTurnStartPayload
  - AggregationFunctionType
  - AppEvalTaskConfig
  - AppendRowsRequest
  - Attachment
  - BasicScoringFnParams
  - BatchChatCompletionRequest
  - BatchChatCompletionResponse
  - BatchCompletionRequest

View file

@@ -31,6 +31,15 @@ from llama_stack.apis.resource import Resource, ResourceType
class ScoringFnParamsType(Enum):
    llm_as_judge = "llm_as_judge"
    regex_parser = "regex_parser"
    basic = "basic"


@json_schema_type
class AggregationFunctionType(Enum):
    average = "average"
    median = "median"
    categorical_count = "categorical_count"
    accuracy = "accuracy"


@json_schema_type
@@ -44,6 +53,10 @@ class LLMAsJudgeScoringFnParams(BaseModel):
        description="Regexes to extract the answer from generated response",
        default_factory=list,
    )
    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
        description="Aggregation functions to apply to the scores of each row",
        default_factory=list,
    )


@json_schema_type
@@ -55,12 +68,26 @@ class RegexParserScoringFnParams(BaseModel):
        description="Regex to extract the answer from generated response",
        default_factory=list,
    )
    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
        description="Aggregation functions to apply to the scores of each row",
        default_factory=list,
    )


@json_schema_type
class BasicScoringFnParams(BaseModel):
    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
        description="Aggregation functions to apply to the scores of each row",
        default_factory=list,
    )


ScoringFnParams = Annotated[
    Union[
        LLMAsJudgeScoringFnParams,
        RegexParserScoringFnParams,
        BasicScoringFnParams,
    ],
    Field(discriminator="type"),
]
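
Because `ScoringFnParams` is discriminated on `type`, serialized params round-trip to the right concrete class. A minimal sketch (assuming pydantic v2, whose standard `TypeAdapter` handles annotated unions):

```
from pydantic import TypeAdapter

from llama_stack.apis.scoring_functions import ScoringFnParams

# "type" selects the variant: "basic" -> BasicScoringFnParams
params = TypeAdapter(ScoringFnParams).validate_python(
    {"type": "basic", "aggregation_functions": ["accuracy", "median"]}
)
print(type(params).__name__)  # BasicScoringFnParams
print(params.aggregation_functions)  # enum members: accuracy, median
```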

View file

@@ -113,7 +113,9 @@ class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
            score_results = await scoring_fn.score(
                input_rows, scoring_fn_id, scoring_fn_params
            )
            agg_results = await scoring_fn.aggregate(score_results)
            agg_results = await scoring_fn.aggregate(
                score_results, scoring_fn_id, scoring_fn_params
            )
            res[scoring_fn_id] = ScoringResult(
                score_rows=score_results,
                aggregated_results=agg_results,

View file

@@ -4,12 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.apis.scoring import *  # noqa: F401, F403
from llama_stack.apis.common.type_system import *  # noqa: F403
from typing import Any, Dict, Optional

from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn

from .fn_defs.equality import equality
@@ -42,8 +42,3 @@ class EqualityScoringFn(BaseScoringFn):
        return {
            "score": score,
        }

    async def aggregate(
        self, scoring_results: List[ScoringResultRow]
    ) -> Dict[str, Any]:
        return aggregate_accuracy(scoring_results)

View file

@@ -5,14 +5,20 @@
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)


equality = ScoringFn(
    identifier="basic::equality",
    description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
    params=None,
    provider_id="basic",
    provider_resource_id="equality",
    return_type=NumberType(),
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.accuracy]
    ),
)

View file

@@ -4,9 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.apis.scoring import *  # noqa: F401, F403
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    RegexParserScoringFnParams,
    ScoringFn,
)


MULTILINGUAL_ANSWER_REGEXES = [
    r"Answer\s*:",
@@ -67,5 +70,6 @@ regex_parser_multiple_choice_answer = ScoringFn(
            MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(x)
            for x in MULTILINGUAL_ANSWER_REGEXES
        ],
        aggregation_functions=[AggregationFunctionType.accuracy],
    ),
)

View file

@@ -5,7 +5,11 @@
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    ScoringFn,
)


subset_of = ScoringFn(
@@ -14,4 +18,7 @@ subset_of = ScoringFn(
    return_type=NumberType(),
    provider_id="basic",
    provider_resource_id="subset-of",
    params=BasicScoringFnParams(
        aggregation_functions=[AggregationFunctionType.accuracy]
    ),
)

View file

@@ -5,11 +5,11 @@
# the root directory of this source tree.
import re
from typing import Any, Dict, Optional

from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.apis.scoring import *  # noqa: F401, F403
from llama_stack.apis.common.type_system import *  # noqa: F403
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

from .fn_defs.regex_parser_multiple_choice_answer import (
    regex_parser_multiple_choice_answer,
@@ -60,8 +60,3 @@ class RegexParserScoringFn(BaseScoringFn):
        return {
            "score": score,
        }

    async def aggregate(
        self, scoring_results: List[ScoringResultRow]
    ) -> Dict[str, Any]:
        return aggregate_accuracy(scoring_results)

View file

@@ -4,11 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, Optional

from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.apis.scoring import *  # noqa: F401, F403
from llama_stack.apis.common.type_system import *  # noqa: F403
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_accuracy

from .fn_defs.subset_of import subset_of
@@ -36,8 +36,3 @@ class SubsetOfScoringFn(BaseScoringFn):
        return {
            "score": score,
        }

    async def aggregate(
        self, scoring_results: List[ScoringResultRow]
    ) -> Dict[str, Any]:
        return aggregate_accuracy(scoring_results)

View file

@@ -147,7 +147,7 @@ class BraintrustScoringImpl(
                await self.score_row(input_row, scoring_fn_id)
                for input_row in input_rows
            ]
            aggregation_functions = [AggregationFunctionType.average]
            agg_results = aggregate_average(score_results)
            res[scoring_fn_id] = ScoringResult(
                score_rows=score_results,
View file

@@ -120,7 +120,9 @@ class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
            score_results = await scoring_fn.score(
                input_rows, scoring_fn_id, scoring_fn_params
            )
            agg_results = await scoring_fn.aggregate(score_results)
            agg_results = await scoring_fn.aggregate(
                score_results, scoring_fn_id, scoring_fn_params
            )
            res[scoring_fn_id] = ScoringResult(
                score_rows=score_results,
                aggregated_results=agg_results,

View file

@@ -3,13 +3,16 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
from typing import Any, Dict, Optional

from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import BaseScoringFn
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.apis.scoring import *  # noqa: F401, F403
from llama_stack.apis.common.type_system import *  # noqa: F403
import re

from .fn_defs.llm_as_judge_405b_simpleqa import llm_as_judge_405b_simpleqa
@@ -85,9 +88,3 @@ class LlmAsJudgeScoringFn(BaseScoringFn):
            "score": judge_rating,
            "judge_feedback": content,
        }

    async def aggregate(
        self, scoring_results: List[ScoringResultRow]
    ) -> Dict[str, Any]:
        # TODO: this needs to be config based aggregation, and only useful w/ Jobs API
        return {}

View file

@@ -7,7 +7,12 @@
import pytest

from llama_stack.apis.scoring_functions import *  # noqa: F403
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    BasicScoringFnParams,
    LLMAsJudgeScoringFnParams,
    RegexParserScoringFnParams,
)
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
@@ -18,6 +23,11 @@ from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
#  -v -s --tb=short --disable-warnings


@pytest.fixture
def sample_judge_prompt_template():
    return "Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9."


class TestScoring:
    @pytest.mark.asyncio
    async def test_scoring_functions_list(self, scoring_stack):
@@ -92,7 +102,9 @@ class TestScoring:
            assert len(response.results[x].score_rows) == 5

    @pytest.mark.asyncio
    async def test_scoring_score_with_params(self, scoring_stack):
    async def test_scoring_score_with_params_llm_as_judge(
        self, scoring_stack, sample_judge_prompt_template
    ):
        (
            scoring_impl,
            scoring_functions_impl,
@@ -129,10 +141,11 @@
        assert len(rows.rows) == 3

        scoring_functions = {
            "llm-as-judge::llm_as_judge_base": LLMAsJudgeScoringFnParams(
            "llm-as-judge::base": LLMAsJudgeScoringFnParams(
                judge_model="Llama3.1-405B-Instruct",
                prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
                prompt_template=sample_judge_prompt_template,
                judge_score_regexes=[r"Score: (\d+)"],
                aggregation_functions=[AggregationFunctionType.categorical_count],
            )
        }
@@ -154,3 +167,67 @@
        for x in scoring_functions:
            assert x in response.results
            assert len(response.results[x].score_rows) == 5

    @pytest.mark.asyncio
    async def test_scoring_score_with_aggregation_functions(
        self, scoring_stack, sample_judge_prompt_template
    ):
        (
            scoring_impl,
            scoring_functions_impl,
            datasetio_impl,
            datasets_impl,
            models_impl,
        ) = (
            scoring_stack[Api.scoring],
            scoring_stack[Api.scoring_functions],
            scoring_stack[Api.datasetio],
            scoring_stack[Api.datasets],
            scoring_stack[Api.models],
        )
        await register_dataset(datasets_impl)
        rows = await datasetio_impl.get_rows_paginated(
            dataset_id="test_dataset",
            rows_in_page=3,
        )
        assert len(rows.rows) == 3

        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
        scoring_functions = {}
        aggr_fns = [
            AggregationFunctionType.accuracy,
            AggregationFunctionType.median,
            AggregationFunctionType.categorical_count,
            AggregationFunctionType.average,
        ]
        for x in scoring_fns_list:
            if x.provider_id == "llm-as-judge":
                aggr_fns = [AggregationFunctionType.categorical_count]
                scoring_functions[x.identifier] = LLMAsJudgeScoringFnParams(
                    judge_model="Llama3.1-405B-Instruct",
                    prompt_template=sample_judge_prompt_template,
                    judge_score_regexes=[r"Score: (\d+)"],
                    aggregation_functions=aggr_fns,
                )
            elif x.provider_id == "basic":
                if "regex_parser" in x.identifier:
                    scoring_functions[x.identifier] = RegexParserScoringFnParams(
                        aggregation_functions=aggr_fns,
                    )
                else:
                    scoring_functions[x.identifier] = BasicScoringFnParams(
                        aggregation_functions=aggr_fns,
                    )
            else:
                scoring_functions[x.identifier] = None

        response = await scoring_impl.score(
            input_rows=rows.rows,
            scoring_functions=scoring_functions,
        )
        assert len(response.results) == len(scoring_functions)
        for x in scoring_functions:
            assert x in response.results
            assert len(response.results[x].score_rows) == len(rows.rows)
            assert len(response.results[x].aggregated_results) == len(aggr_fns)

View file

@@ -3,9 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import statistics
from typing import Any, Dict, List

from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring import AggregationFunctionType, ScoringResultRow


def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
@@ -26,3 +27,38 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
        )
        / len([_ for _ in scoring_results if _["score"] is not None]),
    }


def aggregate_categorical_count(
    scoring_results: List[ScoringResultRow],
) -> Dict[str, Any]:
    scores = [str(r["score"]) for r in scoring_results]
    unique_scores = sorted(list(set(scores)))
    return {"categorical_count": {s: scores.count(s) for s in unique_scores}}


def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    scores = [r["score"] for r in scoring_results if r["score"] is not None]
    median = statistics.median(scores) if scores else None
    return {"median": median}


# TODO: decide whether we want to make aggregation functions as a registerable resource
AGGREGATION_FUNCTIONS = {
    AggregationFunctionType.accuracy: aggregate_accuracy,
    AggregationFunctionType.average: aggregate_average,
    AggregationFunctionType.categorical_count: aggregate_categorical_count,
    AggregationFunctionType.median: aggregate_median,
}


def aggregate_metrics(
    scoring_results: List[ScoringResultRow], metrics: List[AggregationFunctionType]
) -> Dict[str, Any]:
    agg_results = {}
    for metric in metrics:
        if metric not in AGGREGATION_FUNCTIONS:
            raise ValueError(f"Aggregation function {metric} not found")
        agg_fn = AGGREGATION_FUNCTIONS[metric]
        agg_results[metric] = agg_fn(scoring_results)
    return agg_results
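
For reference, `aggregate_metrics` is a plain dispatcher over `AGGREGATION_FUNCTIONS`, so it can be exercised directly. A tiny sketch with hand-made rows, assuming only that each row carries a `"score"` key as the helpers above expect:

```
from llama_stack.apis.scoring_functions import AggregationFunctionType
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics

rows = [{"score": 1.0}, {"score": 0.0}, {"score": 1.0}]

# result keys are the AggregationFunctionType members themselves
print(aggregate_metrics(rows, [AggregationFunctionType.median]))
# -> {AggregationFunctionType.median: {'median': 1.0}}
print(aggregate_metrics(rows, [AggregationFunctionType.categorical_count]))
# -> {AggregationFunctionType.categorical_count: {'categorical_count': {'0.0': 1, '1.0': 2}}}
```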

View file

@@ -8,11 +8,12 @@ from typing import Any, Dict, List, Optional

from llama_stack.apis.scoring import ScoringFnParams, ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics


class BaseScoringFn(ABC):
    """
    Base interface class for all meta-reference scoring_fns.
    Base interface class for all native scoring_fns.
    Each scoring_fn needs to implement the following methods:
    - score_row(self, row)
    - aggregate(self, scoring_fn_results)
@@ -44,11 +45,27 @@ class BaseScoringFn(ABC):
    ) -> ScoringResultRow:
        raise NotImplementedError()

    @abstractmethod
    async def aggregate(
        self, scoring_results: List[ScoringResultRow]
        self,
        scoring_results: List[ScoringResultRow],
        scoring_fn_identifier: Optional[str] = None,
        scoring_params: Optional[ScoringFnParams] = None,
    ) -> Dict[str, Any]:
        raise NotImplementedError()
        params = self.supported_fn_defs_registry[scoring_fn_identifier].params
        if scoring_params is not None:
            if params is None:
                params = scoring_params
            else:
                params.aggregation_functions = scoring_params.aggregation_functions

        aggregation_functions = []
        if (
            params
            and hasattr(params, "aggregation_functions")
            and params.aggregation_functions
        ):
            aggregation_functions.extend(params.aggregation_functions)
        return aggregate_metrics(scoring_results, aggregation_functions)

    async def score(
        self,