mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-08-10 04:08:31 +00:00
init commit
commit 91ef7081d8
parent b56b06037c
8 changed files with 3436 additions and 1 deletion
docs/_static/llama-stack-spec.html (vendored, +1)
@@ -6323,6 +6323,7 @@
                 "type": "string",
                 "enum": [
                     "average",
+                    "weighted_average",
                     "median",
                     "categorical_count",
                     "accuracy"
docs/_static/llama-stack-spec.yaml (vendored, +1)
@@ -4404,6 +4404,7 @@ components:
           type: string
           enum:
             - average
+            - weighted_average
             - median
             - categorical_count
             - accuracy
@@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum):
 @json_schema_type
 class AggregationFunctionType(Enum):
     average = "average"
+    weighted_average = "weighted_average"
     median = "median"
     categorical_count = "categorical_count"
     accuracy = "accuracy"
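Taken together, these first three hunks register the new weighted_average value everywhere the aggregation functions are enumerated: the vendored OpenAPI spec in both its HTML and YAML forms, and the AggregationFunctionType enum in the Python API. The aggregation logic itself lands in the final hunk of this commit.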
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+ifeval = ScoringFn(
+    identifier="basic::ifeval",
+    description="Eval instruction-following ability by checking how many of the instructions in each example are followed",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="ifeval",
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.weighted_average],
+    ),
+)
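The fn_def above gives the scoring function its public identifier, basic::ifeval, and defaults its aggregation to the new weighted_average. A minimal invocation sketch, assuming the llama-stack client and a stack on the default port; the client calls, field names, and the example row are illustrative, not part of this commit:

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.scoring.score(
    input_rows=[
        {
            # Fields read by the scoring function added below.
            "prompt": "Describe the ocean. Do not use any commas.",
            "generated_answer": "The ocean is vast and deep and restless.",
            "instruction_id_list": ["punctuation:no_comma"],
            "kwargs": [{}],
        }
    ],
    scoring_functions={"basic::ifeval": None},  # None -> use the registered params
)
print(response.results["basic::ifeval"].aggregated_results)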
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
+from .fn_defs.ifeval import (
+    ifeval,
+)
+
+
+class IfEvalScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for the Instruction-Following Eval (IFEval) benchmark.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            ifeval.identifier: ifeval,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.basic.value, (
+            f"BasicScoringFnParams not found for {fn_def}."
+        )
+
+        instruction_list = input_row["instruction_id_list"]
+        generated_answer = input_row["generated_answer"].strip()
+
+        is_following_list = []
+        results = dict(
+            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
+            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
+        )
+
+        for index, instruction_id in enumerate(instruction_list):
+            instruction_cls = INSTRUCTION_DICT[instruction_id]
+            instruction = instruction_cls(instruction_id)
+            results[instruction_id + "_total"] += 1.0
+            results[instruction_id.split(":")[0] + "_total"] += 1.0
+
+            instruction.build_description(**input_row["kwargs"][index])
+            args = instruction.get_instruction_args()
+            if args and "prompt" in args:
+                instruction.build_description(prompt=input_row["prompt"])
+
+            if generated_answer and instruction.check_following(generated_answer):
+                is_following_list.append(True)
+                results[instruction_id + "_correct"] += 1.0
+                results[instruction_id.split(":")[0] + "_correct"] += 1.0
+            else:
+                is_following_list.append(False)
+
+        if len(is_following_list) == 0:
+            return {
+                "score": 0.0,
+                "weight": 0.0,
+            }
+
+        return {
+            "score": float(sum(is_following_list)) / float(len(is_following_list)),
+            "weight": float(len(is_following_list)),
+        }
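To make score_row's contract concrete, a small walkthrough with invented values (the instruction ID follows IFEval's group:name convention, and kwargs is a list parallel to instruction_id_list):

input_row = {
    "prompt": "Write two sentences about whales. Do not use any commas.",
    "generated_answer": "Whales are enormous. They sing across entire oceans.",
    "instruction_id_list": ["punctuation:no_comma"],
    "kwargs": [{}],  # no extra arguments for this instruction
}

# One instruction is checked and it is followed, so score_row yields
# {"score": 1.0, "weight": 1.0}: "score" is the fraction of instructions
# followed in this example, "weight" the number of instructions checked.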
llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py (new file, +3318)
File diff suppressed because it is too large
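The suppressed file supplies the instruction checkers that score_row dispatches on. A rough sketch of the interface it must expose; the concrete checker class and registry contents here are assumptions for illustration, not the real 3318-line implementation:

from typing import Dict, List, Optional


class NoCommaChecker:
    """Illustrative checker: the response must contain no commas."""

    def __init__(self, instruction_id: str) -> None:
        self.id = instruction_id

    def build_description(self, **kwargs) -> str:
        # Returns the instruction text; kwargs configure parameterized checks.
        return "Do not use any commas in your response."

    def get_instruction_args(self) -> Optional[dict]:
        return None  # this instruction takes no arguments

    def check_following(self, value: str) -> bool:
        return "," not in value


INSTRUCTION_DICT: Dict[str, type] = {"punctuation:no_comma": NoCommaChecker}
INSTRUCTION_LIST: List[str] = list(INSTRUCTION_DICT.keys())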
@@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.eval,
             provider_type="inline::meta-reference",
-            pip_packages=["tree_sitter"],
+            pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
             module="llama_stack.providers.inline.eval.meta_reference",
             config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
             api_dependencies=[
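The added packages presumably back individual IFEval checkers (langdetect for response-language instructions, nltk for tokenization, emoji for emoji checks, and pythainlp for Thai text), though the diff for ifeval_utils.py is suppressed, so the exact call sites are not visible here.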
@@ -28,6 +28,17 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]
     }
 
 
+def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+    return {
+        "weighted_average": sum(
+            result["score"] * result["weight"]
+            for result in scoring_results
+            if result["score"] is not None and result["weight"] is not None
+        )
+        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
+    }
+
+
 def aggregate_categorical_count(
     scoring_results: List[ScoringResultRow],
 ) -> Dict[str, Any]:
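A worked example with invented numbers, showing how the new aggregator differs from a plain average (self-contained; it mirrors the arithmetic in the hunk above):

rows = [
    {"score": 0.5, "weight": 4.0},  # e.g. 2 of 4 instructions followed
    {"score": 1.0, "weight": 6.0},  # e.g. 6 of 6 instructions followed
]

weighted = sum(r["score"] * r["weight"] for r in rows) / sum(r["weight"] for r in rows)
plain = sum(r["score"] for r in rows) / len(rows)

print(weighted)  # 0.8  -> the value aggregate_weighted_average reports
print(plain)     # 0.75 -> what a plain average of row scores would report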