mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-08-10 04:08:31 +00:00
init commit
commit 91ef7081d8
parent b56b06037c
8 changed files with 3436 additions and 1 deletion
docs/_static/llama-stack-spec.html (vendored, +1)
@@ -6323,6 +6323,7 @@
                 "type": "string",
                 "enum": [
                     "average",
+                    "weighted_average",
                     "median",
                     "categorical_count",
                     "accuracy"
docs/_static/llama-stack-spec.yaml (vendored, +1)
@@ -4404,6 +4404,7 @@ components:
           type: string
           enum:
             - average
+            - weighted_average
             - median
             - categorical_count
             - accuracy
@@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum):
 @json_schema_type
 class AggregationFunctionType(Enum):
     average = "average"
+    weighted_average = "weighted_average"
     median = "median"
     categorical_count = "categorical_count"
     accuracy = "accuracy"
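Taken together, these first three hunks register the new weighted_average value everywhere the aggregation functions are enumerated: the vendored OpenAPI spec in both its HTML and YAML forms, and the AggregationFunctionType enum in the Python API. The aggregation logic itself lands in the final hunk of this commit.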
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+ifeval = ScoringFn(
+    identifier="basic::ifeval",
+    description="Eval instruction-following ability by checking how many of the instructions in each example are followed",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="ifeval",
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.weighted_average],
+    ),
+)
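The fn_def above gives the scoring function its public identifier, basic::ifeval, and defaults its aggregation to the new weighted_average. A minimal invocation sketch, assuming the llama-stack client and a stack on the default port; the client calls, field names, and the example row are illustrative, not part of this commit:

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.scoring.score(
    input_rows=[
        {
            # Fields read by the scoring function added below.
            "prompt": "Describe the ocean. Do not use any commas.",
            "generated_answer": "The ocean is vast and deep and restless.",
            "instruction_id_list": ["punctuation:no_comma"],
            "kwargs": [{}],
        }
    ],
    scoring_functions={"basic::ifeval": None},  # None -> use the registered params
)
print(response.results["basic::ifeval"].aggregated_results)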
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
+from .fn_defs.ifeval import (
+    ifeval,
+)
+
+
+class IfEvalScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for the Instruction-Following Eval (IFEval) benchmark.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            ifeval.identifier: ifeval,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.basic.value, (
+            f"BasicScoringFnParams not found for {fn_def}."
+        )
+
+        instruction_list = input_row["instruction_id_list"]
+        generated_answer = input_row["generated_answer"].strip()
+
+        is_following_list = []
+        results = dict(
+            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
+            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
+        )
+
+        for index, instruction_id in enumerate(instruction_list):
+            instruction_cls = INSTRUCTION_DICT[instruction_id]
+            instruction = instruction_cls(instruction_id)
+            results[instruction_id + "_total"] += 1.0
+            results[instruction_id.split(":")[0] + "_total"] += 1.0
+
+            instruction.build_description(**input_row["kwargs"][index])
+            args = instruction.get_instruction_args()
+            if args and "prompt" in args:
+                instruction.build_description(prompt=input_row["prompt"])
+
+            if generated_answer and instruction.check_following(generated_answer):
+                is_following_list.append(True)
+                results[instruction_id + "_correct"] += 1.0
+                results[instruction_id.split(":")[0] + "_correct"] += 1.0
+            else:
+                is_following_list.append(False)
+
+        if len(is_following_list) == 0:
+            return {
+                "score": 0.0,
+                "weight": 0.0,
+            }
+
+        return {
+            "score": float(sum(is_following_list)) / float(len(is_following_list)),
+            "weight": float(len(is_following_list)),
+        }
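To make score_row's contract concrete, a small walkthrough with invented values (the instruction ID follows IFEval's group:name convention, and kwargs is a list parallel to instruction_id_list):

input_row = {
    "prompt": "Write two sentences about whales. Do not use any commas.",
    "generated_answer": "Whales are enormous. They sing across entire oceans.",
    "instruction_id_list": ["punctuation:no_comma"],
    "kwargs": [{}],  # no extra arguments for this instruction
}

# One instruction is checked and it is followed, so score_row yields
# {"score": 1.0, "weight": 1.0}: "score" is the fraction of instructions
# followed in this example, "weight" the number of instructions checked.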
llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py (new file, +3318)
File diff suppressed because it is too large
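The suppressed file supplies the instruction checkers that score_row dispatches on. A rough sketch of the interface it must expose; the concrete checker class and registry contents here are assumptions for illustration, not the real 3318-line implementation:

from typing import Dict, List, Optional


class NoCommaChecker:
    """Illustrative checker: the response must contain no commas."""

    def __init__(self, instruction_id: str) -> None:
        self.id = instruction_id

    def build_description(self, **kwargs) -> str:
        # Returns the instruction text; kwargs configure parameterized checks.
        return "Do not use any commas in your response."

    def get_instruction_args(self) -> Optional[dict]:
        return None  # this instruction takes no arguments

    def check_following(self, value: str) -> bool:
        return "," not in value


INSTRUCTION_DICT: Dict[str, type] = {"punctuation:no_comma": NoCommaChecker}
INSTRUCTION_LIST: List[str] = list(INSTRUCTION_DICT.keys())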
@@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.eval,
             provider_type="inline::meta-reference",
-            pip_packages=["tree_sitter"],
+            pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
             module="llama_stack.providers.inline.eval.meta_reference",
             config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
             api_dependencies=[
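The added packages presumably back individual IFEval checkers (langdetect for response-language instructions, nltk for tokenization, emoji for emoji checks, and pythainlp for Thai text), though the diff for ifeval_utils.py is suppressed, so the exact call sites are not visible here.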
@@ -28,6 +28,17 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]
     }
 
 
+def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+    return {
+        "weighted_average": sum(
+            result["score"] * result["weight"]
+            for result in scoring_results
+            if result["score"] is not None and result["weight"] is not None
+        )
+        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
+    }
+
+
 def aggregate_categorical_count(
     scoring_results: List[ScoringResultRow],
 ) -> Dict[str, Any]:
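A worked example with invented numbers, showing how the new aggregator differs from a plain average (self-contained; it mirrors the arithmetic in the hunk above):

rows = [
    {"score": 0.5, "weight": 4.0},  # e.g. 2 of 4 instructions followed
    {"score": 1.0, "weight": 6.0},  # e.g. 6 of 6 instructions followed
]

weighted = sum(r["score"] * r["weight"] for r in rows) / sum(r["weight"] for r in rows)
plain = sum(r["score"] for r in rows) / len(rows)

print(weighted)  # 0.8  -> the value aggregate_weighted_average reports
print(plain)     # 0.75 -> what a plain average of row scores would report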