Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-07-30 07:39:38 +00:00
Merge branch 'evals_8' into evals_9
Commit 0dad0d0d67
11 changed files with 93 additions and 47 deletions
@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 from llama_stack.apis.scoring_functions import * # noqa: F401, F403
 from llama_stack.apis.scoring import * # noqa: F401, F403
-import json


 class BaseScoringFn(ABC):
@@ -21,16 +20,11 @@ class BaseScoringFn(ABC):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.supported_fn_defs_registry = {}
-        self.defs_paths = []

     def __str__(self) -> str:
         return self.__class__.__name__

-    async def initialize(self) -> None:
-        for f in self.defs_paths:
-            with open(f, "r") as f:
-                scoring_fn_def = ScoringFnDef(**json.load(f))
-                self.register_scoring_fn_def(scoring_fn_def)
+    async def initialize(self) -> None: ...

     def get_supported_scoring_fn_defs(self) -> List[ScoringFnDef]:
         return [x for x in self.supported_fn_defs_registry.values()]
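With the JSON-loading loop removed from initialize(), each BaseScoringFn subclass is now expected to populate supported_fn_defs_registry itself, typically in its own __init__, which is exactly what the remaining hunks in this commit do. Below is a minimal sketch of the new pattern; ExampleScoringFn and example_fn_def are hypothetical names, and the exact module path of BaseScoringFn is not shown in this diff:

    from llama_stack.apis.common.type_system import NumberType
    from llama_stack.apis.scoring_functions import ScoringFnDef

    # Hypothetical definition, mirroring the fn_defs modules added later in this commit.
    example_fn_def = ScoringFnDef(
        identifier="meta-reference::example",
        description="Hypothetical scoring function used only to illustrate the pattern.",
        parameters=[],
        return_type=NumberType(),
    )


    class ExampleScoringFn(BaseScoringFn):  # BaseScoringFn as defined in the hunks above
        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # Register definitions in code instead of loading JSON files at initialize().
            self.supported_fn_defs_registry = {
                example_fn_def.identifier: example_fn_def,
            }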
@@ -13,7 +13,10 @@ from llama_stack.apis.common.type_system import * # noqa: F403

 from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
     aggregate_accuracy,
-    FN_DEFS_PATH,
+)
+
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.fn_defs.equality import (
+    equality_fn_def,
 )


@@ -24,7 +27,9 @@ class EqualityScoringFn(BaseScoringFn):

     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.defs_paths = [FN_DEFS_PATH / "equality.json"]
+        self.supported_fn_defs_registry = {
+            equality_fn_def.identifier: equality_fn_def,
+        }

     async def score_row(
         self,
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
@@ -1,10 +0,0 @@
-{
-    "identifier": "meta-reference::equality",
-    "description": "Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
-    "metadata": {},
-    "parameters": [],
-    "return_type": {
-        "type": "number"
-    },
-    "context": null
-}
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import ScoringFnDef
+
+
+equality_fn_def = ScoringFnDef(
+    identifier="meta-reference::equality",
+    description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
+    parameters=[],
+    return_type=NumberType(),
+)
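The row-level scoring logic for equality is not part of this diff; for orientation, a check consistent with the description above might look like the sketch below (the input_row keys and the returned dict shape are assumptions, not taken from this commit):

    from typing import Any, Dict


    async def score_row(input_row: Dict[str, Any]) -> Dict[str, Any]:
        # Assumed keys: "expected_answer" (ground truth) and "generated_answer" (model output).
        expected = input_row["expected_answer"]
        generated = input_row["generated_answer"]
        # 1.0 if the generated answer exactly equals the expected answer, 0.0 otherwise.
        return {"score": 1.0 if generated == expected else 0.0}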
@@ -1,14 +0,0 @@
-{
-    "identifier": "meta-reference::llm_as_judge_8b_correctness",
-    "description": "Llm As Judge Scoring Function",
-    "metadata": {},
-    "parameters": [],
-    "return_type": {
-        "type": "number"
-    },
-    "context": {
-        "judge_model": "Llama3.1-8B-Instruct",
-        "prompt_template": "\nYou will be given a question, a expected_answer, and a system_answer.\nYour task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.\nGive your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.\nProvide your feedback as follows:\nFeedback:::\nTotal rating: (your rating, as a int between 0 and 5)\nNow here are the question, expected_answer, system_answer.\nQuestion: {input_query}\nExpected Answer: {expected_answer}\nSystem Answer: {generated_answer}\nFeedback:::\nTotal rating:\n",
-        "judge_score_regex": ["Total rating: (\\d+)", "rating: (\\d+)", "Rating: (\\d+)"]
-    }
-}
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.scoring_functions import * # noqa: F401, F403
+from llama_stack.apis.scoring import * # noqa: F401, F403
+from llama_stack.apis.common.type_system import NumberType
+
+JUDGE_PROMPT = """
+You will be given a question, a expected_answer, and a system_answer.
+Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
+Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
+Provide your feedback as follows:
+Feedback:::
+Total rating: (your rating, as a int between 0 and 5)
+Now here are the question, expected_answer, system_answer.
+Question: {input_query}
+Expected Answer: {expected_answer}
+System Answer: {generated_answer}
+Feedback:::
+Total rating:
+"""
+llm_as_judge_8b_correctness_fn_def = ScoringFnDef(
+    identifier="meta-reference::llm_as_judge_8b_correctness",
+    description="Llm As Judge Scoring Function",
+    parameters=[],
+    return_type=NumberType(),
+    context=LLMAsJudgeContext(
+        prompt_template=JUDGE_PROMPT,
+        judge_model="Llama3.1-8B-Instruct",
+        judge_score_regex=[r"Total rating: (\d+)", r"rating: (\d+)", r"Rating: (\d+)"],
+    ),
+)
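For context, judge_score_regex lists patterns from most to least specific; a plausible way to pull the numeric rating out of the judge model's completion is sketched below (extract_rating is a hypothetical helper, not part of this commit):

    import re
    from typing import List, Optional


    def extract_rating(judge_output: str, patterns: List[str]) -> Optional[int]:
        # Try each pattern in order and return the first captured rating as an int.
        for pattern in patterns:
            match = re.search(pattern, judge_output)
            if match:
                return int(match.group(1))
        return None


    # e.g. extract_rating("Feedback:::\nTotal rating: 4", [r"Total rating: (\d+)"]) == 4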
@@ -1,10 +0,0 @@
-{
-    "identifier": "meta-reference::subset_of",
-    "description": "Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
-    "metadata": {},
-    "parameters": [],
-    "return_type": {
-        "type": "number"
-    },
-    "context": null
-}
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import ScoringFnDef
+
+
+subset_of_fn_def = ScoringFnDef(
+    identifier="meta-reference::subset_of",
+    description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
+    parameters=[],
+    return_type=NumberType(),
+)
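Per the description, subset_of is a containment check rather than an exact match; a hedged sketch of the corresponding row-level comparison (the argument names are illustrative, not taken from this commit):

    def subset_of_score(expected_answer: str, generated_answer: str) -> float:
        # 1.0 if the expected answer appears anywhere inside the generated answer, else 0.0.
        return 1.0 if expected_answer in generated_answer else 0.0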
@@ -14,7 +14,9 @@ import re

 from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
     aggregate_average,
-    FN_DEFS_PATH,
+)
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.fn_defs.llm_as_judge_8b_correctness import (
+    llm_as_judge_8b_correctness_fn_def,
 )


@@ -26,7 +28,9 @@ class LlmAsJudgeScoringFn(BaseScoringFn):
     def __init__(self, inference_api: Inference, *arg, **kwargs) -> None:
         super().__init__(*arg, **kwargs)
         self.inference_api = inference_api
-        self.defs_paths = [FN_DEFS_PATH / "llm_as_judge_8b_correctness.json"]
+        self.supported_fn_defs_registry = {
+            llm_as_judge_8b_correctness_fn_def.identifier: llm_as_judge_8b_correctness_fn_def,
+        }

     async def score_row(
         self,
@@ -12,7 +12,10 @@ from llama_stack.apis.scoring import * # noqa: F401, F403
 from llama_stack.apis.common.type_system import * # noqa: F403
 from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
     aggregate_accuracy,
-    FN_DEFS_PATH,
+)
+
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.fn_defs.subset_of import (
+    subset_of_fn_def,
 )


@@ -23,7 +26,9 @@ class SubsetOfScoringFn(BaseScoringFn):

     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.defs_paths = [FN_DEFS_PATH / "subset_of.json"]
+        self.supported_fn_defs_registry = {
+            subset_of_fn_def.identifier: subset_of_fn_def,
+        }

     async def score_row(
         self,
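Taken together, these changes let the scoring provider enumerate every supported definition straight from the in-code registries, with no filesystem reads at initialize(). A small usage sketch; the bare instantiations are illustrative only (LlmAsJudgeScoringFn, for example, requires an inference_api argument as shown above):

    # Collect the definitions exposed by each scoring function instance.
    scoring_fns = [EqualityScoringFn(), SubsetOfScoringFn()]

    all_defs = []
    for fn in scoring_fns:
        all_defs.extend(fn.get_supported_scoring_fn_defs())

    for fn_def in all_defs:
        print(fn_def.identifier)  # "meta-reference::equality", "meta-reference::subset_of"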