scorer only api

Xi Yan 2024-10-14 17:46:29 -07:00
parent a22c31b8a4
commit fcb8dea1ef
8 changed files with 184 additions and 27 deletions

View file

@@ -92,8 +92,14 @@ class HuggingfaceDatasetDef(BaseModel):
     identifier: str = Field(
         description="A unique name for the dataset",
     )
-    dataset_name: str = Field(
-        description="The name of the dataset into HF (e.g. hellawag)",
+    dataset_path: str = Field(
+        description="The name of the dataset into HF (e.g. meta-llama/Llama-3.1-8B-Instruct-evals)",
+    )
+    dataset_name: Optional[str] = Field(
+        description="The name of the dataset into HF (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)",
+    )
+    rename_columns_map: Optional[Dict[str, str]] = Field(
+        description="A map of column names to rename to fit the schema of eval dataset for scoring",
     )
     kwargs: Dict[str, Any] = Field(
         description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",

View file

@@ -51,34 +51,84 @@ class EvaluationClient(Evals):
             response.raise_for_status()
             return EvaluateResponse(**response.json())
 
+    async def run_scorer(
+        self,
+        dataset_config: EvaluateDatasetConfig,
+        eval_scoring_config: EvaluateScoringConfig,
+    ) -> EvaluateResponse:
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                f"{self.base_url}/evals/run_scorer",
+                json={
+                    "dataset_config": json.loads(dataset_config.json()),
+                    "eval_scoring_config": json.loads(eval_scoring_config.json()),
+                },
+                headers={"Content-Type": "application/json"},
+                timeout=3600,
+            )
+            response.raise_for_status()
+            return EvaluateResponse(**response.json())
+
 
 async def run_main(host: str, port: int):
     client = EvaluationClient(f"http://{host}:{port}")
     dataset_client = DatasetsClient(f"http://{host}:{port}")
 
-    # Custom Eval Task
-    # 1. register custom dataset
+    # Full Eval Task
+    # # 1. register custom dataset
+    # response = await dataset_client.create_dataset(
+    #     dataset_def=CustomDatasetDef(
+    #         identifier="mmlu-simple-eval-en",
+    #         url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+    #     ),
+    # )
+    # cprint(f"datasets/create: {response}", "cyan")
+    # # 2. run evals on the registered dataset
+    # response = await client.run_evals(
+    #     model="Llama3.1-8B-Instruct",
+    #     dataset="mmlu-simple-eval-en",
+    #     task="mmlu",
+    # )
+    # if response.formatted_report:
+    #     cprint(response.formatted_report, "green")
+    # else:
+    #     cprint(f"Response: {response}", "green")
+
+    # Scoring Task
+    # 1. register huggingface dataset
     response = await dataset_client.create_dataset(
-        dataset_def=CustomDatasetDef(
-            identifier="mmlu-simple-eval-en",
-            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
-        ),
+        dataset_def=HuggingfaceDatasetDef(
+            identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+            dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            rename_columns_map={
+                "output_parsed_answer": "generated_answer",
+                "input_correct_responses": "expected_answer",
+            },
+            kwargs={"split": "latest"},
+        )
     )
-    cprint(f"datasets/create: {response}", "cyan")
+    cprint(response, "cyan")
 
     # 2. run evals on the registered dataset
-    response = await client.run_evals(
-        model="Llama3.1-8B-Instruct",
-        dataset="mmlu-simple-eval-en",
-        task="mmlu",
+    response = await client.run_scorer(
+        dataset_config=EvaluateDatasetConfig(
+            dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            row_limit=10,
+        ),
+        eval_scoring_config=EvaluateScoringConfig(
+            scorer_config_list=[
+                EvaluateSingleScorerConfig(scorer_name="accuracy"),
+            ]
+        ),
     )
-    if response.formatted_report:
-        cprint(response.formatted_report, "green")
-    else:
-        cprint(f"Response: {response}", "green")
+    cprint(response, "green")
 
     # Eleuther Eval Task
     # response = await client.run_evals(
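Under the hood the new client method is a single POST to /evals/run_scorer. A standalone httpx sketch of the equivalent raw request, assuming the stack is serving on localhost:5000 (host and port are placeholders) and that the two Pydantic configs serialize to the JSON shown:

import asyncio

import httpx


async def call_run_scorer() -> dict:
    payload = {
        "dataset_config": {
            "dataset_identifier": "Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
            "row_limit": 10,
        },
        "eval_scoring_config": {
            "scorer_config_list": [{"scorer_name": "accuracy"}],
        },
    }
    async with httpx.AsyncClient() as client:
        # same endpoint, headers, and timeout as EvaluationClient.run_scorer
        response = await client.post(
            "http://localhost:5000/evals/run_scorer",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=3600,
        )
        response.raise_for_status()
        return response.json()


if __name__ == "__main__":
    print(asyncio.run(call_run_scorer()))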

View file

@@ -66,7 +66,7 @@ class EvaluationJobCreateResponse(BaseModel):
 @json_schema_type
 class EvaluateDatasetConfig(BaseModel):
     # identifier to previously registered dataset via DatasetDef
-    dataset_name: str
+    dataset_identifier: str
     # limit number of rows to evaluate
     row_limit: Optional[int] = None
     kwargs: Optional[Dict[str, Any]] = None

View file

@@ -72,7 +72,18 @@ class HuggingfaceDataset(BaseDataset[DictSample]):
             self.load()
         return len(self.dataset)
 
-    def load(self):
+    def load(self, n_samples: Optional[int] = None):
         if self.dataset:
             return
-        self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs)
+
+        if self.config.dataset_name:
+            self.config.kwargs["name"] = self.config.dataset_name
+
+        self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs)
+
+        if n_samples:
+            self.dataset = self.dataset.select(range(n_samples))
+
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)
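The n_samples handling leans on datasets.Dataset.select taking an iterable of row indices. A tiny self-contained check with toy data (not the eval dataset), assuming only the `datasets` package:

from datasets import Dataset

ds = Dataset.from_dict(
    {
        "generated_answer": ["A", "B", "C"],
        "expected_answer": ["A", "B", "D"],
    }
)

row_limit = 2
if row_limit:
    # keep only the first row_limit rows, mirroring load(n_samples=...)
    ds = ds.select(range(row_limit))

print(len(ds))  # 2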

View file

@@ -13,6 +13,7 @@ from llama_stack.apis.datasets import *  # noqa: F403
 
 from .config import MetaReferenceEvalsImplConfig
 from .tasks.run_eval_task import RunEvalTask
+from .tasks.run_scoring_task import RunScoringTask
 
 
 class MetaReferenceEvalsImpl(Evals):
@@ -44,7 +45,7 @@ class MetaReferenceEvalsImpl(Evals):
         # construct eval task config from inputs
         eval_task_config = EvaluateTaskConfig(
             dataset_config=EvaluateDatasetConfig(
-                dataset_name=dataset,
+                dataset_identifier=dataset,
                 row_limit=3,
             ),
             processor_config=EvaluateProcessorConfig(
@@ -76,8 +77,10 @@ class MetaReferenceEvalsImpl(Evals):
     ) -> EvaluateResponse:
         cprint("run_scorer")
 
-        # main logic, we need to convert the datset into List[ScorerInputSample]
+        run_task = RunScoringTask()
+        eval_result = await run_task.run(dataset_config, eval_scoring_config)
 
         return EvaluateResponse(
-            eval_result={},
+            eval_result=eval_result,
+            formatted_report=json.dumps(eval_result.json(), indent=4),
         )

View file

@@ -31,9 +31,14 @@ class AccuracyScorer(BaseScorer[ScorerInputSample]):
         extracted_answer = scorer_input_sample.generated_answer
         expected_answer = scorer_input_sample.expected_answer
 
-        accuracy = (
-            1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
-        )
+        if isinstance(expected_answer, list):
+            accuracy = (
+                1.0 if extracted_answer and extracted_answer in expected_answer else 0.0
+            )
+        else:
+            accuracy = (
+                1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
+            )
 
         return SingleEvalResult(score_data={"accuracy": accuracy})
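A quick standalone check of the new accuracy semantics: exact match against a single expected answer, or membership when the dataset provides a list of acceptable answers (plain Python, no llama-stack imports; the helper name is illustrative):

from typing import List, Union


def accuracy_score(generated: str, expected: Union[str, List[str]]) -> float:
    # list-valued expected answers count a match on membership
    if isinstance(expected, list):
        return 1.0 if generated and generated in expected else 0.0
    return 1.0 if generated and generated == expected else 0.0


assert accuracy_score("B", ["A", "B"]) == 1.0
assert accuracy_score("C", ["A", "B"]) == 0.0
assert accuracy_score("A", "A") == 1.0
assert accuracy_score("", "A") == 0.0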

View file

@@ -43,7 +43,9 @@ class RunEvalTask(BaseTask):
         print(f"Running eval task w/ {eval_task_config}")
         print(DatasetRegistry.names())
-        dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name)
+        dataset = DatasetRegistry.get(
+            eval_task_config.dataset_config.dataset_identifier
+        )
         dataset.load(n_samples=eval_task_config.dataset_config.row_limit)
         print(f"Running on {len(dataset)} samples")

View file

@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
+
+from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.apis.inference import *  # noqa: F403
+from termcolor import cprint
+
+
+class RunScoringTask(BaseTask):
+    """
+    RunScoringTask - only run scoring (F3) based on dataset and scoring config
+    """
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+    def transform_score_input_sample(
+        self, dataset: BaseDataset
+    ) -> List[ScorerInputSample]:
+        scorer_inputs = []
+        for x in dataset:
+            expected_answer = x.data["expected_answer"]
+            generated_answer = x.data["generated_answer"]
+
+            scorer_inputs.append(
+                ScorerInputSample(
+                    expected_answer=expected_answer,
+                    generated_answer=generated_answer,
+                )
+            )
+
+        return scorer_inputs
+
+    async def run(
+        self,
+        dataset_config: EvaluateDatasetConfig,
+        eval_scoring_config: EvaluateScoringConfig,
+        *args,
+        **kwargs,
+    ) -> EvalResult:
+        print(
+            f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}"
+        )
+
+        dataset = DatasetRegistry.get(dataset_config.dataset_identifier)
+        dataset.load(n_samples=dataset_config.row_limit)
+        print(f"Running on {len(dataset)} samples")
+
+        # transform dataset into
+        postprocessed = self.transform_score_input_sample(dataset)
+        cprint(postprocessed, "blue")
+
+        # F3 - scorer
+        scorer_config_list = eval_scoring_config.scorer_config_list
+        scorer_list = []
+        for s_conf in scorer_config_list:
+            scorer = ScorerRegistry.get(s_conf.scorer_name)
+            scorer_list.append(scorer())
+
+        scorer = AggregateScorer(
+            scorers=scorer_list,
+        )
+
+        scorer_results = scorer.score(postprocessed)
+        cprint(scorer_results, "magenta")
+        eval_result = scorer.aggregate_results(scorer_results)
+
+        return eval_result
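For context, a hedged sketch of exercising RunScoringTask directly in-process rather than through the /evals/run_scorer endpoint. It assumes a dataset has already been registered under the identifier below, that the Evaluate* config classes are exported by llama_stack.apis.evals (as the wildcard import above suggests), and that the module path follows the other meta_reference.evals imports in this commit:

import asyncio

from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.providers.impls.meta_reference.evals.tasks.run_scoring_task import (
    RunScoringTask,
)


async def main() -> None:
    task = RunScoringTask()
    eval_result = await task.run(
        dataset_config=EvaluateDatasetConfig(
            dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
            row_limit=10,
        ),
        eval_scoring_config=EvaluateScoringConfig(
            scorer_config_list=[
                EvaluateSingleScorerConfig(scorer_name="accuracy"),
            ]
        ),
    )
    print(eval_result)


if __name__ == "__main__":
    asyncio.run(main())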