Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-07-28 15:02:37 +00:00
cleanup original BaseTask

commit 8890de7322 (parent a25aff290e)

2 changed files with 0 additions and 93 deletions
@@ -235,47 +235,6 @@ class BaseTask(ABC):
         raise NotImplementedError()
 
 
-# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]):
-#     """
-#     A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods.
-#     Base class for all evaluation tasks. Each task needs to implement the following methods:
-#     - F1: preprocess_sample(self)
-#     - F2: postprocess_sample(self)
-#     - F3: score_sample(self)
-#     """
-
-#     def __init__(self, *args, **kwargs) -> None:
-#         super().__init__(*args, **kwargs)
-#         self._name = self.__class__.__name__
-
-#     @abstractmethod
-#     def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def score_sample(self, sample: TProcessedSample) -> SingleEvalResult:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
-#         raise NotImplementedError()
-
-#     def preprocess(
-#         self, dataset: BaseDataset[TProcessedSample]
-#     ) -> List[TProcessedSample]:
-#         return [self.preprocess_sample(sample) for sample in dataset]
-
-#     def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]:
-#         return [self.postprocess_sample(sample) for sample in generation]
-
-#     def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]:
-#         return [self.score_sample(sample) for sample in postprocessed]
-
-
 class Evals(Protocol):
 
     @webmethod(route="/evals/run_eval_task")
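For context, the commented-out block removed above described a generic preprocess -> generate -> postprocess -> score -> aggregate pipeline. Below is a minimal, self-contained sketch of that shape; `Sample`, `Task`, and `ExactMatchTask` are illustrative stand-ins, not types from llama-stack.

```python
# A minimal sketch of the pipeline shape the removed BaseTask described:
# preprocess -> generate -> postprocess -> score -> aggregate.
# Sample, Task, and ExactMatchTask are illustrative stand-ins, not llama-stack types.
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Sample:
    question: str
    expected: str
    prediction: str = ""


class Task(ABC):
    """Per-sample hooks are abstract; the batch helpers just map over them."""

    @abstractmethod
    def preprocess_sample(self, sample: Sample) -> Sample: ...

    @abstractmethod
    def postprocess_sample(self, sample: Sample) -> Sample: ...

    @abstractmethod
    def score_sample(self, sample: Sample) -> float: ...

    def preprocess(self, dataset: List[Sample]) -> List[Sample]:
        return [self.preprocess_sample(s) for s in dataset]

    def postprocess(self, generations: List[Sample]) -> List[Sample]:
        return [self.postprocess_sample(s) for s in generations]

    def score(self, postprocessed: List[Sample]) -> List[float]:
        return [self.score_sample(s) for s in postprocessed]

    def aggregate_results(self, scores: List[float]) -> Dict[str, float]:
        return {"accuracy": sum(scores) / len(scores) if scores else 0.0}


class ExactMatchTask(Task):
    """Scores 1.0 when the normalized prediction equals the expected answer."""

    def preprocess_sample(self, sample: Sample) -> Sample:
        sample.question = sample.question.strip()
        return sample

    def postprocess_sample(self, sample: Sample) -> Sample:
        sample.prediction = sample.prediction.strip().lower()
        return sample

    def score_sample(self, sample: Sample) -> float:
        return 1.0 if sample.prediction == sample.expected.strip().lower() else 0.0
```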
@@ -136,55 +136,3 @@ class MetaReferenceEvalsImpl(Evals):
         return EvaluateResponse(
             eval_result={},
         )
-
-    # async def run_evals(
-    #     self,
-    #     model: str,
-    #     task: str,
-    #     dataset: Optional[str] = None,
-    #     eval_task_config: Optional[EvaluateTaskConfig] = None,
-    # ) -> EvaluateResponse:
-    #     cprint(
-    #         f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
-    #         "red",
-    #     )
-    #     if not dataset:
-    #         raise ValueError("dataset must be specified for mete-reference evals")
-
-    #     dataset = DatasetRegistry.get_dataset(dataset)
-    #     dataset.load()
-
-    #     task_impl = TaskRegistry.get_task(task)()
-    #     preprocessed = task_impl.preprocess(dataset)
-
-    #     # TODO: replace w/ batch inference & async return eval job
-    #     generation_outputs = []
-    #     if eval_task_config is None:
-    #         eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed))
-    #     if eval_task_config.n_samples is None or eval_task_config.n_samples > len(
-    #         preprocessed
-    #     ):
-    #         eval_task_config.n_samples = len(preprocessed)
-
-    #     print(
-    #         f"Eval generation start, generate on {eval_task_config.n_samples} samples"
-    #     )
-
-    #     for sample in preprocessed[: eval_task_config.n_samples]:
-    #         print("generation: ", sample)
-    #         response = await self.inference_api.chat_completion(
-    #             model=model,
-    #             messages=sample.preprocessed["messages"],
-    #             stream=False,
-    #         )
-    #         sample.prediction = PredictionSample(
-    #             completion_message=response.completion_message.content
-    #         )
-    #         generation_outputs.append(sample)
-
-    #     postprocessed = task_impl.postprocess(generation_outputs)
-    #     eval_results = task_impl.score(postprocessed)
-    #     aggr_result = task_impl.aggregate_results(eval_results)
-    #     return EvaluateResponse(
-    #         eval_result=aggr_result,
-    #     )
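The removed run_evals body wired that pipeline to inference: resolve the dataset and task, generate one prediction per sample via chat completion, then postprocess, score, and aggregate. The following is a hedged sketch of the same control flow, reusing the `Task` and `Sample` stand-ins from the sketch above and a generic async chat callable in place of the llama-stack inference API.

```python
# Sketch of the control flow the removed run_evals implemented, reusing the
# Task/Sample stand-ins from the previous sketch. `chat` is a generic async
# chat-completion callable standing in for the llama-stack inference API.
from typing import Awaitable, Callable, Dict, List, Optional

ChatFn = Callable[[str, List[Dict[str, str]]], Awaitable[str]]


async def run_eval(
    model: str,
    task: Task,
    dataset: List[Sample],
    chat: ChatFn,
    n_samples: Optional[int] = None,
) -> Dict[str, float]:
    preprocessed = task.preprocess(dataset)
    # Clamp n_samples to the dataset size, as the removed code did.
    n = len(preprocessed) if n_samples is None else min(n_samples, len(preprocessed))

    # Sequential generation; the removed code carried a TODO to replace this
    # loop with batch inference and an async eval job.
    generations: List[Sample] = []
    for sample in preprocessed[:n]:
        messages = [{"role": "user", "content": sample.question}]
        sample.prediction = await chat(model, messages)
        generations.append(sample)

    postprocessed = task.postprocess(generations)
    return task.aggregate_results(task.score(postprocessed))
```

Driven with asyncio.run and any stub chat callable, this returns the aggregated result dict (here just an accuracy entry), mirroring the aggregate step of the removed method.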