diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index 92657f6b5..098fa5cc4 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -235,47 +235,6 @@ class BaseTask(ABC):
         raise NotImplementedError()
 
 
-# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]):
-#     """
-#     A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods.
-#     Base class for all evaluation tasks. Each task needs to implement the following methods:
-#         - F1: preprocess_sample(self)
-#         - F2: postprocess_sample(self)
-#         - F3: score_sample(self)
-#     """
-
-#     def __init__(self, *args, **kwargs) -> None:
-#         super().__init__(*args, **kwargs)
-#         self._name = self.__class__.__name__
-
-#     @abstractmethod
-#     def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def score_sample(self, sample: TProcessedSample) -> SingleEvalResult:
-#         raise NotImplementedError()
-
-#     @abstractmethod
-#     def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
-#         raise NotImplementedError()
-
-#     def preprocess(
-#         self, dataset: BaseDataset[TProcessedSample]
-#     ) -> List[TProcessedSample]:
-#         return [self.preprocess_sample(sample) for sample in dataset]
-
-#     def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]:
-#         return [self.postprocess_sample(sample) for sample in generation]
-
-#     def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]:
-#         return [self.score_sample(sample) for sample in postprocessed]
-
-
 class Evals(Protocol):
     @webmethod(route="/evals/run_eval_task")
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index 0fbce823e..411aa0bc2 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -136,55 +136,3 @@ class MetaReferenceEvalsImpl(Evals):
         return EvaluateResponse(
             eval_result={},
         )
-
-    # async def run_evals(
-    #     self,
-    #     model: str,
-    #     task: str,
-    #     dataset: Optional[str] = None,
-    #     eval_task_config: Optional[EvaluateTaskConfig] = None,
-    # ) -> EvaluateResponse:
-    #     cprint(
-    #         f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
-    #         "red",
-    #     )
-    #     if not dataset:
-    #         raise ValueError("dataset must be specified for mete-reference evals")
-
-    #     dataset = DatasetRegistry.get_dataset(dataset)
-    #     dataset.load()
-
-    #     task_impl = TaskRegistry.get_task(task)()
-    #     preprocessed = task_impl.preprocess(dataset)
-
-    #     # TODO: replace w/ batch inference & async return eval job
-    #     generation_outputs = []
-    #     if eval_task_config is None:
-    #         eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed))
-    #     if eval_task_config.n_samples is None or eval_task_config.n_samples > len(
-    #         preprocessed
-    #     ):
-    #         eval_task_config.n_samples = len(preprocessed)
-
-    #     print(
-    #         f"Eval generation start, generate on {eval_task_config.n_samples} samples"
-    #     )
-
-    #     for sample in preprocessed[: eval_task_config.n_samples]:
-    #         print("generation: ", sample)
-    #         response = await self.inference_api.chat_completion(
-    #             model=model,
-    #             messages=sample.preprocessed["messages"],
-    #             stream=False,
-    #         )
-    #         sample.prediction = PredictionSample(
-    #             completion_message=response.completion_message.content
-    #         )
-    #         generation_outputs.append(sample)
-
-    #     postprocessed = task_impl.postprocess(generation_outputs)
-    #     eval_results = task_impl.score(postprocessed)
-    #     aggr_result = task_impl.aggregate_results(eval_results)
-    #     return EvaluateResponse(
-    #         eval_result=aggr_result,
-    #     )