wip add datatypes

Xi Yan 2024-10-10 19:56:19 -07:00
parent 99ed1425fc
commit 9816c9aae6
5 changed files with 175 additions and 57 deletions


@@ -4,10 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Protocol
+from abc import ABC, abstractmethod
+from typing import Dict, Generic, List, Optional, Protocol

 from llama_models.schema_utils import webmethod
 from pydantic import BaseModel

 from llama_models.llama3.api.datatypes import *  # noqa: F403
@@ -22,19 +22,26 @@ class EvaluationJobLogStream(BaseModel):

     job_uuid: str


-class EvaluateTaskConfig(BaseModel):
-    # num examples to evaluate, evaluate all if None
-    n_samples: Optional[int] = None
-    # model evaluation params
-    sampling_params: SamplingParams = SamplingParams()


+@json_schema_type
+class EvalResult(BaseModel):
+    """Evaluation result."""
+
+    metrics: Dict[str, str]
+
+
+@json_schema_type
+class SingleEvalResult(BaseModel):
+    """Single evaluation result."""
+
+    score_data: Dict[str, float]
+
+
 @json_schema_type
 class EvaluateResponse(BaseModel):
     """Scores for evaluation."""

+    preprocess_output: GenerationOutput
     metrics: Dict[str, str]
+    eval_result: EvalResult
     formatted_report: Optional[str] = None


 @json_schema_type
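The intended composition, as far as this hunk shows: SingleEvalResult carries one sample's scores, a task aggregates them into an EvalResult, and EvaluateResponse wraps that result. A minimal sketch (illustrative values; note that metrics is declared Dict[str, str], so aggregates are serialized to strings here):

    # Per-sample scores, as produced by a task's score_sample().
    per_sample = [
        SingleEvalResult(score_data={"accuracy": 1.0}),
        SingleEvalResult(score_data={"accuracy": 0.0}),
        SingleEvalResult(score_data={"accuracy": 1.0}),
    ]
    # Roll them up into aggregate metrics for an EvalResult.
    mean_acc = sum(r.score_data["accuracy"] for r in per_sample) / len(per_sample)
    result = EvalResult(metrics={"accuracy": f"{mean_acc:.4f}"})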
@@ -56,6 +63,75 @@ class EvaluationJobCreateResponse(BaseModel):

     job_uuid: str


+@json_schema_type
+class EvaluateTaskConfig(BaseModel):
+    # num examples to evaluate, evaluate all if None
+    n_samples: Optional[int] = None
+    # model evaluation params
+    sampling_params: SamplingParams = SamplingParams()
+
+
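For illustration, a caller might construct this config as follows (hypothetical values; SamplingParams and its fields come from the star-imported llama_models datatypes):

    # Evaluate only the first 100 examples; n_samples=None (the default)
    # would evaluate the whole dataset.
    config = EvaluateTaskConfig(
        n_samples=100,
        sampling_params=SamplingParams(temperature=0.0),
    )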
+class BaseTask(
+    ABC,
+    Generic[
+        TDatasetSample,
+        TPreprocessedSample,
+        TPredictionSample,
+        TPostprocessedSample,
+        TSingleEvalResult,
+    ],
+):
+    """
+    A task represents a single evaluation benchmark, including its dataset,
+    preprocessing, postprocessing, and scoring methods.
+    Base class for all evaluation tasks. Each task needs to implement the
+    following methods:
+    - F1: preprocess_sample(self)
+    - F2: postprocess_sample(self)
+    - F3: score_sample(self)
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self._name = self.__class__.__name__
+
+    @abstractmethod
+    def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def postprocess_sample(self, sample: TPredictionSample) -> TPostprocessedSample:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def score_sample(
+        self, sample: TPostprocessedSample, ground_truth: TPreprocessedSample
+    ) -> TSingleEvalResult:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        raise NotImplementedError()
+
+    def preprocess(
+        self, dataset: BaseDataset[TDatasetSample]
+    ) -> List[TPreprocessedSample]:
+        return [self.preprocess_sample(sample) for sample in dataset]
+
+    def postprocess(
+        self, generation: List[TPredictionSample]
+    ) -> List[TPostprocessedSample]:
+        return [self.postprocess_sample(sample) for sample in generation]
+
+    def score(
+        self,
+        postprocessed: List[TPostprocessedSample],
+        preprocessed_dataset: List[TPreprocessedSample],
+    ) -> List[TSingleEvalResult]:
+        return [
+            self.score_sample(sample, ground_truth)
+            for sample, ground_truth in zip(postprocessed, preprocessed_dataset)
+        ]
+
+
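A minimal concrete task could look like the sketch below (hypothetical names throughout; it assumes the TDatasetSample-style TypeVars and BaseDataset are defined elsewhere in this module's imports, and binds every sample slot to str for simplicity):

    class ExactMatchTask(BaseTask[str, str, str, str, SingleEvalResult]):
        """Toy benchmark: strip whitespace and score by exact match."""

        def preprocess_sample(self, sample: str) -> str:
            return sample.strip()

        def postprocess_sample(self, sample: str) -> str:
            return sample.strip().lower()

        def score_sample(self, sample: str, ground_truth: str) -> SingleEvalResult:
            # Compare the postprocessed prediction against the preprocessed
            # ground truth, as the inherited score() pairs them up.
            match = float(sample == ground_truth.lower())
            return SingleEvalResult(score_data={"exact_match": match})

        def aggregate_results(
            self, eval_results: List[SingleEvalResult]
        ) -> EvalResult:
            avg = sum(r.score_data["exact_match"] for r in eval_results) / max(
                len(eval_results), 1
            )
            return EvalResult(metrics={"exact_match": str(avg)})

The inherited helpers then give the whole flow: preprocessed = task.preprocess(dataset), generations come back from inference, postprocessed = task.postprocess(generations), and task.aggregate_results(task.score(postprocessed, preprocessed)) yields the final EvalResult.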
 class Evals(Protocol):
     @webmethod(route="/evals/run")
     async def run_evals(