From fdf251234ea7d11ced85ef188b9e7f910cb6b0c3 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Wed, 12 Mar 2025 00:59:23 -0700
Subject: [PATCH] eval job

---
 llama_stack/apis/eval/eval.py | 97 ++++++++++++++---------------------
 1 file changed, 39 insertions(+), 58 deletions(-)

diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 2bd35497f..f5b766f21 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -63,10 +63,6 @@ class BenchmarkConfig(BaseModel):
         description="Map between scoring function id and parameters for each scoring function you want to run",
         default_factory=dict,
     )
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
     # we could optinally add any specific dataset config here
 
 
@@ -107,65 +103,50 @@ class Eval(Protocol):
         """Run an evaluation on a benchmark.
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
+        :param candidate: The candidate to evaluate (a model or an agent), e.g.:
+            - {
+                "type": "model",
+                "model": "Llama-3.1-8B-Instruct",
+                "sampling_params": {...},
+                "system_message": "You are a helpful assistant.",
+            }
+            - {
+                "type": "agent",
+                "config": {...},
+            }
         :return: The job that was created to run the evaluation.
         """
 
-    # TODO: add these back in
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
-    # async def run_eval(
-    #     self,
-    #     benchmark_id: str,
-    #     benchmark_config: BenchmarkConfig,
-    # ) -> Job:
-    #     """Run an evaluation on a benchmark.
+    @webmethod(route="/eval/rows", method="POST")
+    async def evaluate_rows(
+        self,
+        dataset_rows: List[Dict[str, Any]],
+        scoring_functions: List[ScoringFnParams],
+        candidate: EvalCandidate,
+    ) -> EvaluateResponse:
+        """Evaluate a list of rows on a candidate.
 
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param benchmark_config: The configuration for the benchmark.
-    #     :return: The job that was created to run the evaluation.
-    #     """
+        :param dataset_rows: The rows to evaluate.
+        :param scoring_functions: The scoring functions to use for the evaluation.
+        :param candidate: The candidate to evaluate on.
+        :return: EvaluateResponse object containing generations and scores.
+        """
 
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
-    # async def evaluate_rows(
-    #     self,
-    #     benchmark_id: str,
-    #     input_rows: List[Dict[str, Any]],
-    #     scoring_functions: List[str],
-    #     benchmark_config: BenchmarkConfig,
-    # ) -> EvaluateResponse:
-    #     """Evaluate a list of rows on a benchmark.
+    @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def get_job(self, benchmark_id: str, job_id: str) -> Optional[EvalJob]:
+        """Get the EvalJob object for a given job id and benchmark id.
 
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param input_rows: The rows to evaluate.
-    #     :param scoring_functions: The scoring functions to use for the evaluation.
-    #     :param benchmark_config: The configuration for the benchmark.
-    #     :return: EvaluateResponse object containing generations and scores
-    #     """
+        :param benchmark_id: The ID of the benchmark the job belongs to.
+        :param job_id: The ID of the job to retrieve.
+        :return: The EvalJob object indicating the job's status (may be None).
+        """
+        ...
 
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    # async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
-    #     """Get the status of a job.
+    @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def cancel_job(self, benchmark_id: str, job_id: str) -> None:
+        """Cancel a job.
 
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to get the status of.
-    #     :return: The status of the evaluationjob.
-    #     """
-    #     ...
-
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
-    # async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-    #     """Cancel a job.
-
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to cancel.
-    #     """
-    #     ...
-
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
-    # async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-    #     """Get the result of a job.
-
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to get the result of.
-    #     :return: The result of the job.
-    #     """
+        :param benchmark_id: The ID of the benchmark the job belongs to.
+        :param job_id: The ID of the job to cancel.
+        """
+        ...