eval job

2025-12-31 07:59:59 +00:00 · 2025-03-12 00:59:23 -07:00 · 2025-03-12 00:59:23 -07:00 · fdf251234e
commit fdf251234e
parent ea0b29ae50
1 changed file with 39 additions and 58 deletions
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -63,10 +63,6 @@ class BenchmarkConfig(BaseModel):
        description="Map between scoring function id and parameters for each scoring function you want to run",
        default_factory=dict,
    )
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
    # we could optinally add any specific dataset config here


@@ -107,65 +103,50 @@ class Eval(Protocol):
        """Run an evaluation on a benchmark.

        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
+        :param candidate: Candidate to evaluate on.
+            - {
+                "type": "model",
+                "model": "Llama-3.1-8B-Instruct",
+                "sampling_params": {...},
+                "system_message": "You are a helpful assistant.",
+            }
+            - {
+                "type": "agent",
+                "config": {...},
+            }
        :return: The job that was created to run the evaluation.
        """

-    # TODO: add these back in
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
-    # async def run_eval(
-    #     self,
-    #     benchmark_id: str,
-    #     benchmark_config: BenchmarkConfig,
-    # ) -> Job:
-    #     """Run an evaluation on a benchmark.
+    @webmethod(route="/eval/rows", method="POST")
+    async def evaluate_rows(
+        self,
+        dataset_rows: List[Dict[str, Any]],
+        scoring_functions: List[ScoringFnParams],
+        candidate: EvalCandidate,
+    ) -> EvaluateResponse:
+        """Evaluate a list of rows on a candidate.

-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param benchmark_config: The configuration for the benchmark.
-    #     :return: The job that was created to run the evaluation.
-    #     """
+        :param dataset_rows: The rows to evaluate.
+        :param scoring_functions: The scoring functions to use for the evaluation.
+        :param candidate: The candidate to evaluate on.
+        :return: EvaluateResponse object containing generations and scores
+        """

-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
-    # async def evaluate_rows(
-    #     self,
-    #     benchmark_id: str,
-    #     input_rows: List[Dict[str, Any]],
-    #     scoring_functions: List[str],
-    #     benchmark_config: BenchmarkConfig,
-    # ) -> EvaluateResponse:
-    #     """Evaluate a list of rows on a benchmark.
+    @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def get_job(self, benchmark_id: str, job_id: str) -> Optional[EvalJob]:
+        """Get the EvalJob object for a given job id and benchmark id.

-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param input_rows: The rows to evaluate.
-    #     :param scoring_functions: The scoring functions to use for the evaluation.
-    #     :param benchmark_config: The configuration for the benchmark.
-    #     :return: EvaluateResponse object containing generations and scores
-    #     """
+        :param benchmark_id: The ID of the benchmark to run the evaluation on.
+        :param job_id: The ID of the job to get the status of.
+        :return: EvalJob object indicating its status
+        """
+        ...

-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    # async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
-    #     """Get the status of a job.
+    @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def cancel_job(self, benchmark_id: str, job_id: str) -> None:
+        """Cancel a job.

-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to get the status of.
-    #     :return: The status of the evaluationjob.
-    #     """
-    #     ...
-
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
-    # async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-    #     """Cancel a job.
-
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to cancel.
-    #     """
-    #     ...
-
-    # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
-    # async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-    #     """Get the result of a job.
-
-    #     :param benchmark_id: The ID of the benchmark to run the evaluation on.
-    #     :param job_id: The ID of the job to get the result of.
-    #     :return: The result of the job.
-    #     """
+        :param benchmark_id: The ID of the benchmark to run the evaluation on.
+        :param job_id: The ID of the job to cancel.
+        """
+        ...