From fdf251234ea7d11ced85ef188b9e7f910cb6b0c3 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 00:59:23 -0700 Subject: [PATCH] eval job --- llama_stack/apis/eval/eval.py | 97 ++++++++++++++--------------------- 1 file changed, 39 insertions(+), 58 deletions(-) diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 2bd35497f..f5b766f21 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -63,10 +63,6 @@ class BenchmarkConfig(BaseModel): description="Map between scoring function id and parameters for each scoring function you want to run", default_factory=dict, ) - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) # we could optinally add any specific dataset config here @@ -107,65 +103,50 @@ class Eval(Protocol): """Run an evaluation on a benchmark. :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param benchmark_config: The configuration for the benchmark. + :param candidate: Candidate to evaluate on. + - { + "type": "model", + "model": "Llama-3.1-8B-Instruct", + "sampling_params": {...}, + "system_message": "You are a helpful assistant.", + } + - { + "type": "agent", + "config": {...}, + } :return: The job that was created to run the evaluation. """ - # TODO: add these back in - # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") - # async def run_eval( - # self, - # benchmark_id: str, - # benchmark_config: BenchmarkConfig, - # ) -> Job: - # """Run an evaluation on a benchmark. + @webmethod(route="/eval/rows", method="POST") + async def evaluate_rows( + self, + dataset_rows: List[Dict[str, Any]], + scoring_functions: List[ScoringFnParams], + candidate: EvalCandidate, + ) -> EvaluateResponse: + """Evaluate a list of rows on a candidate. - # :param benchmark_id: The ID of the benchmark to run the evaluation on. - # :param benchmark_config: The configuration for the benchmark. - # :return: The job that was created to run the evaluation. - # """ + :param dataset_rows: The rows to evaluate. + :param scoring_functions: The scoring functions to use for the evaluation. + :param candidate: The candidate to evaluate on. + :return: EvaluateResponse object containing generations and scores + """ - # @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") - # async def evaluate_rows( - # self, - # benchmark_id: str, - # input_rows: List[Dict[str, Any]], - # scoring_functions: List[str], - # benchmark_config: BenchmarkConfig, - # ) -> EvaluateResponse: - # """Evaluate a list of rows on a benchmark. + @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="GET") + async def get_job(self, benchmark_id: str, job_id: str) -> Optional[EvalJob]: + """Get the EvalJob object for a given job id and benchmark id. - # :param benchmark_id: The ID of the benchmark to run the evaluation on. - # :param input_rows: The rows to evaluate. - # :param scoring_functions: The scoring functions to use for the evaluation. - # :param benchmark_config: The configuration for the benchmark. - # :return: EvaluateResponse object containing generations and scores - # """ + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param job_id: The ID of the job to get the status of. + :return: EvalJob object indicating its status + """ + ... - # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") - # async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: - # """Get the status of a job. + @webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def cancel_job(self, benchmark_id: str, job_id: str) -> None: + """Cancel a job. - # :param benchmark_id: The ID of the benchmark to run the evaluation on. - # :param job_id: The ID of the job to get the status of. - # :return: The status of the evaluationjob. - # """ - # ... - - # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") - # async def job_cancel(self, benchmark_id: str, job_id: str) -> None: - # """Cancel a job. - - # :param benchmark_id: The ID of the benchmark to run the evaluation on. - # :param job_id: The ID of the job to cancel. - # """ - # ... - - # @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") - # async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: - # """Get the result of a job. - - # :param benchmark_id: The ID of the benchmark to run the evaluation on. - # :param job_id: The ID of the job to get the result of. - # :return: The result of the job. - # """ + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param job_id: The ID of the job to cancel. + """ + ...