This commit is contained in:
Xi Yan 2025-03-12 00:59:23 -07:00
parent ea0b29ae50
commit fdf251234e

View file

@ -63,10 +63,6 @@ class BenchmarkConfig(BaseModel):
description="Map between scoring function id and parameters for each scoring function you want to run",
default_factory=dict,
)
num_examples: Optional[int] = Field(
description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
default=None,
)
# we could optionally add any specific dataset config here
@ -107,65 +103,50 @@ class Eval(Protocol):
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param benchmark_config: The configuration for the benchmark.
:param candidate: Candidate to evaluate on.
- {
"type": "model",
"model": "Llama-3.1-8B-Instruct",
"sampling_params": {...},
"system_message": "You are a helpful assistant.",
}
- {
"type": "agent",
"config": {...},
}
:return: The job that was created to run the evaluation.
"""
# TODO: add these back in
# @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
# async def run_eval(
# self,
# benchmark_id: str,
# benchmark_config: BenchmarkConfig,
# ) -> Job:
# """Run an evaluation on a benchmark.
@webmethod(route="/eval/rows", method="POST")
async def evaluate_rows(
self,
dataset_rows: List[Dict[str, Any]],
scoring_functions: List[ScoringFnParams],
candidate: EvalCandidate,
) -> EvaluateResponse:
"""Evaluate a list of rows on a candidate.
# :param benchmark_id: The ID of the benchmark to run the evaluation on.
# :param benchmark_config: The configuration for the benchmark.
# :return: The job that was created to run the evaluation.
# """
:param dataset_rows: The rows to evaluate.
:param scoring_functions: The scoring functions to use for the evaluation.
:param candidate: The candidate to evaluate on.
:return: EvaluateResponse object containing generations and scores
"""
# @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
# async def evaluate_rows(
# self,
# benchmark_id: str,
# input_rows: List[Dict[str, Any]],
# scoring_functions: List[str],
# benchmark_config: BenchmarkConfig,
# ) -> EvaluateResponse:
# """Evaluate a list of rows on a benchmark.
@webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="GET")
async def get_job(self, benchmark_id: str, job_id: str) -> Optional[EvalJob]:
"""Get the EvalJob object for a given job id and benchmark id.
# :param benchmark_id: The ID of the benchmark to run the evaluation on.
# :param input_rows: The rows to evaluate.
# :param scoring_functions: The scoring functions to use for the evaluation.
# :param benchmark_config: The configuration for the benchmark.
# :return: EvaluateResponse object containing generations and scores
# """
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to get the status of.
:return: EvalJob object indicating its status
"""
...
# @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
# async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
# """Get the status of a job.
@webmethod(route="/eval/benchmark/{benchmark_id}/jobs/{job_id}", method="DELETE")
async def cancel_job(self, benchmark_id: str, job_id: str) -> None:
"""Cancel a job.
# :param benchmark_id: The ID of the benchmark to run the evaluation on.
# :param job_id: The ID of the job to get the status of.
# :return: The status of the evaluation job.
# """
# ...
# @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
# async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
# """Cancel a job.
# :param benchmark_id: The ID of the benchmark to run the evaluation on.
# :param job_id: The ID of the job to cancel.
# """
# ...
# @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
# async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
# """Get the result of a job.
# :param benchmark_id: The ID of the benchmark to run the evaluation on.
# :param job_id: The ID of the job to get the result of.
# :return: The result of the job.
# """
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to cancel.
"""
...