separate benchmark / app eval

This commit is contained in:
Xi Yan 2024-11-05 16:54:31 -08:00
parent 979cd4cd44
commit 4a64f98c82
2 changed files with 10 additions and 18 deletions

View file

@ -38,47 +38,39 @@ EvalCandidate = Annotated[
# Configuration model for an eval task of type "benchmark".
# - `type` is pinned to the literal "benchmark" and defaults to it, so callers
#   may omit it; it is the tag that distinguishes this config from other
#   eval task config variants.
# - `eval_candidate` is typed with the EvalCandidate alias declared earlier in
#   this file.
@json_schema_type
class BenchmarkEvalTaskConfig(BaseModel):
type: Literal["benchmark"] = "benchmark"
eval_candidate: EvalCandidate # type: ignore
# Configuration model for an eval task of type "app".
# - `type` is pinned to the literal "app" and defaults to it.
# - `eval_candidate` is typed with the EvalCandidate alias declared earlier in
#   this file.
# - `scoring_functions_params` maps a scoring function id to its
#   ScoringFnParams; it is optional and falls back to an empty dict.
@json_schema_type
class AppEvalTaskConfig(BaseModel):
type: Literal["app"] = "app"
eval_candidate: EvalCandidate # type: ignore
# default_factory (rather than a literal {}) builds a fresh dict per instance
scoring_functions_params: Dict[str, ScoringFnParams] = Field( # type: ignore
description="Map between scoring function id and parameters",
default_factory=dict,
)
# we could optionally add any GenEval-specific dataset config here
# Tagged union of the two eval task config models. The "type" field is the
# discriminator: its value ("benchmark" or "app") determines which of the two
# models a given config is validated as.
EvalTaskConfig = Annotated[
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
]
# we could optionally add any specific dataset config here
# Response model that carries a list of generations together with the scoring
# results, keyed by scoring function name.
@json_schema_type
class EvaluateResponse(BaseModel):
# list of free-form dicts; NOTE(review): the per-entry key schema is not
# visible from this file section — confirm against the producer
generations: List[Dict[str, Any]]
# each key in the dict is a scoring function name
scores: Dict[str, ScoringResult]
class Eval(Protocol):
@webmethod(route="/eval/evaluate_batch", method="POST")
async def evaluate_task(
@webmethod(route="/eval/run_benchmark", method="POST")
async def run_benchmark(
self,
eval_task_id: str,
eval_task_config: EvalTaskConfig, # type: ignore
benchmark_id: str,
eval_task_config: BenchmarkEvalTaskConfig, # type: ignore
) -> Job: ...
@webmethod(route="/eval/evaluate_batch", method="POST")
async def evaluate_batch(
@webmethod(route="/eval/run_app_eval", method="POST")
async def run_app_eval(
self,
eval_task_def: Union[str, EvalTaskDef], # type: ignore
eval_task_config: EvalTaskConfig, # type: ignore
eval_task_def: EvalTaskDef, # type: ignore
eval_task_config: AppEvalTaskConfig, # type: ignore
) -> Job: ...
@webmethod(route="/eval/evaluate", method="POST")

View file

@ -17,7 +17,7 @@ class EvalTaskDef(BaseModel):
scoring_functions: List[str]
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task (e.g. from GECO)",
description="Metadata for this evaluation task",
)