mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-30 07:39:38 +00:00
separate benchmark / app eval
This commit is contained in:
parent
979cd4cd44
commit
4a64f98c82
2 changed files with 10 additions and 18 deletions
|
@@ -38,47 +38,39 @@ EvalCandidate = Annotated[
|
|||
|
||||
@json_schema_type
|
||||
class BenchmarkEvalTaskConfig(BaseModel):
|
||||
type: Literal["benchmark"] = "benchmark"
|
||||
eval_candidate: EvalCandidate # type: ignore
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class AppEvalTaskConfig(BaseModel):
|
||||
type: Literal["app"] = "app"
|
||||
eval_candidate: EvalCandidate # type: ignore
|
||||
scoring_functions_params: Dict[str, ScoringFnParams] = Field( # type: ignore
|
||||
description="Map between scoring function id and parameters",
|
||||
default_factory=dict,
|
||||
)
|
||||
    # we could optionally add any GenEval specific dataset config here
|
||||
|
||||
|
||||
EvalTaskConfig = Annotated[
|
||||
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
|
||||
]
|
||||
# we could optionally add any specific dataset config here
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class EvaluateResponse(BaseModel):
|
||||
generations: List[Dict[str, Any]]
|
||||
|
||||
# each key in the dict is a scoring function name
|
||||
scores: Dict[str, ScoringResult]
|
||||
|
||||
|
||||
class Eval(Protocol):
|
||||
@webmethod(route="/eval/evaluate_batch", method="POST")
|
||||
async def evaluate_task(
|
||||
@webmethod(route="/eval/run_benchmark", method="POST")
|
||||
async def run_benchmark(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
eval_task_config: EvalTaskConfig, # type: ignore
|
||||
benchmark_id: str,
|
||||
eval_task_config: BenchmarkEvalTaskConfig, # type: ignore
|
||||
) -> Job: ...
|
||||
|
||||
@webmethod(route="/eval/evaluate_batch", method="POST")
|
||||
async def evaluate_batch(
|
||||
@webmethod(route="/eval/run_app_eval", method="POST")
|
||||
async def run_app_eval(
|
||||
self,
|
||||
eval_task_def: Union[str, EvalTaskDef], # type: ignore
|
||||
eval_task_config: EvalTaskConfig, # type: ignore
|
||||
eval_task_def: EvalTaskDef, # type: ignore
|
||||
eval_task_config: AppEvalTaskConfig, # type: ignore
|
||||
) -> Job: ...
|
||||
|
||||
@webmethod(route="/eval/evaluate", method="POST")
|
||||
|
|
|
@@ -17,7 +17,7 @@ class EvalTaskDef(BaseModel):
|
|||
scoring_functions: List[str]
|
||||
metadata: Dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Metadata for this evaluation task (e.g. from GECO)",
|
||||
description="Metadata for this evaluation task",
|
||||
)
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue