mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-30 23:51:00 +00:00
separate benchmark / app eval
This commit is contained in:
parent
979cd4cd44
commit
4a64f98c82
2 changed files with 10 additions and 18 deletions
|
@ -38,47 +38,39 @@ EvalCandidate = Annotated[
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class BenchmarkEvalTaskConfig(BaseModel):
|
class BenchmarkEvalTaskConfig(BaseModel):
|
||||||
type: Literal["benchmark"] = "benchmark"
|
|
||||||
eval_candidate: EvalCandidate # type: ignore
|
eval_candidate: EvalCandidate # type: ignore
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class AppEvalTaskConfig(BaseModel):
|
class AppEvalTaskConfig(BaseModel):
|
||||||
type: Literal["app"] = "app"
|
|
||||||
eval_candidate: EvalCandidate # type: ignore
|
eval_candidate: EvalCandidate # type: ignore
|
||||||
scoring_functions_params: Dict[str, ScoringFnParams] = Field( # type: ignore
|
scoring_functions_params: Dict[str, ScoringFnParams] = Field( # type: ignore
|
||||||
description="Map between scoring function id and parameters",
|
description="Map between scoring function id and parameters",
|
||||||
default_factory=dict,
|
default_factory=dict,
|
||||||
)
|
)
|
||||||
# we could optionally add any GenEval specific dataset config here
|
# we could optionally add any specific dataset config here
|
||||||
|
|
||||||
|
|
||||||
EvalTaskConfig = Annotated[
|
|
||||||
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class EvaluateResponse(BaseModel):
|
class EvaluateResponse(BaseModel):
|
||||||
generations: List[Dict[str, Any]]
|
generations: List[Dict[str, Any]]
|
||||||
|
|
||||||
# each key in the dict is a scoring function name
|
# each key in the dict is a scoring function name
|
||||||
scores: Dict[str, ScoringResult]
|
scores: Dict[str, ScoringResult]
|
||||||
|
|
||||||
|
|
||||||
class Eval(Protocol):
|
class Eval(Protocol):
|
||||||
@webmethod(route="/eval/evaluate_batch", method="POST")
|
@webmethod(route="/eval/run_benchmark", method="POST")
|
||||||
async def evaluate_task(
|
async def run_benchmark(
|
||||||
self,
|
self,
|
||||||
eval_task_id: str,
|
benchmark_id: str,
|
||||||
eval_task_config: EvalTaskConfig, # type: ignore
|
eval_task_config: BenchmarkEvalTaskConfig, # type: ignore
|
||||||
) -> Job: ...
|
) -> Job: ...
|
||||||
|
|
||||||
@webmethod(route="/eval/evaluate_batch", method="POST")
|
@webmethod(route="/eval/run_app_eval", method="POST")
|
||||||
async def evaluate_batch(
|
async def run_app_eval(
|
||||||
self,
|
self,
|
||||||
eval_task_def: Union[str, EvalTaskDef], # type: ignore
|
eval_task_def: EvalTaskDef, # type: ignore
|
||||||
eval_task_config: EvalTaskConfig, # type: ignore
|
eval_task_config: AppEvalTaskConfig, # type: ignore
|
||||||
) -> Job: ...
|
) -> Job: ...
|
||||||
|
|
||||||
@webmethod(route="/eval/evaluate", method="POST")
|
@webmethod(route="/eval/evaluate", method="POST")
|
||||||
|
|
|
@ -17,7 +17,7 @@ class EvalTaskDef(BaseModel):
|
||||||
scoring_functions: List[str]
|
scoring_functions: List[str]
|
||||||
metadata: Dict[str, Any] = Field(
|
metadata: Dict[str, Any] = Field(
|
||||||
default_factory=dict,
|
default_factory=dict,
|
||||||
description="Metadata for this evaluation task (e.g. from GECO)",
|
description="Metadata for this evaluation task",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue