[Evals API][10/n] API updates for EvalTaskDef + new test migration (#379)

* wip

* scoring fn api

* eval api

* eval task

* evaluate api update

* pre commit

* unwrap context -> config

* config field doc

* typo

* naming fix

* separate benchmark / app eval

* api name

* rename

* wip tests

* wip

* datasetio test

* delete unused

* fixture

* scoring resolve

* fix scoring register

* scoring test pass

* score batch

* scoring fix

* fix eval

* test eval works

* remove type ignore

* api refactor

* add default task_eval_id for routing

* add eval_id for jobs

* remove type ignore

* only keep 1 run_eval

* fix optional

* register task required

* register task required

* delete old tests

* delete old tests

* fixture return impl
This commit is contained in:
Xi Yan 2024-11-07 21:24:12 -08:00 committed by GitHub
parent 8350f2df4c
commit 6192bf43a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 916 additions and 389 deletions

View file

@ -14,6 +14,7 @@ from llama_stack.apis.scoring_functions import * # noqa: F403
from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.scoring import * # noqa: F403
from llama_stack.apis.eval_tasks import * # noqa: F403
@json_schema_type
@ -35,36 +36,57 @@ EvalCandidate = Annotated[
]
@json_schema_type
class BenchmarkEvalTaskConfig(BaseModel):
type: Literal["benchmark"] = "benchmark"
eval_candidate: EvalCandidate
@json_schema_type
class AppEvalTaskConfig(BaseModel):
type: Literal["app"] = "app"
eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run",
default_factory=dict,
)
# we could optinally add any specific dataset config here
EvalTaskConfig = Annotated[
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
]
@json_schema_type
class EvaluateResponse(BaseModel):
generations: List[Dict[str, Any]]
# each key in the dict is a scoring function name
scores: Dict[str, ScoringResult]
class Eval(Protocol):
@webmethod(route="/eval/evaluate_batch", method="POST")
async def evaluate_batch(
@webmethod(route="/eval/run_eval", method="POST")
async def run_eval(
self,
dataset_id: str,
candidate: EvalCandidate,
scoring_functions: List[str],
task_id: str,
task_config: EvalTaskConfig,
) -> Job: ...
@webmethod(route="/eval/evaluate", method="POST")
async def evaluate(
@webmethod(route="/eval/evaluate_rows", method="POST")
async def evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
candidate: EvalCandidate,
scoring_functions: List[str],
task_config: EvalTaskConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/job/status", method="GET")
async def job_status(self, job_id: str) -> Optional[JobStatus]: ...
async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/job/cancel", method="POST")
async def job_cancel(self, job_id: str) -> None: ...
async def job_cancel(self, task_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/job/result", method="GET")
async def job_result(self, job_id: str) -> EvaluateResponse: ...
async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .eval_tasks import * # noqa: F401 F403

View file

@ -0,0 +1,43 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
@json_schema_type
class EvalTaskDef(BaseModel):
identifier: str
dataset_id: str
scoring_functions: List[str]
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task",
)
@json_schema_type
class EvalTaskDefWithProvider(EvalTaskDef):
type: Literal["eval_task"] = "eval_task"
provider_id: str = Field(
description="ID of the provider which serves this dataset",
)
@runtime_checkable
class EvalTasks(Protocol):
@webmethod(route="/eval_tasks/list", method="GET")
async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]: ...
@webmethod(route="/eval_tasks/get", method="GET")
async def get_eval_task(self, name: str) -> Optional[EvalTaskDefWithProvider]: ...
@webmethod(route="/eval_tasks/register", method="POST")
async def register_eval_task(
self, eval_task_def: EvalTaskDefWithProvider
) -> None: ...

View file

@ -48,11 +48,13 @@ class Scoring(Protocol):
async def score_batch(
self,
dataset_id: str,
scoring_functions: List[str],
scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
save_results_dataset: bool = False,
) -> ScoreBatchResponse: ...
@webmethod(route="/scoring/score")
async def score(
self, input_rows: List[Dict[str, Any]], scoring_functions: List[str]
self,
input_rows: List[Dict[str, Any]],
scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
) -> ScoreResponse: ...

View file

@ -4,34 +4,66 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from enum import Enum
from typing import (
Any,
Dict,
List,
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
)
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Annotated
from llama_stack.apis.common.type_system import ParamType
@json_schema_type
class Parameter(BaseModel):
name: str
type: ParamType
description: Optional[str] = None
# Perhaps more structure can be imposed on these functions. Maybe they could be associated
# with standard metrics so they can be rolled up?
@json_schema_type
class ScoringConfigType(Enum):
llm_as_judge = "llm_as_judge"
regex_parser = "regex_parser"
class LLMAsJudgeContext(BaseModel):
@json_schema_type
class LLMAsJudgeScoringFnParams(BaseModel):
type: Literal[ScoringConfigType.llm_as_judge.value] = (
ScoringConfigType.llm_as_judge.value
)
judge_model: str
prompt_template: Optional[str] = None
judge_score_regex: Optional[List[str]] = Field(
description="Regex to extract the score from the judge response",
default=None,
judge_score_regexes: Optional[List[str]] = Field(
description="Regexes to extract the answer from generated response",
default_factory=list,
)
@json_schema_type
class RegexParserScoringFnParams(BaseModel):
type: Literal[ScoringConfigType.regex_parser.value] = (
ScoringConfigType.regex_parser.value
)
parsing_regexes: Optional[List[str]] = Field(
description="Regex to extract the answer from generated response",
default_factory=list,
)
ScoringFnParams = Annotated[
Union[
LLMAsJudgeScoringFnParams,
RegexParserScoringFnParams,
],
Field(discriminator="type"),
]
@json_schema_type
class ScoringFnDef(BaseModel):
identifier: str
@ -40,14 +72,13 @@ class ScoringFnDef(BaseModel):
default_factory=dict,
description="Any additional metadata for this definition",
)
parameters: List[Parameter] = Field(
description="List of parameters for the deterministic function",
default_factory=list,
)
return_type: ParamType = Field(
description="The return type of the deterministic function",
)
context: Optional[LLMAsJudgeContext] = None
params: Optional[ScoringFnParams] = Field(
description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
default=None,
)
# We can optionally add information here to support packaging of code, etc.