forked from phoenix-oss/llama-stack-mirror
[Evals API][10/n] API updates for EvalTaskDef + new test migration (#379)
* wip
* scoring fn api
* eval api
* eval task
* evaluate api update
* pre commit
* unwrap context -> config
* config field doc
* typo
* naming fix
* separate benchmark / app eval
* api name
* rename
* wip tests
* wip
* datasetio test
* delete unused
* fixture
* scoring resolve
* fix scoring register
* scoring test pass
* score batch
* scoring fix
* fix eval
* test eval works
* remove type ignore
* api refactor
* add default task_eval_id for routing
* add eval_id for jobs
* remove type ignore
* only keep 1 run_eval
* fix optional
* register task required
* register task required
* delete old tests
* delete old tests
* fixture return impl
This commit is contained in:
parent 8350f2df4c
commit 6192bf43a4

32 changed files with 916 additions and 389 deletions
@@ -14,6 +14,7 @@ from llama_stack.apis.scoring_functions import *  # noqa: F403
 from llama_stack.apis.agents import AgentConfig
 from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.scoring import *  # noqa: F403
+from llama_stack.apis.eval_tasks import *  # noqa: F403


 @json_schema_type
@@ -35,36 +36,57 @@ EvalCandidate = Annotated[
 ]


+@json_schema_type
+class BenchmarkEvalTaskConfig(BaseModel):
+    type: Literal["benchmark"] = "benchmark"
+    eval_candidate: EvalCandidate
+
+
+@json_schema_type
+class AppEvalTaskConfig(BaseModel):
+    type: Literal["app"] = "app"
+    eval_candidate: EvalCandidate
+    scoring_params: Dict[str, ScoringFnParams] = Field(
+        description="Map between scoring function id and parameters for each scoring function you want to run",
+        default_factory=dict,
+    )
+    # we could optionally add any specific dataset config here
+
+
+EvalTaskConfig = Annotated[
+    Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
+]
+
+
 @json_schema_type
 class EvaluateResponse(BaseModel):
     generations: List[Dict[str, Any]]

     # each key in the dict is a scoring function name
     scores: Dict[str, ScoringResult]


 class Eval(Protocol):
-    @webmethod(route="/eval/evaluate_batch", method="POST")
-    async def evaluate_batch(
+    @webmethod(route="/eval/run_eval", method="POST")
+    async def run_eval(
         self,
-        dataset_id: str,
-        candidate: EvalCandidate,
-        scoring_functions: List[str],
+        task_id: str,
+        task_config: EvalTaskConfig,
     ) -> Job: ...

-    @webmethod(route="/eval/evaluate", method="POST")
-    async def evaluate(
+    @webmethod(route="/eval/evaluate_rows", method="POST")
+    async def evaluate_rows(
         self,
+        task_id: str,
         input_rows: List[Dict[str, Any]],
-        candidate: EvalCandidate,
         scoring_functions: List[str],
+        task_config: EvalTaskConfig,
     ) -> EvaluateResponse: ...

     @webmethod(route="/eval/job/status", method="GET")
-    async def job_status(self, job_id: str) -> Optional[JobStatus]: ...
+    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...

     @webmethod(route="/eval/job/cancel", method="POST")
-    async def job_cancel(self, job_id: str) -> None: ...
+    async def job_cancel(self, task_id: str, job_id: str) -> None: ...

     @webmethod(route="/eval/job/result", method="GET")
-    async def job_result(self, job_id: str) -> EvaluateResponse: ...
+    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
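For orientation, here is a rough, hedged sketch of how a caller might drive the reshaped Eval protocol: the eval_impl handle, the task id, the candidate, and the Job/JobStatus field names are assumptions for illustration (an object implementing Eval plus an EvalCandidate built elsewhere), not values introduced by this PR.

# Hedged sketch only; not part of this diff.
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.apis.eval import AppEvalTaskConfig


async def run_registered_task(eval_impl, candidate) -> None:
    # scoring_params defaults to {}; per-function overrides can be added later
    task_config = AppEvalTaskConfig(eval_candidate=candidate)

    # run_eval now routes by a registered task_id instead of taking a
    # dataset_id and scoring_functions directly.
    job = await eval_impl.run_eval(
        task_id="my-app-eval-task",  # illustrative id, registered beforehand
        task_config=task_config,
    )

    # job endpoints are now scoped by task_id as well as job_id
    # (job.job_id and JobStatus.completed are assumed from the job_types module)
    status = await eval_impl.job_status(task_id="my-app-eval-task", job_id=job.job_id)
    if status == JobStatus.completed:
        response = await eval_impl.job_result(
            task_id="my-app-eval-task", job_id=job.job_id
        )
        print(response.scores)  # Dict[str, ScoringResult], keyed by scoring function id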
llama_stack/apis/eval_tasks/__init__.py (new file, 7 lines)

@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .eval_tasks import *  # noqa: F401 F403
llama_stack/apis/eval_tasks/eval_tasks.py (new file, 43 lines)

@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+
+from pydantic import BaseModel, Field
+
+
+@json_schema_type
+class EvalTaskDef(BaseModel):
+    identifier: str
+    dataset_id: str
+    scoring_functions: List[str]
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata for this evaluation task",
+    )
+
+
+@json_schema_type
+class EvalTaskDefWithProvider(EvalTaskDef):
+    type: Literal["eval_task"] = "eval_task"
+    provider_id: str = Field(
+        description="ID of the provider which serves this dataset",
+    )
+
+
+@runtime_checkable
+class EvalTasks(Protocol):
+    @webmethod(route="/eval_tasks/list", method="GET")
+    async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]: ...
+
+    @webmethod(route="/eval_tasks/get", method="GET")
+    async def get_eval_task(self, name: str) -> Optional[EvalTaskDefWithProvider]: ...
+
+    @webmethod(route="/eval_tasks/register", method="POST")
+    async def register_eval_task(
+        self, eval_task_def: EvalTaskDefWithProvider
+    ) -> None: ...
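As a usage sketch, the new EvalTasks registry could be exercised as below; the task identifier, dataset id, scoring function name, and provider id are illustrative placeholders, and eval_tasks_impl is assumed to be a resolved implementation of the EvalTasks protocol.

# Hedged sketch; ids below are placeholders, not values introduced by this PR.
from llama_stack.apis.eval_tasks import EvalTaskDefWithProvider, EvalTasks


async def register_example_task(eval_tasks_impl: EvalTasks) -> None:
    task_def = EvalTaskDefWithProvider(
        identifier="my-app-eval-task",
        dataset_id="my-eval-dataset",
        scoring_functions=["my-provider::answer-equality"],
        metadata={"split": "validation"},
        provider_id="my-eval-provider",
    )
    # per the commit message, registering the task is now required before
    # run_eval can route requests by task_id
    await eval_tasks_impl.register_eval_task(eval_task_def=task_def)

    fetched = await eval_tasks_impl.get_eval_task(name="my-app-eval-task")
    assert fetched is not None and fetched.dataset_id == "my-eval-dataset"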
@@ -48,11 +48,13 @@ class Scoring(Protocol):
     async def score_batch(
         self,
         dataset_id: str,
-        scoring_functions: List[str],
+        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
         save_results_dataset: bool = False,
     ) -> ScoreBatchResponse: ...

     @webmethod(route="/scoring/score")
     async def score(
-        self, input_rows: List[Dict[str, Any]], scoring_functions: List[str]
+        self,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
     ) -> ScoreResponse: ...
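A short sketch of the new call shape for score(): scoring function ids now map to optional per-call ScoringFnParams overrides rather than being passed as a plain list. The scoring_impl handle, the row fields, and the scoring function ids are assumptions for illustration.

# Hedged sketch; scoring_impl is assumed to implement the Scoring protocol above.
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams


async def score_some_rows(scoring_impl):
    rows = [
        {"input_query": "What is 2 + 2?", "generated_answer": "4", "expected_answer": "4"},
    ]
    response = await scoring_impl.score(
        input_rows=rows,
        scoring_functions={
            # None -> use the parameters registered with the scoring function
            "my-provider::answer-equality": None,
            # or override parameters for this call only
            "my-provider::llm-as-judge": LLMAsJudgeScoringFnParams(
                judge_model="Llama3.1-8B-Instruct",  # illustrative judge model
                judge_score_regexes=[r"Score:\s*(\d+)"],
            ),
        },
    )
    return response  # ScoreResponse with per-scoring-function results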
@@ -4,34 +4,66 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+from enum import Enum
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    runtime_checkable,
+    Union,
+)

 from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
+from typing_extensions import Annotated

 from llama_stack.apis.common.type_system import ParamType


 @json_schema_type
 class Parameter(BaseModel):
     name: str
     type: ParamType
     description: Optional[str] = None


 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
+@json_schema_type
+class ScoringConfigType(Enum):
+    llm_as_judge = "llm_as_judge"
+    regex_parser = "regex_parser"
+
+
-class LLMAsJudgeContext(BaseModel):
+@json_schema_type
+class LLMAsJudgeScoringFnParams(BaseModel):
+    type: Literal[ScoringConfigType.llm_as_judge.value] = (
+        ScoringConfigType.llm_as_judge.value
+    )
     judge_model: str
     prompt_template: Optional[str] = None
-    judge_score_regex: Optional[List[str]] = Field(
-        description="Regex to extract the score from the judge response",
-        default=None,
+    judge_score_regexes: Optional[List[str]] = Field(
+        description="Regexes to extract the answer from generated response",
+        default_factory=list,
     )


+@json_schema_type
+class RegexParserScoringFnParams(BaseModel):
+    type: Literal[ScoringConfigType.regex_parser.value] = (
+        ScoringConfigType.regex_parser.value
+    )
+    parsing_regexes: Optional[List[str]] = Field(
+        description="Regex to extract the answer from generated response",
+        default_factory=list,
+    )
+
+
+ScoringFnParams = Annotated[
+    Union[
+        LLMAsJudgeScoringFnParams,
+        RegexParserScoringFnParams,
+    ],
+    Field(discriminator="type"),
+]


 @json_schema_type
 class ScoringFnDef(BaseModel):
     identifier: str
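The two parameter classes form a discriminated union on their type literal, so pydantic can route a serialized payload back to the concrete class. A minimal sketch, with illustrative model names and regexes:

# Hedged sketch of constructing the new ScoringFnParams variants.
from llama_stack.apis.scoring_functions import (
    LLMAsJudgeScoringFnParams,
    RegexParserScoringFnParams,
)

judge_params = LLMAsJudgeScoringFnParams(
    judge_model="Llama3.1-405B-Instruct",  # illustrative
    judge_score_regexes=[r"Total rating:\s*(\d+)"],
)
assert judge_params.type == "llm_as_judge"  # discriminator set by the Literal default

regex_params = RegexParserScoringFnParams(
    parsing_regexes=[r"Answer:\s*([A-D])"],
)
assert regex_params.type == "regex_parser"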
@@ -40,14 +72,13 @@ class ScoringFnDef(BaseModel):
         default_factory=dict,
         description="Any additional metadata for this definition",
     )
-    parameters: List[Parameter] = Field(
-        description="List of parameters for the deterministic function",
-        default_factory=list,
-    )
     return_type: ParamType = Field(
         description="The return type of the deterministic function",
     )
-    context: Optional[LLMAsJudgeContext] = None
+    params: Optional[ScoringFnParams] = Field(
+        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
+        default=None,
+    )
     # We can optionally add information here to support packaging of code, etc.
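To illustrate the context -> params switch, a scoring function definition might now carry its judge parameters directly. In this hedged sketch the identifier is a placeholder and NumberType is assumed to be available from the common type system module imported by this file.

# Hedged sketch; assumes NumberType exists in llama_stack.apis.common.type_system.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFnDef

judge_fn = ScoringFnDef(
    identifier="my-provider::llm-as-judge",  # illustrative id
    return_type=NumberType(),
    params=LLMAsJudgeScoringFnParams(  # replaces the old context=LLMAsJudgeContext(...)
        judge_model="Llama3.1-8B-Instruct",
        judge_score_regexes=[r"Score:\s*(\d+)"],
    ),
)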