This commit is contained in:
Xi Yan 2025-03-18 15:01:41 -07:00
parent a8b0467ec3
commit a69759613a
7 changed files with 2486 additions and 389 deletions

View file

@ -62,7 +62,7 @@ class Benchmarks(Protocol):
"""
Register a new benchmark.
:param dataset_id: The ID of the dataset to used to run the benchmark.
:param dataset_id: The ID of the dataset to be used to run the benchmark.
:param grader_ids: List of grader ids to use for this benchmark.
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
@ -87,3 +87,10 @@ class Benchmarks(Protocol):
:param benchmark_id: The ID of the benchmark to get.
"""
...
@webmethod(route="/benchmarks/{benchmark_id}", method="DELETE")
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""
Unregister a benchmark by ID.
:param benchmark_id: The ID of the benchmark to unregister.
"""
...

View file

@ -5,7 +5,6 @@
# the root directory of this source tree.
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel
@ -38,12 +37,12 @@ class CommonJobFields(BaseModel):
:param id: The ID of the job.
:param status: The status of the job.
:param created_at: The time the job was created.
:param ended_at: The time the job ended.
:param completed_at: The time the job completed.
:param error: If status of the job is failed, this will contain the error message.
"""
id: str
status: JobStatus
created_at: datetime
ended_at: Optional[datetime] = None
error: Optional[str] = None
completed_at: datetime | None = None
error: str | None = None

View file

@ -48,28 +48,28 @@ EvaluationCandidate = register_schema(
@json_schema_type
class BenchmarkTask(BaseModel):
type: Literal["benchmark_id"] = "benchmark_id"
class BenchmarkEvaluationTask(BaseModel):
type: Literal["benchmark"] = "benchmark"
benchmark_id: str
@json_schema_type
class DatasetGraderTask(BaseModel):
type: Literal["dataset_grader"] = "dataset_grader"
class DatasetEvaluationTask(BaseModel):
type: Literal["dataset"] = "dataset"
dataset_id: str
grader_ids: List[str]
@json_schema_type
class DataSourceGraderTask(BaseModel):
type: Literal["data_source_grader"] = "data_source_grader"
class DataEvaluationTask(BaseModel):
type: Literal["data"] = "data"
data_source: DataSource
grader_ids: List[str]
EvaluationTask = register_schema(
Annotated[
Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
Field(discriminator="type"),
],
name="EvaluationTask",

View file

@ -29,6 +29,13 @@ from .graders import * # noqa: F401 F403
class GraderType(Enum):
"""
A type of grader. Each type is a criterion for evaluating answers.
:cvar llm: Use an LLM to score the answer.
:cvar regex_parser: Use a regex parser to score the answer.
:cvar equality: Check if the answer is equal to the reference answer.
:cvar subset_of: Check if the answer is a subset of the reference answer.
:cvar factuality: Check if the answer is factually correct using LLM as judge.
:cvar faithfulness: Check if the answer is faithful to the reference answer using LLM as judge.
"""
llm = "llm"
@ -221,9 +228,9 @@ class Graders(Protocol):
...
@webmethod(route="/graders/{grader_id:path}", method="DELETE")
async def delete_grader(self, grader_id: str) -> None:
async def unregister_grader(self, grader_id: str) -> None:
"""
Delete a grader by ID.
Unregister a grader by ID.
:param grader_id: The ID of the grader.
"""
...