update EvaluationTask

This commit is contained in:
Xi Yan 2025-03-18 19:28:34 -07:00
parent 5e817cd56a
commit f107e3229b
3 changed files with 56 additions and 195 deletions

View file

@ -2035,7 +2035,7 @@
]
}
},
"/v1/evaluation/grading": {
"/v1/evaluation/grade": {
"post": {
"responses": {
"200": {
@ -8523,32 +8523,14 @@
],
"title": "VectorDB"
},
"BenchmarkEvaluationTask": {
"EvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "benchmark",
"default": "benchmark"
},
"benchmark_id": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"benchmark_id"
],
"title": "BenchmarkEvaluationTask"
},
"DataEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "data",
"default": "data"
},
"dataset_id": {
"type": "string"
},
"data_source": {
"$ref": "#/components/schemas/DataSource"
@ -8561,66 +8543,14 @@
}
},
"additionalProperties": false,
"required": [
"type",
"data_source",
"grader_ids"
],
"title": "DataEvaluationTask"
},
"DatasetEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "dataset",
"default": "dataset"
},
"dataset_id": {
"type": "string"
},
"grader_ids": {
"type": "array",
"items": {
"type": "string"
}
}
},
"additionalProperties": false,
"required": [
"type",
"dataset_id",
"grader_ids"
],
"title": "DatasetEvaluationTask"
},
"EvaluationTask": {
"oneOf": [
{
"$ref": "#/components/schemas/BenchmarkEvaluationTask"
},
{
"$ref": "#/components/schemas/DatasetEvaluationTask"
},
{
"$ref": "#/components/schemas/DataEvaluationTask"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"benchmark": "#/components/schemas/BenchmarkEvaluationTask",
"dataset": "#/components/schemas/DatasetEvaluationTask",
"data": "#/components/schemas/DataEvaluationTask"
}
}
"title": "EvaluationTask"
},
"GradeRequest": {
"type": "object",
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}
},
"additionalProperties": false,
@ -8706,7 +8636,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}
},
"additionalProperties": false,
@ -10737,7 +10667,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
},
"candidate": {
"$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10769,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
},
"candidate": {
"$ref": "#/components/schemas/EvaluationCandidate",

View file

@ -1385,7 +1385,7 @@ paths:
required: true
schema:
type: string
/v1/evaluation/grading:
/v1/evaluation/grade:
post:
responses:
'200':
@ -5903,27 +5903,13 @@ components:
- embedding_model
- embedding_dimension
title: VectorDB
BenchmarkEvaluationTask:
EvaluationTask:
type: object
properties:
type:
type: string
const: benchmark
default: benchmark
benchmark_id:
type: string
additionalProperties: false
required:
- type
- benchmark_id
title: BenchmarkEvaluationTask
DataEvaluationTask:
type: object
properties:
type:
dataset_id:
type: string
const: data
default: data
data_source:
$ref: '#/components/schemas/DataSource'
grader_ids:
@ -5931,52 +5917,18 @@ components:
items:
type: string
additionalProperties: false
required:
- type
- data_source
- grader_ids
title: DataEvaluationTask
DatasetEvaluationTask:
type: object
properties:
type:
type: string
const: dataset
default: dataset
dataset_id:
type: string
grader_ids:
type: array
items:
type: string
additionalProperties: false
required:
- type
- dataset_id
- grader_ids
title: DatasetEvaluationTask
EvaluationTask:
oneOf:
- $ref: '#/components/schemas/BenchmarkEvaluationTask'
- $ref: '#/components/schemas/DatasetEvaluationTask'
- $ref: '#/components/schemas/DataEvaluationTask'
discriminator:
propertyName: type
mapping:
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
dataset: '#/components/schemas/DatasetEvaluationTask'
data: '#/components/schemas/DataEvaluationTask'
title: EvaluationTask
GradeRequest:
type: object
properties:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
evaluation task against a data source (e.g. rows, uri, etc.) and a list
of grader_ids
The task to evaluate. To specify a task, one of the following must be
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false
required:
- task
@ -6040,11 +5992,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
evaluation task against a data source (e.g. rows, uri, etc.) and a list
of grader_ids
The task to evaluate. To specify a task, one of the following must be
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false
required:
- task
@ -7359,11 +7311,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
evaluation task against a data source (e.g. rows, uri, etc.) and a list
of grader_ids
The task to evaluate. To specify a task, one of the following must be
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate:
$ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate.
@ -7429,11 +7381,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
evaluation task against a data source (e.g. rows, uri, etc.) and a list
of grader_ids
The task to evaluate. To specify a task, one of the following must be
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate:
$ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate.

View file

@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(
@json_schema_type
class BenchmarkEvaluationTask(BaseModel):
type: Literal["benchmark"] = "benchmark"
benchmark_id: str
@json_schema_type
class DatasetEvaluationTask(BaseModel):
type: Literal["dataset"] = "dataset"
dataset_id: str
grader_ids: List[str]
@json_schema_type
class DataEvaluationTask(BaseModel):
type: Literal["data"] = "data"
data_source: DataSource
grader_ids: List[str]
EvaluationTask = register_schema(
Annotated[
Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
Field(discriminator="type"),
],
name="EvaluationTask",
)
class EvaluationTask(BaseModel):
benchmark_id: Optional[str] = None
dataset_id: Optional[str] = None
data_source: Optional[DataSource] = None
grader_ids: Optional[List[str]] = None
@json_schema_type
@ -121,10 +100,10 @@ class Evaluation(Protocol):
"""
Schedule a full evaluation job, by generating results using candidate and grading them.
:param task: The task to evaluate. One of:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param task: The task to evaluate. To specify a task, one of the following must be provided:
- `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate.
"""
...
@ -139,23 +118,23 @@ class Evaluation(Protocol):
Run an evaluation synchronously, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
:param task: The task to evaluate. One of:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param task: The task to evaluate. To specify a task, one of the following must be provided:
- `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate.
"""
...
@webmethod(route="/evaluation/grading", method="POST")
@webmethod(route="/evaluation/grade", method="POST")
async def grade(self, task: EvaluationTask) -> EvaluationJob:
"""
Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
:param task: The task to evaluate. One of:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param task: The task to evaluate. To specify a task, one of the following must be provided:
- `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores.
"""
@ -167,10 +146,10 @@ class Evaluation(Protocol):
Run grading synchronously on generated results, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
:param task: The task to evaluate. One of:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param task: The task to evaluate. To specify a task, one of the following must be provided:
- `benchmark_id`: Run evaluation task against a benchmark_id
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores. "generations" is not populated in the response.
"""