update EvaluationTask

This commit is contained in:
Xi Yan 2025-03-18 19:28:34 -07:00
parent 5e817cd56a
commit f107e3229b
3 changed files with 56 additions and 195 deletions

View file

@ -2035,7 +2035,7 @@
] ]
} }
}, },
"/v1/evaluation/grading": { "/v1/evaluation/grade": {
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
@ -8523,32 +8523,14 @@
], ],
"title": "VectorDB" "title": "VectorDB"
}, },
"BenchmarkEvaluationTask": { "EvaluationTask": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": {
"type": "string",
"const": "benchmark",
"default": "benchmark"
},
"benchmark_id": { "benchmark_id": {
"type": "string" "type": "string"
} },
}, "dataset_id": {
"additionalProperties": false, "type": "string"
"required": [
"type",
"benchmark_id"
],
"title": "BenchmarkEvaluationTask"
},
"DataEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "data",
"default": "data"
}, },
"data_source": { "data_source": {
"$ref": "#/components/schemas/DataSource" "$ref": "#/components/schemas/DataSource"
@ -8561,66 +8543,14 @@
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "title": "EvaluationTask"
"type",
"data_source",
"grader_ids"
],
"title": "DataEvaluationTask"
},
"DatasetEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "dataset",
"default": "dataset"
},
"dataset_id": {
"type": "string"
},
"grader_ids": {
"type": "array",
"items": {
"type": "string"
}
}
},
"additionalProperties": false,
"required": [
"type",
"dataset_id",
"grader_ids"
],
"title": "DatasetEvaluationTask"
},
"EvaluationTask": {
"oneOf": [
{
"$ref": "#/components/schemas/BenchmarkEvaluationTask"
},
{
"$ref": "#/components/schemas/DatasetEvaluationTask"
},
{
"$ref": "#/components/schemas/DataEvaluationTask"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"benchmark": "#/components/schemas/BenchmarkEvaluationTask",
"dataset": "#/components/schemas/DatasetEvaluationTask",
"data": "#/components/schemas/DataEvaluationTask"
}
}
}, },
"GradeRequest": { "GradeRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"task": { "task": {
"$ref": "#/components/schemas/EvaluationTask", "$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -8706,7 +8636,7 @@
"properties": { "properties": {
"task": { "task": {
"$ref": "#/components/schemas/EvaluationTask", "$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -10737,7 +10667,7 @@
"properties": { "properties": {
"task": { "task": {
"$ref": "#/components/schemas/EvaluationTask", "$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}, },
"candidate": { "candidate": {
"$ref": "#/components/schemas/EvaluationCandidate", "$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10769,7 @@
"properties": { "properties": {
"task": { "task": {
"$ref": "#/components/schemas/EvaluationTask", "$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}, },
"candidate": { "candidate": {
"$ref": "#/components/schemas/EvaluationCandidate", "$ref": "#/components/schemas/EvaluationCandidate",

View file

@ -1385,7 +1385,7 @@ paths:
required: true required: true
schema: schema:
type: string type: string
/v1/evaluation/grading: /v1/evaluation/grade:
post: post:
responses: responses:
'200': '200':
@ -5903,27 +5903,13 @@ components:
- embedding_model - embedding_model
- embedding_dimension - embedding_dimension
title: VectorDB title: VectorDB
BenchmarkEvaluationTask: EvaluationTask:
type: object type: object
properties: properties:
type:
type: string
const: benchmark
default: benchmark
benchmark_id: benchmark_id:
type: string type: string
additionalProperties: false dataset_id:
required:
- type
- benchmark_id
title: BenchmarkEvaluationTask
DataEvaluationTask:
type: object
properties:
type:
type: string type: string
const: data
default: data
data_source: data_source:
$ref: '#/components/schemas/DataSource' $ref: '#/components/schemas/DataSource'
grader_ids: grader_ids:
@ -5931,52 +5917,18 @@ components:
items: items:
type: string type: string
additionalProperties: false additionalProperties: false
required: title: EvaluationTask
- type
- data_source
- grader_ids
title: DataEvaluationTask
DatasetEvaluationTask:
type: object
properties:
type:
type: string
const: dataset
default: dataset
dataset_id:
type: string
grader_ids:
type: array
items:
type: string
additionalProperties: false
required:
- type
- dataset_id
- grader_ids
title: DatasetEvaluationTask
EvaluationTask:
oneOf:
- $ref: '#/components/schemas/BenchmarkEvaluationTask'
- $ref: '#/components/schemas/DatasetEvaluationTask'
- $ref: '#/components/schemas/DataEvaluationTask'
discriminator:
propertyName: type
mapping:
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
dataset: '#/components/schemas/DatasetEvaluationTask'
data: '#/components/schemas/DataEvaluationTask'
GradeRequest: GradeRequest:
type: object type: object
properties: properties:
task: task:
$ref: '#/components/schemas/EvaluationTask' $ref: '#/components/schemas/EvaluationTask'
description: >- description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation The task to evaluate. To specify a task, one of the following must be
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task provided: - `benchmark_id`: Run evaluation task against a benchmark_id
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
evaluation task against a data source (e.g. rows, uri, etc.) and a list and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
of grader_ids task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false additionalProperties: false
required: required:
- task - task
@ -6040,11 +5992,11 @@ components:
task: task:
$ref: '#/components/schemas/EvaluationTask' $ref: '#/components/schemas/EvaluationTask'
description: >- description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation The task to evaluate. To specify a task, one of the following must be
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task provided: - `benchmark_id`: Run evaluation task against a benchmark_id
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
evaluation task against a data source (e.g. rows, uri, etc.) and a list and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
of grader_ids task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false additionalProperties: false
required: required:
- task - task
@ -7359,11 +7311,11 @@ components:
task: task:
$ref: '#/components/schemas/EvaluationTask' $ref: '#/components/schemas/EvaluationTask'
description: >- description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation The task to evaluate. To specify a task, one of the following must be
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task provided: - `benchmark_id`: Run evaluation task against a benchmark_id
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
evaluation task against a data source (e.g. rows, uri, etc.) and a list and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
of grader_ids task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate: candidate:
$ref: '#/components/schemas/EvaluationCandidate' $ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate. description: The candidate to evaluate.
@ -7429,11 +7381,11 @@ components:
task: task:
$ref: '#/components/schemas/EvaluationTask' $ref: '#/components/schemas/EvaluationTask'
description: >- description: >-
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation The task to evaluate. To specify a task, one of the following must be
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task provided: - `benchmark_id`: Run evaluation task against a benchmark_id
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
evaluation task against a data source (e.g. rows, uri, etc.) and a list and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
of grader_ids task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate: candidate:
$ref: '#/components/schemas/EvaluationCandidate' $ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate. description: The candidate to evaluate.

View file

@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(
@json_schema_type @json_schema_type
class BenchmarkEvaluationTask(BaseModel): class EvaluationTask(BaseModel):
type: Literal["benchmark"] = "benchmark" benchmark_id: Optional[str] = None
benchmark_id: str dataset_id: Optional[str] = None
data_source: Optional[DataSource] = None
grader_ids: Optional[List[str]] = None
@json_schema_type
class DatasetEvaluationTask(BaseModel):
type: Literal["dataset"] = "dataset"
dataset_id: str
grader_ids: List[str]
@json_schema_type
class DataEvaluationTask(BaseModel):
type: Literal["data"] = "data"
data_source: DataSource
grader_ids: List[str]
EvaluationTask = register_schema(
Annotated[
Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
Field(discriminator="type"),
],
name="EvaluationTask",
)
@json_schema_type @json_schema_type
@ -121,10 +100,10 @@ class Evaluation(Protocol):
""" """
Schedule a full evaluation job, by generating results using candidate and grading them. Schedule a full evaluation job, by generating results using candidate and grading them.
:param task: The task to evaluate. One of: :param task: The task to evaluate. To specify a task, one of the following must be provided:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - `benchmark_id`: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate. :param candidate: The candidate to evaluate.
""" """
... ...
@ -139,23 +118,23 @@ class Evaluation(Protocol):
Run an evaluation synchronously, i.e., without scheduling a job. Run an evaluation synchronously, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
:param task: The task to evaluate. One of: :param task: The task to evaluate. To specify a task, one of the following must be provided:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - `benchmark_id`: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate. :param candidate: The candidate to evaluate.
""" """
... ...
@webmethod(route="/evaluation/grading", method="POST") @webmethod(route="/evaluation/grade", method="POST")
async def grade(self, task: EvaluationTask) -> EvaluationJob: async def grade(self, task: EvaluationTask) -> EvaluationJob:
""" """
Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset. Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
:param task: The task to evaluate. One of: :param task: The task to evaluate. To specify a task, one of the following must be provided:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - `benchmark_id`: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores. :return: The evaluation job containing grader scores.
""" """
@ -167,10 +146,10 @@ class Evaluation(Protocol):
Run grading synchronously on generated results, i.e., without scheduling a job. Run grading synchronously on generated results, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
:param task: The task to evaluate. One of: :param task: The task to evaluate. To specify a task, one of the following must be provided:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - `benchmark_id`: Run evaluation task against a benchmark_id
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores. "generations" is not populated in the response. :return: The evaluation job containing grader scores. "generations" is not populated in the response.
""" """