Merge branch 'eval_api_final' into delete_eval_scoring_scoring_fn

Xi Yan 2025-03-19 09:50:40 -07:00
commit e23531c9d0
4 changed files with 50 additions and 38 deletions

View file

@@ -7617,7 +7617,7 @@
      "EvaluationResponse": {
        "type": "object",
        "properties": {
-          "generations": {
+          "result_rows": {
            "type": "array",
            "items": {
              "type": "object",
@@ -7644,20 +7644,39 @@
                ]
              }
            },
-            "description": "The generations in rows for the evaluation."
+            "description": "The result data containing inputs, generations and grades in each row."
          },
-          "scores": {
+          "grades": {
            "type": "object",
            "additionalProperties": {
-              "$ref": "#/components/schemas/ScoringResult"
+              "oneOf": [
+                {
+                  "type": "null"
+                },
+                {
+                  "type": "boolean"
+                },
+                {
+                  "type": "number"
+                },
+                {
+                  "type": "string"
+                },
+                {
+                  "type": "array"
+                },
+                {
+                  "type": "object"
+                }
+              ]
            },
-            "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+            "description": "Map of grader id to aggregated value."
          }
        },
        "additionalProperties": false,
        "required": [
-          "generations",
-          "scores"
+          "result_rows",
+          "grades"
        ],
        "title": "EvaluationResponse",
        "description": "A response to an inline evaluation."
@@ -9313,14 +9332,14 @@
        "properties": {
          "dataset_id": {
            "type": "string",
-            "description": "The ID of the dataset to be used to run the benchmark."
+            "description": "The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`"
          },
          "grader_ids": {
            "type": "array",
            "items": {
              "type": "string"
            },
-            "description": "List of grader ids to use for this benchmark."
+            "description": "List of grader ids to use for this benchmark. ID obtained through `graders.register()`"
          },
          "benchmark_id": {
            "type": "string",

View file

@ -5328,7 +5328,7 @@ components:
EvaluationResponse: EvaluationResponse:
type: object type: object
properties: properties:
generations: result_rows:
type: array type: array
items: items:
type: object type: object
@@ -5341,17 +5341,22 @@ components:
              - type: array
              - type: object
        description: >-
-          The generations in rows for the evaluation.
-        scores:
+          The result data containing inputs, generations and grades in each row.
+        grades:
          type: object
          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            The scores for the evaluation. Map of grader id to ScoringResult.
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Map of grader id to aggregated value.
      additionalProperties: false
      required:
-        - generations
-        - scores
+        - result_rows
+        - grades
      title: EvaluationResponse
      description: A response to an inline evaluation.
    ScoringResult:
@@ -6404,13 +6409,14 @@ components:
        dataset_id:
          type: string
          description: >-
-            The ID of the dataset to be used to run the benchmark.
+            The ID of the dataset to be used to run the benchmark. ID obtained through
+            `datasets.register()`
        grader_ids:
          type: array
          items:
            type: string
          description: >-
-            List of grader ids to use for this benchmark.
+            List of grader ids to use for this benchmark. ID obtained through `graders.register()`
        benchmark_id:
          type: string
          description: >-

View file

@@ -62,8 +62,8 @@ class Benchmarks(Protocol):
        """
        Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.

-        :param dataset_id: The ID of the dataset to be used to run the benchmark.
-        :param grader_ids: List of grader ids to use for this benchmark.
+        :param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`
+        :param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()`
        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
        :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
        """

View file

@@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
    candidate: EvaluationCandidate


-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param scores: The scoring result for each row. Each row is a map of grader column name to value.
-    :param metrics: Map of metric name to aggregated value.
-    """
-
-    scores: List[Dict[str, Any]]
-    metrics: Dict[str, Any]
-
-
@json_schema_type
class EvaluationResponse(BaseModel):
    """
    A response to an inline evaluation.

-    :param generations: The generations in rows for the evaluation.
-    :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+    :param result_rows: The result data containing inputs, generations and grades in each row.
+    :param grades: Map of grader id to aggregated value.
    """

-    generations: List[Dict[str, Any]]
-    scores: Dict[str, ScoringResult]
+    result_rows: List[Dict[str, Any]]
+    grades: Dict[str, Any]


class Evaluation(Protocol):
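
Note: the deleted ScoringResult container split per-row scores from aggregated metrics; in the new layout, per-row grades travel inside `result_rows` and only the aggregate per grader goes in `grades`. The sketch below mirrors the renamed fields with a stand-in model (the real class lives in this module with `@json_schema_type` applied); the row contents and grader id are hypothetical.

from typing import Any, Dict, List

from pydantic import BaseModel


class EvaluationResponseSketch(BaseModel):
    """Stand-in for EvaluationResponse showing the new field layout."""

    result_rows: List[Dict[str, Any]]  # each row: inputs, generations, per-grader grades
    grades: Dict[str, Any]             # grader id -> single aggregated value


resp = EvaluationResponseSketch(
    result_rows=[
        {"input_query": "2 + 2?", "generated_answer": "4", "exact_match": 1.0},
        {"input_query": "3 + 5?", "generated_answer": "7", "exact_match": 0.0},
    ],
    grades={"exact_match": 0.5},
)
print(resp.grades["exact_match"])  # 0.5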