forked from phoenix-oss/llama-stack-mirror
grading
This commit is contained in:
parent
b98497ee56
commit
238cdc4e69
2 changed files with 221 additions and 207 deletions
228
docs/_static/llama-stack-spec.html
vendored
228
docs/_static/llama-stack-spec.html
vendored
|
@ -2035,49 +2035,6 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/evaluation/grade": {
|
|
||||||
"post": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "The evaluation job containing grader scores.",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/EvaluationJob"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Evaluation"
|
|
||||||
],
|
|
||||||
"description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.",
|
|
||||||
"parameters": [],
|
|
||||||
"requestBody": {
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/GradeRequest"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/evaluation/grade_sync": {
|
"/v1/evaluation/grade_sync": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -2107,7 +2064,7 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"Evaluation"
|
"Evaluation"
|
||||||
],
|
],
|
||||||
"description": "Run an grading job with generated results inline.",
|
"description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
|
||||||
"parameters": [],
|
"parameters": [],
|
||||||
"requestBody": {
|
"requestBody": {
|
||||||
"content": {
|
"content": {
|
||||||
|
@ -2121,6 +2078,49 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/evaluation/grading": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "The evaluation job containing grader scores.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/EvaluationJob"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Evaluation"
|
||||||
|
],
|
||||||
|
"description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/GradingRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/health": {
|
"/v1/health": {
|
||||||
"get": {
|
"get": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -2622,7 +2622,7 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"Benchmarks"
|
"Benchmarks"
|
||||||
],
|
],
|
||||||
"description": "Register a new benchmark.",
|
"description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.",
|
||||||
"parameters": [],
|
"parameters": [],
|
||||||
"requestBody": {
|
"requestBody": {
|
||||||
"content": {
|
"content": {
|
||||||
|
@ -3730,7 +3730,7 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"Evaluation"
|
"Evaluation"
|
||||||
],
|
],
|
||||||
"description": "Run an evaluation job.",
|
"description": "Schedule a full evaluation job, by generating results using candidate and grading them.",
|
||||||
"parameters": [],
|
"parameters": [],
|
||||||
"requestBody": {
|
"requestBody": {
|
||||||
"content": {
|
"content": {
|
||||||
|
@ -3869,7 +3869,7 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"Evaluation"
|
"Evaluation"
|
||||||
],
|
],
|
||||||
"description": "Run an evaluation job inline.",
|
"description": "Run an evaluation synchronously, i.e., without scheduling a job\". You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
|
||||||
"parameters": [],
|
"parameters": [],
|
||||||
"requestBody": {
|
"requestBody": {
|
||||||
"content": {
|
"content": {
|
||||||
|
@ -8615,19 +8615,81 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"GradeRequest": {
|
"GradeSyncRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"required": [
|
"required": [
|
||||||
"task"
|
"task"
|
||||||
],
|
],
|
||||||
"title": "GradeRequest"
|
"title": "GradeSyncRequest"
|
||||||
|
},
|
||||||
|
"EvaluationResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"generations": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "The generations in rows for the evaluation."
|
||||||
|
},
|
||||||
|
"scores": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"$ref": "#/components/schemas/ScoringResult"
|
||||||
|
},
|
||||||
|
"description": "The scores for the evaluation. Map of grader id to ScoringResult."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"generations",
|
||||||
|
"scores"
|
||||||
|
],
|
||||||
|
"title": "EvaluationResponse",
|
||||||
|
"description": "A response to an inline evaluation."
|
||||||
|
},
|
||||||
|
"GradingRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"task": {
|
||||||
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
|
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"task"
|
||||||
|
],
|
||||||
|
"title": "GradingRequest"
|
||||||
},
|
},
|
||||||
"EvaluationCandidate": {
|
"EvaluationCandidate": {
|
||||||
"oneOf": [
|
"oneOf": [
|
||||||
|
@ -8701,68 +8763,6 @@
|
||||||
],
|
],
|
||||||
"title": "EvaluationJob"
|
"title": "EvaluationJob"
|
||||||
},
|
},
|
||||||
"GradeSyncRequest": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"task": {
|
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
|
||||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"task"
|
|
||||||
],
|
|
||||||
"title": "GradeSyncRequest"
|
|
||||||
},
|
|
||||||
"EvaluationResponse": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"generations": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "array"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "object"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"description": "The generations in rows for the evaluation."
|
|
||||||
},
|
|
||||||
"scores": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"$ref": "#/components/schemas/ScoringResult"
|
|
||||||
},
|
|
||||||
"description": "The scores for the evaluation. Map of grader id to ScoringResult."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"generations",
|
|
||||||
"scores"
|
|
||||||
],
|
|
||||||
"title": "EvaluationResponse",
|
|
||||||
"description": "A response to an inline evaluation."
|
|
||||||
},
|
|
||||||
"HealthInfo": {
|
"HealthInfo": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -10737,7 +10737,7 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
},
|
},
|
||||||
"candidate": {
|
"candidate": {
|
||||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||||
|
@ -10839,7 +10839,7 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
},
|
},
|
||||||
"candidate": {
|
"candidate": {
|
||||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||||
|
|
200
docs/_static/llama-stack-spec.yaml
vendored
200
docs/_static/llama-stack-spec.yaml
vendored
|
@ -1385,38 +1385,6 @@ paths:
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
/v1/evaluation/grade:
|
|
||||||
post:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: >-
|
|
||||||
The evaluation job containing grader scores.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/EvaluationJob'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Evaluation
|
|
||||||
description: >-
|
|
||||||
Run an grading job with generated results. Use this when you have generated
|
|
||||||
results from inference in a dataset.
|
|
||||||
parameters: []
|
|
||||||
requestBody:
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/GradeRequest'
|
|
||||||
required: true
|
|
||||||
/v1/evaluation/grade_sync:
|
/v1/evaluation/grade_sync:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -1441,7 +1409,10 @@ paths:
|
||||||
tags:
|
tags:
|
||||||
- Evaluation
|
- Evaluation
|
||||||
description: >-
|
description: >-
|
||||||
Run an grading job with generated results inline.
|
Run grading synchronously on generated results, i.e., without scheduling a
|
||||||
|
job. You should use this for quick testing, or when the number of rows is
|
||||||
|
limited. Some implementations may have stricter restrictions on inputs which
|
||||||
|
will be accepted.
|
||||||
parameters: []
|
parameters: []
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
|
@ -1449,6 +1420,38 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/GradeSyncRequest'
|
$ref: '#/components/schemas/GradeSyncRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/evaluation/grading:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
The evaluation job containing grader scores.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/EvaluationJob'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Evaluation
|
||||||
|
description: >-
|
||||||
|
Schedule a grading job, by grading generated results. The generated results
|
||||||
|
are expected to be in the dataset.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/GradingRequest'
|
||||||
|
required: true
|
||||||
/v1/health:
|
/v1/health:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -1800,7 +1803,9 @@ paths:
|
||||||
$ref: '#/components/responses/DefaultError'
|
$ref: '#/components/responses/DefaultError'
|
||||||
tags:
|
tags:
|
||||||
- Benchmarks
|
- Benchmarks
|
||||||
description: Register a new benchmark.
|
description: >-
|
||||||
|
Register a new benchmark. A benchmark consists of a dataset id and a list
|
||||||
|
of grader ids.
|
||||||
parameters: []
|
parameters: []
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
|
@ -2566,7 +2571,9 @@ paths:
|
||||||
$ref: '#/components/responses/DefaultError'
|
$ref: '#/components/responses/DefaultError'
|
||||||
tags:
|
tags:
|
||||||
- Evaluation
|
- Evaluation
|
||||||
description: Run an evaluation job.
|
description: >-
|
||||||
|
Schedule a full evaluation job, by generating results using the candidate and
|
||||||
|
grading them.
|
||||||
parameters: []
|
parameters: []
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
|
@ -2661,7 +2668,10 @@ paths:
|
||||||
$ref: '#/components/responses/DefaultError'
|
$ref: '#/components/responses/DefaultError'
|
||||||
tags:
|
tags:
|
||||||
- Evaluation
|
- Evaluation
|
||||||
description: Run an evaluation job inline.
|
description: >-
|
||||||
|
Run an evaluation synchronously, i.e., without scheduling a job. You should
|
||||||
|
use this for quick testing, or when the number of rows is limited. Some implementations
|
||||||
|
may have stricter restrictions on inputs which will be accepted.
|
||||||
parameters: []
|
parameters: []
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
|
@ -5956,20 +5966,65 @@ components:
|
||||||
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
|
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
|
||||||
dataset: '#/components/schemas/DatasetEvaluationTask'
|
dataset: '#/components/schemas/DatasetEvaluationTask'
|
||||||
data: '#/components/schemas/DataEvaluationTask'
|
data: '#/components/schemas/DataEvaluationTask'
|
||||||
GradeRequest:
|
GradeSyncRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
|
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
||||||
a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
|
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
||||||
and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
|
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
||||||
a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
||||||
|
of grader_ids
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- task
|
- task
|
||||||
title: GradeRequest
|
title: GradeSyncRequest
|
||||||
|
EvaluationResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
generations:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: >-
|
||||||
|
The generations in rows for the evaluation.
|
||||||
|
scores:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
$ref: '#/components/schemas/ScoringResult'
|
||||||
|
description: >-
|
||||||
|
The scores for the evaluation. Map of grader id to ScoringResult.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- generations
|
||||||
|
- scores
|
||||||
|
title: EvaluationResponse
|
||||||
|
description: A response to an inline evaluation.
|
||||||
|
GradingRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
task:
|
||||||
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
|
description: >-
|
||||||
|
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
||||||
|
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
||||||
|
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
||||||
|
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
||||||
|
of grader_ids
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- task
|
||||||
|
title: GradingRequest
|
||||||
EvaluationCandidate:
|
EvaluationCandidate:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/ModelCandidate'
|
- $ref: '#/components/schemas/ModelCandidate'
|
||||||
|
@ -6023,49 +6078,6 @@ components:
|
||||||
- task
|
- task
|
||||||
- candidate
|
- candidate
|
||||||
title: EvaluationJob
|
title: EvaluationJob
|
||||||
GradeSyncRequest:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
task:
|
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
|
||||||
description: >-
|
|
||||||
The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
|
|
||||||
a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
|
|
||||||
and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
|
|
||||||
a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- task
|
|
||||||
title: GradeSyncRequest
|
|
||||||
EvaluationResponse:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
generations:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
oneOf:
|
|
||||||
- type: 'null'
|
|
||||||
- type: boolean
|
|
||||||
- type: number
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
- type: object
|
|
||||||
description: >-
|
|
||||||
The generations in rows for the evaluation.
|
|
||||||
scores:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
$ref: '#/components/schemas/ScoringResult'
|
|
||||||
description: >-
|
|
||||||
The scores for the evaluation. Map of grader id to ScoringResult.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- generations
|
|
||||||
- scores
|
|
||||||
title: EvaluationResponse
|
|
||||||
description: A response to an inline evaluation.
|
|
||||||
HealthInfo:
|
HealthInfo:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7347,10 +7359,11 @@ components:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
|
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
||||||
a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
|
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
||||||
and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
|
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
||||||
a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
||||||
|
of grader_ids
|
||||||
candidate:
|
candidate:
|
||||||
$ref: '#/components/schemas/EvaluationCandidate'
|
$ref: '#/components/schemas/EvaluationCandidate'
|
||||||
description: The candidate to evaluate.
|
description: The candidate to evaluate.
|
||||||
|
@ -7416,10 +7429,11 @@ components:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
|
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
||||||
a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
|
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
||||||
and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
|
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
||||||
a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
||||||
|
of grader_ids
|
||||||
candidate:
|
candidate:
|
||||||
$ref: '#/components/schemas/EvaluationCandidate'
|
$ref: '#/components/schemas/EvaluationCandidate'
|
||||||
description: The candidate to evaluate.
|
description: The candidate to evaluate.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue