forked from phoenix-oss/llama-stack-mirror
grading
This commit is contained in:
parent
b98497ee56
commit
238cdc4e69
2 changed files with 221 additions and 207 deletions
228
docs/_static/llama-stack-spec.html
vendored
228
docs/_static/llama-stack-spec.html
vendored
|
@ -2035,49 +2035,6 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/v1/evaluation/grade": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "The evaluation job containing grader scores.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/EvaluationJob"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Evaluation"
|
||||
],
|
||||
"description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/GradeRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/evaluation/grade_sync": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -2107,7 +2064,7 @@
|
|||
"tags": [
|
||||
"Evaluation"
|
||||
],
|
||||
"description": "Run an grading job with generated results inline.",
|
||||
"description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -2121,6 +2078,49 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/evaluation/grading": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "The evaluation job containing grader scores.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/EvaluationJob"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Evaluation"
|
||||
],
|
||||
"description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/GradingRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/health": {
|
||||
"get": {
|
||||
"responses": {
|
||||
|
@ -2622,7 +2622,7 @@
|
|||
"tags": [
|
||||
"Benchmarks"
|
||||
],
|
||||
"description": "Register a new benchmark.",
|
||||
"description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -3730,7 +3730,7 @@
|
|||
"tags": [
|
||||
"Evaluation"
|
||||
],
|
||||
"description": "Run an evaluation job.",
|
||||
"description": "Schedule a full evaluation job, by generating results using candidate and grading them.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -3869,7 +3869,7 @@
|
|||
"tags": [
|
||||
"Evaluation"
|
||||
],
|
||||
"description": "Run an evaluation job inline.",
|
||||
"description": "Run an evaluation synchronously, i.e., without scheduling a job\". You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -8615,19 +8615,81 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"GradeRequest": {
|
||||
"GradeSyncRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task": {
|
||||
"$ref": "#/components/schemas/EvaluationTask",
|
||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"task"
|
||||
],
|
||||
"title": "GradeRequest"
|
||||
"title": "GradeSyncRequest"
|
||||
},
|
||||
"EvaluationResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"generations": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"description": "The generations in rows for the evaluation."
|
||||
},
|
||||
"scores": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
},
|
||||
"description": "The scores for the evaluation. Map of grader id to ScoringResult."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"generations",
|
||||
"scores"
|
||||
],
|
||||
"title": "EvaluationResponse",
|
||||
"description": "A response to an inline evaluation."
|
||||
},
|
||||
"GradingRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task": {
|
||||
"$ref": "#/components/schemas/EvaluationTask",
|
||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"task"
|
||||
],
|
||||
"title": "GradingRequest"
|
||||
},
|
||||
"EvaluationCandidate": {
|
||||
"oneOf": [
|
||||
|
@ -8701,68 +8763,6 @@
|
|||
],
|
||||
"title": "EvaluationJob"
|
||||
},
|
||||
"GradeSyncRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task": {
|
||||
"$ref": "#/components/schemas/EvaluationTask",
|
||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"task"
|
||||
],
|
||||
"title": "GradeSyncRequest"
|
||||
},
|
||||
"EvaluationResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"generations": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"description": "The generations in rows for the evaluation."
|
||||
},
|
||||
"scores": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
},
|
||||
"description": "The scores for the evaluation. Map of grader id to ScoringResult."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"generations",
|
||||
"scores"
|
||||
],
|
||||
"title": "EvaluationResponse",
|
||||
"description": "A response to an inline evaluation."
|
||||
},
|
||||
"HealthInfo": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -10737,7 +10737,7 @@
|
|||
"properties": {
|
||||
"task": {
|
||||
"$ref": "#/components/schemas/EvaluationTask",
|
||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
},
|
||||
"candidate": {
|
||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||
|
@ -10839,7 +10839,7 @@
|
|||
"properties": {
|
||||
"task": {
|
||||
"$ref": "#/components/schemas/EvaluationTask",
|
||||
"description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||
},
|
||||
"candidate": {
|
||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue