From 238cdc4e69b26ce9cc89c06b1e7a1112af0787ce Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 18 Mar 2025 18:12:06 -0700 Subject: [PATCH] grading --- docs/_static/llama-stack-spec.html | 228 ++++++++++++++--------------- docs/_static/llama-stack-spec.yaml | 200 +++++++++++++------------ 2 files changed, 221 insertions(+), 207 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index cb5959e22..57f37255b 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2035,49 +2035,6 @@ ] } }, - "/v1/evaluation/grade": { - "post": { - "responses": { - "200": { - "description": "The evaluation job containing grader scores.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Evaluation" - ], - "description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GradeRequest" - } - } - }, - "required": true - } - } - }, "/v1/evaluation/grade_sync": { "post": { "responses": { @@ -2107,7 +2064,7 @@ "tags": [ "Evaluation" ], - "description": "Run an grading job with generated results inline.", + "description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. 
Some implementations may have stricter restrictions on inputs which will be accepted.", "parameters": [], "requestBody": { "content": { @@ -2121,6 +2078,49 @@ } } }, + "/v1/evaluation/grading": { + "post": { + "responses": { + "200": { + "description": "The evaluation job containing grader scores.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluationJob" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Evaluation" + ], + "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GradingRequest" + } + } + }, + "required": true + } + } + }, "/v1/health": { "get": { "responses": { @@ -2622,7 +2622,7 @@ "tags": [ "Benchmarks" ], - "description": "Register a new benchmark.", + "description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.", "parameters": [], "requestBody": { "content": { @@ -3730,7 +3730,7 @@ "tags": [ "Evaluation" ], - "description": "Run an evaluation job.", + "description": "Schedule a full evaluation job, by generating results using candidate and grading them.", "parameters": [], "requestBody": { "content": { @@ -3869,7 +3869,7 @@ "tags": [ "Evaluation" ], - "description": "Run an evaluation job inline.", + "description": "Run an evaluation synchronously, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. 
Some implementations may have stricter restrictions on inputs which will be accepted.", "parameters": [], "requestBody": { "content": { @@ -8615,19 +8615,81 @@ } } }, - "GradeRequest": { + "GradeSyncRequest": { "type": "object", "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" } }, "additionalProperties": false, "required": [ "task" ], - "title": "GradeRequest" + "title": "GradeSyncRequest" + }, + "EvaluationResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The generations in rows for the evaluation." + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + }, + "description": "The scores for the evaluation. Map of grader id to ScoringResult." + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ], + "title": "EvaluationResponse", + "description": "A response to an inline evaluation." 
+ }, + "GradingRequest": { + "type": "object", + "properties": { + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + } + }, + "additionalProperties": false, + "required": [ + "task" + ], + "title": "GradingRequest" }, "EvaluationCandidate": { "oneOf": [ @@ -8701,68 +8763,6 @@ ], "title": "EvaluationJob" }, - "GradeSyncRequest": { - "type": "object", - "properties": { - "task": { - "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" - } - }, - "additionalProperties": false, - "required": [ - "task" - ], - "title": "GradeSyncRequest" - }, - "EvaluationResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The generations in rows for the evaluation." - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "The scores for the evaluation. Map of grader id to ScoringResult." - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ], - "title": "EvaluationResponse", - "description": "A response to an inline evaluation." 
- }, "HealthInfo": { "type": "object", "properties": { @@ -10737,7 +10737,7 @@ "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" }, "candidate": { "$ref": "#/components/schemas/EvaluationCandidate", @@ -10839,7 +10839,7 @@ "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) 
and a list of grader_ids" }, "candidate": { "$ref": "#/components/schemas/EvaluationCandidate", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index ecc8104e1..60a8700f7 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1385,38 +1385,6 @@ paths: required: true schema: type: string - /v1/evaluation/grade: - post: - responses: - '200': - description: >- - The evaluation job containing grader scores. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Evaluation - description: >- - Run an grading job with generated results. Use this when you have generated - results from inference in a dataset. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/GradeRequest' - required: true /v1/evaluation/grade_sync: post: responses: @@ -1441,7 +1409,10 @@ paths: tags: - Evaluation description: >- - Run an grading job with generated results inline. + Run grading synchronously on generated results, i.e., without scheduling a + job. You should use this for quick testing, or when the number of rows is + limited. Some implementations may have stricter restrictions on inputs which + will be accepted. parameters: [] requestBody: content: @@ -1449,6 +1420,38 @@ paths: schema: $ref: '#/components/schemas/GradeSyncRequest' required: true + /v1/evaluation/grading: + post: + responses: + '200': + description: >- + The evaluation job containing grader scores. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/EvaluationJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Evaluation + description: >- + Schedule a grading job, by grading generated results. The generated results + are expected to be in the dataset. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/GradingRequest' + required: true /v1/health: get: responses: @@ -1800,7 +1803,9 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: Register a new benchmark. + description: >- + Register a new benchmark. A benchmark consists of a dataset id and a list + of grader ids. parameters: [] requestBody: content: @@ -2566,7 +2571,9 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Evaluation - description: Run an evaluation job. + description: >- + Schedule a full evaluation job, by generating results using candidate and + grading them. parameters: [] requestBody: content: @@ -2661,7 +2668,10 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Evaluation - description: Run an evaluation job inline. + description: >- + Run an evaluation synchronously, i.e., without scheduling a job. You should + use this for quick testing, or when the number of rows is limited. Some implementations + may have stricter restrictions on inputs which will be accepted. parameters: [] requestBody: content: @@ -5956,20 +5966,65 @@ components: benchmark: '#/components/schemas/BenchmarkEvaluationTask' dataset: '#/components/schemas/DatasetEvaluationTask' data: '#/components/schemas/DataEvaluationTask' - GradeRequest: + GradeSyncRequest: type: object properties: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. 
One of: - BenchmarkTask: Run evaluation task against - a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id - and a list of grader_ids - DataSourceGraderTask: Run evaluation task against - a data source (e.g. rows, uri, etc.) and a list of grader_ids + The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation + task against a benchmark_id - DatasetEvaluationTask: Run evaluation task + against a dataset_id and a list of grader_ids - DataEvaluationTask: Run + evaluation task against a data source (e.g. rows, uri, etc.) and a list + of grader_ids additionalProperties: false required: - task - title: GradeRequest + title: GradeSyncRequest + EvaluationResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The generations in rows for the evaluation. + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: >- + The scores for the evaluation. Map of grader id to ScoringResult. + additionalProperties: false + required: + - generations + - scores + title: EvaluationResponse + description: A response to an inline evaluation. + GradingRequest: + type: object + properties: + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation + task against a benchmark_id - DatasetEvaluationTask: Run evaluation task + against a dataset_id and a list of grader_ids - DataEvaluationTask: Run + evaluation task against a data source (e.g. rows, uri, etc.) 
and a list + of grader_ids + additionalProperties: false + required: + - task + title: GradingRequest EvaluationCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -6023,49 +6078,6 @@ components: - task - candidate title: EvaluationJob - GradeSyncRequest: - type: object - properties: - task: - $ref: '#/components/schemas/EvaluationTask' - description: >- - The task to evaluate. One of: - BenchmarkTask: Run evaluation task against - a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id - and a list of grader_ids - DataSourceGraderTask: Run evaluation task against - a data source (e.g. rows, uri, etc.) and a list of grader_ids - additionalProperties: false - required: - - task - title: GradeSyncRequest - EvaluationResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The generations in rows for the evaluation. - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - The scores for the evaluation. Map of grader id to ScoringResult. - additionalProperties: false - required: - - generations - - scores - title: EvaluationResponse - description: A response to an inline evaluation. HealthInfo: type: object properties: @@ -7347,10 +7359,11 @@ components: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkTask: Run evaluation task against - a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id - and a list of grader_ids - DataSourceGraderTask: Run evaluation task against - a data source (e.g. rows, uri, etc.) and a list of grader_ids + The task to evaluate. 
One of: - BenchmarkEvaluationTask: Run evaluation + task against a benchmark_id - DatasetEvaluationTask: Run evaluation task + against a dataset_id and a list of grader_ids - DataEvaluationTask: Run + evaluation task against a data source (e.g. rows, uri, etc.) and a list + of grader_ids candidate: $ref: '#/components/schemas/EvaluationCandidate' description: The candidate to evaluate. @@ -7416,10 +7429,11 @@ components: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkTask: Run evaluation task against - a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id - and a list of grader_ids - DataSourceGraderTask: Run evaluation task against - a data source (e.g. rows, uri, etc.) and a list of grader_ids + The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation + task against a benchmark_id - DatasetEvaluationTask: Run evaluation task + against a dataset_id and a list of grader_ids - DataEvaluationTask: Run + evaluation task against a data source (e.g. rows, uri, etc.) and a list + of grader_ids candidate: $ref: '#/components/schemas/EvaluationCandidate' description: The candidate to evaluate.