diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 57f37255b..16c21cbb1 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2035,6 +2035,49 @@ ] } }, + "/v1/evaluation/grading": { + "post": { + "responses": { + "200": { + "description": "The evaluation job containing grader scores.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluationJob" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Evaluation" + ], + "description": "Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GradeRequest" + } + } + }, + "required": true + } + } + }, "/v1/evaluation/grade_sync": { "post": { "responses": { @@ -2078,49 +2121,6 @@ } } }, - "/v1/evaluation/grading": { - "post": { - "responses": { - "200": { - "description": "The evaluation job containing grader scores.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Evaluation" - ], - "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GradingRequest" - } - } - }, - "required": true - } - } - }, "/v1/health": { "get": { "responses": { @@ -8615,7 +8615,7 @@ } } }, - "GradeSyncRequest": { + "GradeRequest": { "type": "object", "properties": { "task": { @@ -8627,69 +8627,7 @@ "required": [ "task" ], - "title": "GradeSyncRequest" - }, - "EvaluationResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The generations in rows for the evaluation." - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "The scores for the evaluation. Map of grader id to ScoringResult." - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ], - "title": "EvaluationResponse", - "description": "A response to an inline evaluation." - }, - "GradingRequest": { - "type": "object", - "properties": { - "task": { - "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" - } - }, - "additionalProperties": false, - "required": [ - "task" - ], - "title": "GradingRequest" + "title": "GradeRequest" }, "EvaluationCandidate": { "oneOf": [ @@ -8763,6 +8701,68 @@ ], "title": "EvaluationJob" }, + "GradeSyncRequest": { + "type": "object", + "properties": { + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + } + }, + "additionalProperties": false, + "required": [ + "task" + ], + "title": "GradeSyncRequest" + }, + "EvaluationResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The generations in rows for the evaluation." + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + }, + "description": "The scores for the evaluation. Map of grader id to ScoringResult." + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ], + "title": "EvaluationResponse", + "description": "A response to an inline evaluation." + }, "HealthInfo": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 60a8700f7..1711c93ec 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1385,6 +1385,38 @@ paths: required: true schema: type: string + /v1/evaluation/grading: + post: + responses: + '200': + description: >- + The evaluation job containing grader scores. + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluationJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Evaluation + description: >- + Schedule a grading job, by grading generated (model or agent) results. The + generated results are expected to be in the dataset. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/GradeRequest' + required: true /v1/evaluation/grade_sync: post: responses: @@ -1420,38 +1452,6 @@ paths: schema: $ref: '#/components/schemas/GradeSyncRequest' required: true - /v1/evaluation/grading: - post: - responses: - '200': - description: >- - The evaluation job containing grader scores. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Evaluation - description: >- - Schedule a grading job, by grading generated results. The generated results - are expected to be in the dataset. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/GradingRequest' - required: true /v1/health: get: responses: @@ -5966,7 +5966,7 @@ components: benchmark: '#/components/schemas/BenchmarkEvaluationTask' dataset: '#/components/schemas/DatasetEvaluationTask' data: '#/components/schemas/DataEvaluationTask' - GradeSyncRequest: + GradeRequest: type: object properties: task: @@ -5980,51 +5980,7 @@ components: additionalProperties: false required: - task - title: GradeSyncRequest - EvaluationResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The generations in rows for the evaluation. - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - The scores for the evaluation. Map of grader id to ScoringResult. - additionalProperties: false - required: - - generations - - scores - title: EvaluationResponse - description: A response to an inline evaluation. - GradingRequest: - type: object - properties: - task: - $ref: '#/components/schemas/EvaluationTask' - description: >- - The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation - task against a benchmark_id - DatasetEvaluationTask: Run evaluation task - against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - evaluation task against a data source (e.g. rows, uri, etc.) and a list - of grader_ids - additionalProperties: false - required: - - task - title: GradingRequest + title: GradeRequest EvaluationCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -6078,6 +6034,50 @@ components: - task - candidate title: EvaluationJob + GradeSyncRequest: + type: object + properties: + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation + task against a benchmark_id - DatasetEvaluationTask: Run evaluation task + against a dataset_id and a list of grader_ids - DataEvaluationTask: Run + evaluation task against a data source (e.g. rows, uri, etc.) and a list + of grader_ids + additionalProperties: false + required: + - task + title: GradeSyncRequest + EvaluationResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The generations in rows for the evaluation. + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: >- + The scores for the evaluation. Map of grader id to ScoringResult. + additionalProperties: false + required: + - generations + - scores + title: EvaluationResponse + description: A response to an inline evaluation. HealthInfo: type: object properties: diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 8425000da..6f6a27041 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -148,9 +148,9 @@ class Evaluation(Protocol): ... @webmethod(route="/evaluation/grading", method="POST") - async def grading(self, task: EvaluationTask) -> EvaluationJob: + async def grade(self, task: EvaluationTask) -> EvaluationJob: """ - Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset. + Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset. :param task: The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id