grading

2025-03-18 18:12:06 -07:00 · 2025-03-18 18:12:06 -07:00 · 238cdc4e69
commit 238cdc4e69
parent b98497ee56
2 changed files with 221 additions and 207 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2035,49 +2035,6 @@
                ]
            }
        },
-        "/v1/evaluation/grade": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "The evaluation job containing grader scores.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Evaluation"
-                ],
-                "description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/GradeRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
        "/v1/evaluation/grade_sync": {
            "post": {
                "responses": {
@ -2107,7 +2064,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an grading job with generated results inline.",
+                "description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -2121,6 +2078,49 @@
                }
            }
        },
+        "/v1/evaluation/grading": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The evaluation job containing grader scores.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluationJob"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Evaluation"
+                ],
+                "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/GradingRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
        "/v1/health": {
            "get": {
                "responses": {
@ -2622,7 +2622,7 @@
                "tags": [
                    "Benchmarks"
                ],
-                "description": "Register a new benchmark.",
+                "description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -3730,7 +3730,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an evaluation job.",
+                "description": "Schedule a full evaluation job, by generating results using candidate and grading them.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -3869,7 +3869,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an evaluation job inline.",
+                "description": "Run an evaluation synchronously, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -8615,19 +8615,81 @@
                    }
                }
            },
-            "GradeRequest": {
+            "GradeSyncRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "task"
                ],
-                "title": "GradeRequest"
+                "title": "GradeSyncRequest"
+            },
+            "EvaluationResponse": {
+                "type": "object",
+                "properties": {
+                    "generations": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The generations in rows for the evaluation."
+                    },
+                    "scores": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        },
+                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "generations",
+                    "scores"
+                ],
+                "title": "EvaluationResponse",
+                "description": "A response to an inline evaluation."
+            },
+            "GradingRequest": {
+                "type": "object",
+                "properties": {
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task"
+                ],
+                "title": "GradingRequest"
            },
            "EvaluationCandidate": {
                "oneOf": [
@ -8701,68 +8763,6 @@
                ],
                "title": "EvaluationJob"
            },
-            "GradeSyncRequest": {
-                "type": "object",
-                "properties": {
-                    "task": {
-                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "task"
-                ],
-                "title": "GradeSyncRequest"
-            },
-            "EvaluationResponse": {
-                "type": "object",
-                "properties": {
-                    "generations": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The generations in rows for the evaluation."
-                    },
-                    "scores": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "generations",
-                    "scores"
-                ],
-                "title": "EvaluationResponse",
-                "description": "A response to an inline evaluation."
-            },
            "HealthInfo": {
                "type": "object",
                "properties": {
@ -10737,7 +10737,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10839,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1385,38 +1385,6 @@ paths:
          required: true
          schema:
            type: string
-  /v1/evaluation/grade:
-    post:
-      responses:
-        '200':
-          description: >-
-            The evaluation job containing grader scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJob'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Evaluation
-      description: >-
-        Run an grading job with generated results. Use this when you have generated
-        results from inference in a dataset.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/GradeRequest'
-        required: true
  /v1/evaluation/grade_sync:
    post:
      responses:
@ -1441,7 +1409,10 @@ paths:
      tags:
        - Evaluation
      description: >-
-        Run an grading job with generated results inline.
+        Run grading synchronously on generated results, i.e., without scheduling a
+        job. You should use this for quick testing, or when the number of rows is
+        limited. Some implementations may have stricter restrictions on inputs which
+        will be accepted.
      parameters: []
      requestBody:
        content:
@ -1449,6 +1420,38 @@ paths:
            schema:
              $ref: '#/components/schemas/GradeSyncRequest'
        required: true
+  /v1/evaluation/grading:
+    post:
+      responses:
+        '200':
+          description: >-
+            The evaluation job containing grader scores.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluationJob'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Evaluation
+      description: >-
+        Schedule a grading job, by grading generated results. The generated results
+        are expected to be in the dataset.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GradingRequest'
+        required: true
  /v1/health:
    get:
      responses:
@ -1800,7 +1803,9 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Benchmarks
-      description: Register a new benchmark.
+      description: >-
+        Register a new benchmark. A benchmark consists of a dataset id and a list
+        of grader ids.
      parameters: []
      requestBody:
        content:
@ -2566,7 +2571,9 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
-      description: Run an evaluation job.
+      description: >-
+        Schedule a full evaluation job, by generating results using candidate and
+        grading them.
      parameters: []
      requestBody:
        content:
@ -2661,7 +2668,10 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
-      description: Run an evaluation job inline.
+      description: >-
+        Run an evaluation synchronously, i.e., without scheduling a job. You should
+        use this for quick testing, or when the number of rows is limited. Some implementations
+        may have stricter restrictions on inputs which will be accepted.
      parameters: []
      requestBody:
        content:
@ -5956,20 +5966,65 @@ components:
          benchmark: '#/components/schemas/BenchmarkEvaluationTask'
          dataset: '#/components/schemas/DatasetEvaluationTask'
          data: '#/components/schemas/DataEvaluationTask'
-    GradeRequest:
+    GradeSyncRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
      additionalProperties: false
      required:
        - task
-      title: GradeRequest
+      title: GradeSyncRequest
+    EvaluationResponse:
+      type: object
+      properties:
+        generations:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The generations in rows for the evaluation.
+        scores:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            The scores for the evaluation. Map of grader id to ScoringResult.
+      additionalProperties: false
+      required:
+        - generations
+        - scores
+      title: EvaluationResponse
+      description: A response to an inline evaluation.
+    GradingRequest:
+      type: object
+      properties:
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
+      additionalProperties: false
+      required:
+        - task
+      title: GradingRequest
    EvaluationCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
@ -6023,49 +6078,6 @@ components:
        - task
        - candidate
      title: EvaluationJob
-    GradeSyncRequest:
-      type: object
-      properties:
-        task:
-          $ref: '#/components/schemas/EvaluationTask'
-          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
-      additionalProperties: false
-      required:
-        - task
-      title: GradeSyncRequest
-    EvaluationResponse:
-      type: object
-      properties:
-        generations:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: >-
-            The generations in rows for the evaluation.
-        scores:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            The scores for the evaluation. Map of grader id to ScoringResult.
-      additionalProperties: false
-      required:
-        - generations
-        - scores
-      title: EvaluationResponse
-      description: A response to an inline evaluation.
    HealthInfo:
      type: object
      properties:
@ -7347,10 +7359,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
@ -7416,10 +7429,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.