From 238cdc4e69b26ce9cc89c06b1e7a1112af0787ce Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Tue, 18 Mar 2025 18:12:06 -0700
Subject: [PATCH] grading

---
 docs/_static/llama-stack-spec.html | 228 ++++++++++++++---------------
 docs/_static/llama-stack-spec.yaml | 200 +++++++++++++------------
 2 files changed, 221 insertions(+), 207 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index cb5959e22..57f37255b 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2035,49 +2035,6 @@
                 ]
             }
         },
-        "/v1/evaluation/grade": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "The evaluation job containing grader scores.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Evaluation"
-                ],
-                "description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/GradeRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/evaluation/grade_sync": {
             "post": {
                 "responses": {
@@ -2107,7 +2064,7 @@
                 "tags": [
                     "Evaluation"
                 ],
-                "description": "Run an grading job with generated results inline.",
+                "description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -2121,6 +2078,49 @@
                 }
             }
         },
+        "/v1/evaluation/grading": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The evaluation job containing grader scores.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluationJob"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Evaluation"
+                ],
+                "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/GradingRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/health": {
             "get": {
                 "responses": {
@@ -2622,7 +2622,7 @@
                 "tags": [
                     "Benchmarks"
                 ],
-                "description": "Register a new benchmark.",
+                "description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -3730,7 +3730,7 @@
                 "tags": [
                     "Evaluation"
                 ],
-                "description": "Run an evaluation job.",
+                "description": "Schedule a full evaluation job, by generating results using candidate and grading them.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -3869,7 +3869,7 @@
                 "tags": [
                     "Evaluation"
                 ],
-                "description": "Run an evaluation job inline.",
+                "description": "Run an evaluation synchronously, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -8615,19 +8615,81 @@
                     }
                 }
             },
-            "GradeRequest": {
+            "GradeSyncRequest": {
                 "type": "object",
                 "properties": {
                     "task": {
                         "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "task"
                 ],
-                "title": "GradeRequest"
+                "title": "GradeSyncRequest"
+            },
+            "EvaluationResponse": {
+                "type": "object",
+                "properties": {
+                    "generations": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The generations in rows for the evaluation."
+                    },
+                    "scores": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        },
+                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "generations",
+                    "scores"
+                ],
+                "title": "EvaluationResponse",
+                "description": "A response to an inline evaluation."
+            },
+            "GradingRequest": {
+                "type": "object",
+                "properties": {
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task"
+                ],
+                "title": "GradingRequest"
             },
             "EvaluationCandidate": {
                 "oneOf": [
@@ -8701,68 +8763,6 @@
                 ],
                 "title": "EvaluationJob"
             },
-            "GradeSyncRequest": {
-                "type": "object",
-                "properties": {
-                    "task": {
-                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "task"
-                ],
-                "title": "GradeSyncRequest"
-            },
-            "EvaluationResponse": {
-                "type": "object",
-                "properties": {
-                    "generations": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The generations in rows for the evaluation."
-                    },
-                    "scores": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "generations",
-                    "scores"
-                ],
-                "title": "EvaluationResponse",
-                "description": "A response to an inline evaluation."
-            },
             "HealthInfo": {
                 "type": "object",
                 "properties": {
@@ -10737,7 +10737,7 @@
                 "properties": {
                     "task": {
                         "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                     },
                     "candidate": {
                         "$ref": "#/components/schemas/EvaluationCandidate",
@@ -10839,7 +10839,7 @@
                 "properties": {
                     "task": {
                         "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                     },
                     "candidate": {
                         "$ref": "#/components/schemas/EvaluationCandidate",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index ecc8104e1..60a8700f7 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1385,38 +1385,6 @@ paths:
           required: true
           schema:
             type: string
-  /v1/evaluation/grade:
-    post:
-      responses:
-        '200':
-          description: >-
-            The evaluation job containing grader scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJob'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Evaluation
-      description: >-
-        Run an grading job with generated results. Use this when you have generated
-        results from inference in a dataset.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/GradeRequest'
-        required: true
   /v1/evaluation/grade_sync:
     post:
       responses:
@@ -1441,7 +1409,10 @@ paths:
       tags:
         - Evaluation
       description: >-
-        Run an grading job with generated results inline.
+        Run grading synchronously on generated results, i.e., without scheduling a
+        job. You should use this for quick testing, or when the number of rows is
+        limited. Some implementations may have stricter restrictions on inputs which
+        will be accepted.
       parameters: []
       requestBody:
         content:
@@ -1449,6 +1420,38 @@ paths:
             schema:
               $ref: '#/components/schemas/GradeSyncRequest'
         required: true
+  /v1/evaluation/grading:
+    post:
+      responses:
+        '200':
+          description: >-
+            The evaluation job containing grader scores.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluationJob'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Evaluation
+      description: >-
+        Schedule a grading job, by grading generated results. The generated results
+        are expected to be in the dataset.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GradingRequest'
+        required: true
   /v1/health:
     get:
       responses:
@@ -1800,7 +1803,9 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: Register a new benchmark.
+      description: >-
+        Register a new benchmark. A benchmark consists of a dataset id and a list
+        of grader ids.
       parameters: []
       requestBody:
         content:
@@ -2566,7 +2571,9 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Evaluation
-      description: Run an evaluation job.
+      description: >-
+        Schedule a full evaluation job, by generating results using candidate and
+        grading them.
       parameters: []
       requestBody:
         content:
@@ -2661,7 +2668,10 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Evaluation
-      description: Run an evaluation job inline.
+      description: >-
+        Run an evaluation synchronously, i.e., without scheduling a job. You should
+        use this for quick testing, or when the number of rows is limited. Some implementations
+        may have stricter restrictions on inputs which will be accepted.
       parameters: []
       requestBody:
         content:
@@ -5956,20 +5966,65 @@ components:
           benchmark: '#/components/schemas/BenchmarkEvaluationTask'
           dataset: '#/components/schemas/DatasetEvaluationTask'
           data: '#/components/schemas/DataEvaluationTask'
-    GradeRequest:
+    GradeSyncRequest:
       type: object
       properties:
         task:
           $ref: '#/components/schemas/EvaluationTask'
           description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
       additionalProperties: false
       required:
         - task
-      title: GradeRequest
+      title: GradeSyncRequest
+    EvaluationResponse:
+      type: object
+      properties:
+        generations:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The generations in rows for the evaluation.
+        scores:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            The scores for the evaluation. Map of grader id to ScoringResult.
+      additionalProperties: false
+      required:
+        - generations
+        - scores
+      title: EvaluationResponse
+      description: A response to an inline evaluation.
+    GradingRequest:
+      type: object
+      properties:
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
+      additionalProperties: false
+      required:
+        - task
+      title: GradingRequest
     EvaluationCandidate:
       oneOf:
         - $ref: '#/components/schemas/ModelCandidate'
@@ -6023,49 +6078,6 @@ components:
         - task
         - candidate
       title: EvaluationJob
-    GradeSyncRequest:
-      type: object
-      properties:
-        task:
-          $ref: '#/components/schemas/EvaluationTask'
-          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
-      additionalProperties: false
-      required:
-        - task
-      title: GradeSyncRequest
-    EvaluationResponse:
-      type: object
-      properties:
-        generations:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: >-
-            The generations in rows for the evaluation.
-        scores:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            The scores for the evaluation. Map of grader id to ScoringResult.
-      additionalProperties: false
-      required:
-        - generations
-        - scores
-      title: EvaluationResponse
-      description: A response to an inline evaluation.
     HealthInfo:
       type: object
       properties:
@@ -7347,10 +7359,11 @@ components:
         task:
           $ref: '#/components/schemas/EvaluationTask'
           description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
         candidate:
           $ref: '#/components/schemas/EvaluationCandidate'
           description: The candidate to evaluate.
@@ -7416,10 +7429,11 @@ components:
         task:
           $ref: '#/components/schemas/EvaluationTask'
           description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            of grader_ids
         candidate:
           $ref: '#/components/schemas/EvaluationCandidate'
           description: The candidate to evaluate.