grading

2025-03-18 18:12:06 -07:00 · 2025-03-18 18:12:06 -07:00 · 238cdc4e69
commit 238cdc4e69
parent b98497ee56
2 changed files with 221 additions and 207 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2035,49 +2035,6 @@
                ]
            }
        },
        "/v1/evaluation/grade": {
            "post": {
                "responses": {
                    "200": {
                        "description": "The evaluation job containing grader scores.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvaluationJob"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Evaluation"
                ],
                "description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/GradeRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/evaluation/grade_sync": {
            "post": {
                "responses": {
@ -2107,7 +2064,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an grading job with generated results inline.",
+                "description": "Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -2121,6 +2078,49 @@
                }
            }
        },
        "/v1/evaluation/grading": {
            "post": {
                "responses": {
                    "200": {
                        "description": "The evaluation job containing grader scores.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvaluationJob"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Evaluation"
                ],
                "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/GradingRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/health": {
            "get": {
                "responses": {
@ -2622,7 +2622,7 @@
                "tags": [
                    "Benchmarks"
                ],
-                "description": "Register a new benchmark.",
+                "description": "Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -3730,7 +3730,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an evaluation job.",
+                "description": "Schedule a full evaluation job, by generating results using a candidate and grading them.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -3869,7 +3869,7 @@
                "tags": [
                    "Evaluation"
                ],
-                "description": "Run an evaluation job inline.",
+                "description": "Run an evaluation synchronously, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -8615,19 +8615,81 @@
                    }
                }
            },
-            "GradeRequest": {
+            "GradeSyncRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "task"
                ],
-                "title": "GradeRequest"
+                "title": "GradeSyncRequest"
            },
            "EvaluationResponse": {
                "type": "object",
                "properties": {
                    "generations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The generations in rows for the evaluation."
                    },
                    "scores": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        },
                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "generations",
                    "scores"
                ],
                "title": "EvaluationResponse",
                "description": "A response to an inline evaluation."
            },
            "GradingRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "task"
                ],
                "title": "GradingRequest"
            },
            "EvaluationCandidate": {
                "oneOf": [
@ -8701,68 +8763,6 @@
                ],
                "title": "EvaluationJob"
            },
            "GradeSyncRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "task"
                ],
                "title": "GradeSyncRequest"
            },
            "EvaluationResponse": {
                "type": "object",
                "properties": {
                    "generations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The generations in rows for the evaluation."
                    },
                    "scores": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        },
                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "generations",
                    "scores"
                ],
                "title": "EvaluationResponse",
                "description": "A response to an inline evaluation."
            },
            "HealthInfo": {
                "type": "object",
                "properties": {
@ -10737,7 +10737,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10839,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1385,38 +1385,6 @@ paths:
          required: true
          schema:
            type: string
  /v1/evaluation/grade:
    post:
      responses:
        '200':
          description: >-
            The evaluation job containing grader scores.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluationJob'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
      description: >-
        Run an grading job with generated results. Use this when you have generated
        results from inference in a dataset.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GradeRequest'
        required: true
  /v1/evaluation/grade_sync:
    post:
      responses:
@ -1441,7 +1409,10 @@ paths:
      tags:
        - Evaluation
      description: >-
-        Run an grading job with generated results inline.
+        Run grading synchronously on generated results, i.e., without scheduling a
        job. You should use this for quick testing, or when the number of rows is
        limited. Some implementations may have stricter restrictions on inputs which
        will be accepted.
      parameters: []
      requestBody:
        content:
@ -1449,6 +1420,38 @@ paths:
            schema:
              $ref: '#/components/schemas/GradeSyncRequest'
        required: true
  /v1/evaluation/grading:
    post:
      responses:
        '200':
          description: >-
            The evaluation job containing grader scores.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluationJob'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
      description: >-
        Schedule a grading job, by grading generated results. The generated results
        are expected to be in the dataset.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GradingRequest'
        required: true
  /v1/health:
    get:
      responses:
@ -1800,7 +1803,9 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Benchmarks
-      description: Register a new benchmark.
+      description: >-
        Register a new benchmark. A benchmark consists of a dataset id and a list
        of grader ids.
      parameters: []
      requestBody:
        content:
@ -2566,7 +2571,9 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
-      description: Run an evaluation job.
+      description: >-
        Schedule a full evaluation job, by generating results using a candidate and
        grading them.
      parameters: []
      requestBody:
        content:
@ -2661,7 +2668,10 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Evaluation
-      description: Run an evaluation job inline.
+      description: >-
        Run an evaluation synchronously, i.e., without scheduling a job. You should
        use this for quick testing, or when the number of rows is limited. Some implementations
        may have stricter restrictions on inputs which will be accepted.
      parameters: []
      requestBody:
        content:
@ -5956,20 +5966,65 @@ components:
          benchmark: '#/components/schemas/BenchmarkEvaluationTask'
          dataset: '#/components/schemas/DatasetEvaluationTask'
          data: '#/components/schemas/DataEvaluationTask'
-    GradeRequest:
+    GradeSyncRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
            of grader_ids
      additionalProperties: false
      required:
        - task
-      title: GradeRequest
+      title: GradeSyncRequest
    EvaluationResponse:
      type: object
      properties:
        generations:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: >-
            The generations in rows for the evaluation.
        scores:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            The scores for the evaluation. Map of grader id to ScoringResult.
      additionalProperties: false
      required:
        - generations
        - scores
      title: EvaluationResponse
      description: A response to an inline evaluation.
    GradingRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
            evaluation task against a data source (e.g. rows, uri, etc.) and a list
            of grader_ids
      additionalProperties: false
      required:
        - task
      title: GradingRequest
    EvaluationCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
@ -6023,49 +6078,6 @@ components:
        - task
        - candidate
      title: EvaluationJob
    GradeSyncRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
            a data source (e.g. rows, uri, etc.) and a list of grader_ids
      additionalProperties: false
      required:
        - task
      title: GradeSyncRequest
    EvaluationResponse:
      type: object
      properties:
        generations:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: >-
            The generations in rows for the evaluation.
        scores:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            The scores for the evaluation. Map of grader id to ScoringResult.
      additionalProperties: false
      required:
        - generations
        - scores
      title: EvaluationResponse
      description: A response to an inline evaluation.
    HealthInfo:
      type: object
      properties:
@ -7347,10 +7359,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
            of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
@ -7416,10 +7429,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+            evaluation task against a data source (e.g. rows, uri, etc.) and a list
            of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.