update EvaluationTask

2025-03-18 19:28:34 -07:00 · 2025-03-18 19:28:34 -07:00 · f107e3229b
commit f107e3229b
parent 5e817cd56a
3 changed files with 56 additions and 195 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2035,7 +2035,7 @@
                ]
            }
        },
-        "/v1/evaluation/grading": {
+        "/v1/evaluation/grade": {
            "post": {
                "responses": {
                    "200": {
@ -8523,32 +8523,14 @@
                ],
                "title": "VectorDB"
            },
-            "BenchmarkEvaluationTask": {
+            "EvaluationTask": {
                "type": "object",
                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "benchmark",
-                        "default": "benchmark"
-                    },
                    "benchmark_id": {
                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "benchmark_id"
-                ],
-                "title": "BenchmarkEvaluationTask"
-            },
-            "DataEvaluationTask": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "data",
-                        "default": "data"
+                    },
+                    "dataset_id": {
+                        "type": "string"
                    },
                    "data_source": {
                        "$ref": "#/components/schemas/DataSource"
@ -8561,66 +8543,14 @@
                    }
                },
                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "data_source",
-                    "grader_ids"
-                ],
-                "title": "DataEvaluationTask"
-            },
-            "DatasetEvaluationTask": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "dataset",
-                        "default": "dataset"
-                    },
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "grader_ids": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "dataset_id",
-                    "grader_ids"
-                ],
-                "title": "DatasetEvaluationTask"
-            },
-            "EvaluationTask": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/BenchmarkEvaluationTask"
-                    },
-                    {
-                        "$ref": "#/components/schemas/DatasetEvaluationTask"
-                    },
-                    {
-                        "$ref": "#/components/schemas/DataEvaluationTask"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "benchmark": "#/components/schemas/BenchmarkEvaluationTask",
-                        "dataset": "#/components/schemas/DatasetEvaluationTask",
-                        "data": "#/components/schemas/DataEvaluationTask"
-                    }
-                }
+                "title": "EvaluationTask"
            },
            "GradeRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
@ -8706,7 +8636,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
@ -10737,7 +10667,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10769,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1385,7 +1385,7 @@ paths:
          required: true
          schema:
            type: string
-  /v1/evaluation/grading:
+  /v1/evaluation/grade:
    post:
      responses:
        '200':
@ -5903,27 +5903,13 @@ components:
        - embedding_model
        - embedding_dimension
      title: VectorDB
-    BenchmarkEvaluationTask:
+    EvaluationTask:
      type: object
      properties:
-        type:
-          type: string
-          const: benchmark
-          default: benchmark
        benchmark_id:
          type: string
-      additionalProperties: false
-      required:
-        - type
-        - benchmark_id
-      title: BenchmarkEvaluationTask
-    DataEvaluationTask:
-      type: object
-      properties:
-        type:
+        dataset_id:
          type: string
-          const: data
-          default: data
        data_source:
          $ref: '#/components/schemas/DataSource'
        grader_ids:
@ -5931,52 +5917,18 @@ components:
          items:
            type: string
      additionalProperties: false
-      required:
-        - type
-        - data_source
-        - grader_ids
-      title: DataEvaluationTask
-    DatasetEvaluationTask:
-      type: object
-      properties:
-        type:
-          type: string
-          const: dataset
-          default: dataset
-        dataset_id:
-          type: string
-        grader_ids:
-          type: array
-          items:
-            type: string
-      additionalProperties: false
-      required:
-        - type
-        - dataset_id
-        - grader_ids
-      title: DatasetEvaluationTask
-    EvaluationTask:
-      oneOf:
-        - $ref: '#/components/schemas/BenchmarkEvaluationTask'
-        - $ref: '#/components/schemas/DatasetEvaluationTask'
-        - $ref: '#/components/schemas/DataEvaluationTask'
-      discriminator:
-        propertyName: type
-        mapping:
-          benchmark: '#/components/schemas/BenchmarkEvaluationTask'
-          dataset: '#/components/schemas/DatasetEvaluationTask'
-          data: '#/components/schemas/DataEvaluationTask'
+      title: EvaluationTask
    GradeRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
-            of grader_ids
+            The task to evaluate. To specify a task, one of the following must be
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
      additionalProperties: false
      required:
        - task
@ -6040,11 +5992,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
-            of grader_ids
+            The task to evaluate. To specify a task, one of the following must be
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
      additionalProperties: false
      required:
        - task
@ -7359,11 +7311,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
-            of grader_ids
+            The task to evaluate. To specify a task, one of the following must be
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
@ -7429,11 +7381,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
-            of grader_ids
+            The task to evaluate. To specify a task, one of the following must be
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(


@json_schema_type
-class BenchmarkEvaluationTask(BaseModel):
-    type: Literal["benchmark"] = "benchmark"
-    benchmark_id: str
-
-
-@json_schema_type
-class DatasetEvaluationTask(BaseModel):
-    type: Literal["dataset"] = "dataset"
-    dataset_id: str
-    grader_ids: List[str]
-
-
-@json_schema_type
-class DataEvaluationTask(BaseModel):
-    type: Literal["data"] = "data"
-    data_source: DataSource
-    grader_ids: List[str]
-
-
-EvaluationTask = register_schema(
-    Annotated[
-        Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
-        Field(discriminator="type"),
-    ],
-    name="EvaluationTask",
-)
+class EvaluationTask(BaseModel):
+    benchmark_id: Optional[str] = None
+    dataset_id: Optional[str] = None
+    data_source: Optional[DataSource] = None
+    grader_ids: Optional[List[str]] = None


@json_schema_type
@ -121,10 +100,10 @@ class Evaluation(Protocol):
        """
        Schedule a full evaluation job, by generating results using candidate and grading them.

-        :param task: The task to evaluate. One of:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
+         - `benchmark_id`: Run evaluation task against a benchmark_id
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        """
        ...
@ -139,23 +118,23 @@ class Evaluation(Protocol):
        Run an evaluation synchronously, i.e., without scheduling a job.
        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.

-        :param task: The task to evaluate. One of:
-        - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
-        - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
-        - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
+         - `benchmark_id`: Run evaluation task against a benchmark_id
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        """
        ...

-    @webmethod(route="/evaluation/grading", method="POST")
+    @webmethod(route="/evaluation/grade", method="POST")
    async def grade(self, task: EvaluationTask) -> EvaluationJob:
        """
        Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.

-        :param task: The task to evaluate. One of:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
+         - `benchmark_id`: Run evaluation task against a benchmark_id
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids

        :return: The evaluation job containing grader scores.
        """
@ -167,10 +146,10 @@ class Evaluation(Protocol):
        Run grading synchronously on generated results, i.e., without scheduling a job.
        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.

-        :param task: The task to evaluate. One of:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
+         - `benchmark_id`: Run evaluation task against a benchmark_id
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids

        :return: The evaluation job containing grader scores. "generations" is not populated in the response.
        """