update EvaluationTask

2025-03-18 19:28:34 -07:00 · 2025-03-18 19:28:34 -07:00 · f107e3229b
commit f107e3229b
parent 5e817cd56a
3 changed files with 56 additions and 195 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2035,7 +2035,7 @@
                ]
            }
        },
-        "/v1/evaluation/grading": {
+        "/v1/evaluation/grade": {
            "post": {
                "responses": {
                    "200": {
@ -8523,32 +8523,14 @@
                ],
                "title": "VectorDB"
            },
-            "BenchmarkEvaluationTask": {
+            "EvaluationTask": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "benchmark",
                        "default": "benchmark"
                    },
                    "benchmark_id": {
                        "type": "string"
-                    }
+                    },
-                },
+                    "dataset_id": {
-                "additionalProperties": false,
+                        "type": "string"
                "required": [
                    "type",
                    "benchmark_id"
                ],
                "title": "BenchmarkEvaluationTask"
            },
            "DataEvaluationTask": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "data",
                        "default": "data"
                    },
                    "data_source": {
                        "$ref": "#/components/schemas/DataSource"
@ -8561,66 +8543,14 @@
                    }
                },
                "additionalProperties": false,
-                "required": [
+                "title": "EvaluationTask"
                    "type",
                    "data_source",
                    "grader_ids"
                ],
                "title": "DataEvaluationTask"
            },
            "DatasetEvaluationTask": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "dataset",
                        "default": "dataset"
                    },
                    "dataset_id": {
                        "type": "string"
                    },
                    "grader_ids": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "dataset_id",
                    "grader_ids"
                ],
                "title": "DatasetEvaluationTask"
            },
            "EvaluationTask": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/BenchmarkEvaluationTask"
                    },
                    {
                        "$ref": "#/components/schemas/DatasetEvaluationTask"
                    },
                    {
                        "$ref": "#/components/schemas/DataEvaluationTask"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "benchmark": "#/components/schemas/BenchmarkEvaluationTask",
                        "dataset": "#/components/schemas/DatasetEvaluationTask",
                        "data": "#/components/schemas/DataEvaluationTask"
                    }
                }
            },
            "GradeRequest": {
                "type": "object",
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
@ -8706,7 +8636,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    }
                },
                "additionalProperties": false,
@ -10737,7 +10667,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
@ -10839,7 +10769,7 @@
                "properties": {
                    "task": {
                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1385,7 +1385,7 @@ paths:
          required: true
          schema:
            type: string
-  /v1/evaluation/grading:
+  /v1/evaluation/grade:
    post:
      responses:
        '200':
@ -5903,27 +5903,13 @@ components:
        - embedding_model
        - embedding_dimension
      title: VectorDB
-    BenchmarkEvaluationTask:
+    EvaluationTask:
      type: object
      properties:
        type:
          type: string
          const: benchmark
          default: benchmark
        benchmark_id:
          type: string
-      additionalProperties: false
+        dataset_id:
      required:
        - type
        - benchmark_id
      title: BenchmarkEvaluationTask
    DataEvaluationTask:
      type: object
      properties:
        type:
          type: string
          const: data
          default: data
        data_source:
          $ref: '#/components/schemas/DataSource'
        grader_ids:
@ -5931,52 +5917,18 @@ components:
          items:
            type: string
      additionalProperties: false
-      required:
+      title: EvaluationTask
        - type
        - data_source
        - grader_ids
      title: DataEvaluationTask
    DatasetEvaluationTask:
      type: object
      properties:
        type:
          type: string
          const: dataset
          default: dataset
        dataset_id:
          type: string
        grader_ids:
          type: array
          items:
            type: string
      additionalProperties: false
      required:
        - type
        - dataset_id
        - grader_ids
      title: DatasetEvaluationTask
    EvaluationTask:
      oneOf:
        - $ref: '#/components/schemas/BenchmarkEvaluationTask'
        - $ref: '#/components/schemas/DatasetEvaluationTask'
        - $ref: '#/components/schemas/DataEvaluationTask'
      discriminator:
        propertyName: type
        mapping:
          benchmark: '#/components/schemas/BenchmarkEvaluationTask'
          dataset: '#/components/schemas/DatasetEvaluationTask'
          data: '#/components/schemas/DataEvaluationTask'
    GradeRequest:
      type: object
      properties:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            The task to evaluate. To specify a task, one of the following must be
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
-            of grader_ids
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
      additionalProperties: false
      required:
        - task
@ -6040,11 +5992,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            The task to evaluate. To specify a task, one of the following must be
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
-            of grader_ids
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
      additionalProperties: false
      required:
        - task
@ -7359,11 +7311,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            The task to evaluate. To specify a task, one of the following must be
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
-            of grader_ids
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
@ -7429,11 +7381,11 @@ components:
        task:
          $ref: '#/components/schemas/EvaluationTask'
          description: >-
-            The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+            The task to evaluate. To specify a task, one of the following must be
-            task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+            provided: - `benchmark_id`: Run evaluation task against a benchmark_id
-            against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
-            evaluation task against a data source (e.g. rows, uri, etc.) and a list
+            and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
-            of grader_ids
+            task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        candidate:
          $ref: '#/components/schemas/EvaluationCandidate'
          description: The candidate to evaluate.
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(
@json_schema_type
-class BenchmarkEvaluationTask(BaseModel):
+class EvaluationTask(BaseModel):
-    type: Literal["benchmark"] = "benchmark"
+    benchmark_id: Optional[str] = None
-    benchmark_id: str
+    dataset_id: Optional[str] = None
-
+    data_source: Optional[DataSource] = None
-
+    grader_ids: Optional[List[str]] = None
@json_schema_type
 class DatasetEvaluationTask(BaseModel):
    type: Literal["dataset"] = "dataset"
    dataset_id: str
    grader_ids: List[str]
@json_schema_type
 class DataEvaluationTask(BaseModel):
    type: Literal["data"] = "data"
    data_source: DataSource
    grader_ids: List[str]
 EvaluationTask = register_schema(
    Annotated[
        Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
        Field(discriminator="type"),
    ],
    name="EvaluationTask",
 )
@json_schema_type
@ -121,10 +100,10 @@ class Evaluation(Protocol):
        """
        Schedule a full evaluation job, by generating results using candidate and grading them.
-        :param task: The task to evaluate. One of:
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - `benchmark_id`: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        """
        ...
@ -139,23 +118,23 @@ class Evaluation(Protocol):
        Run an evaluation synchronously, i.e., without scheduling a job.
        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
-        :param task: The task to evaluate. One of:
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
-        - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - `benchmark_id`: Run evaluation task against a benchmark_id
-        - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
-        - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        """
        ...
-    @webmethod(route="/evaluation/grading", method="POST")
+    @webmethod(route="/evaluation/grade", method="POST")
    async def grade(self, task: EvaluationTask) -> EvaluationJob:
        """
        Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
-        :param task: The task to evaluate. One of:
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - `benchmark_id`: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :return: The evaluation job containing grader scores.
        """
@ -167,10 +146,10 @@ class Evaluation(Protocol):
        Run grading synchronously on generated results, i.e., without scheduling a job.
        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
-        :param task: The task to evaluate. One of:
+        :param task: The task to evaluate. To specify a task, one of the following must be provided:
-         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - `benchmark_id`: Run evaluation task against a benchmark_id
-         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :return: The evaluation job containing grader scores. "generations" is not populated in the response.
        """