Merge branch 'eval_api_final' into delete_eval_scoring_scoring_fn

2025-12-30 22:30:01 +00:00 · 2025-03-19 09:50:40 -07:00 · 2025-03-19 09:50:40 -07:00 · e23531c9d0
commit e23531c9d0
parent aaa5974dce 0048274ec0
4 changed files with 50 additions and 38 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -7617,7 +7617,7 @@
            "EvaluationResponse": {
                "type": "object",
                "properties": {
-                    "generations": {
+                    "result_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
@ -7644,20 +7644,39 @@
                                ]
                            }
                        },
-                        "description": "The generations in rows for the evaluation."
+                        "description": "The result data containing inputs, generations and grades in each row."
                    },
-                    "scores": {
+                    "grades": {
                        "type": "object",
                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
                        },
-                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+                        "description": "Map of grader id to aggregated value."
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "generations",
-                    "scores"
+                    "result_rows",
+                    "grades"
                ],
                "title": "EvaluationResponse",
                "description": "A response to an inline evaluation."
@ -9313,14 +9332,14 @@
                "properties": {
                    "dataset_id": {
                        "type": "string",
-                        "description": "The ID of the dataset to be used to run the benchmark."
+                        "description": "The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`."
                    },
                    "grader_ids": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
-                        "description": "List of grader ids to use for this benchmark."
+                        "description": "List of grader ids to use for this benchmark. The IDs are obtained through `graders.register()`."
                    },
                    "benchmark_id": {
                        "type": "string",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -5328,7 +5328,7 @@ components:
    EvaluationResponse:
      type: object
      properties:
-        generations:
+        result_rows:
          type: array
          items:
            type: object
@ -5341,17 +5341,22 @@ components:
                - type: array
                - type: object
          description: >-
-            The generations in rows for the evaluation.
-        scores:
+            The result data containing inputs, generations and grades in each row.
+        grades:
          type: object
          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            The scores for the evaluation. Map of grader id to ScoringResult.
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Map of grader id to aggregated value.
      additionalProperties: false
      required:
-        - generations
-        - scores
+        - result_rows
+        - grades
      title: EvaluationResponse
      description: A response to an inline evaluation.
    ScoringResult:
@ -6404,13 +6409,14 @@ components:
        dataset_id:
          type: string
          description: >-
-            The ID of the dataset to be used to run the benchmark.
+            The ID of the dataset to be used to run the benchmark. The ID is obtained
+            through `datasets.register()`.
        grader_ids:
          type: array
          items:
            type: string
          description: >-
-            List of grader ids to use for this benchmark.
+            List of grader ids to use for this benchmark. The IDs are obtained through `graders.register()`.
        benchmark_id:
          type: string
          description: >-
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@ -62,8 +62,8 @@ class Benchmarks(Protocol):
        """
        Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.

-        :param dataset_id: The ID of the dataset to be used to run the benchmark.
-        :param grader_ids: List of grader ids to use for this benchmark.
+        :param dataset_id: The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`.
+        :param grader_ids: List of grader ids to use for this benchmark. The IDs are obtained through `graders.register()`.
        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
        :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
        """
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
    candidate: EvaluationCandidate


-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param scores: The scoring result for each row. Each row is a map of grader column name to value.
-    :param metrics: Map of metric name to aggregated value.
-    """
-
-    scores: List[Dict[str, Any]]
-    metrics: Dict[str, Any]
-
-
@json_schema_type
 class EvaluationResponse(BaseModel):
    """
    A response to an inline evaluation.

-    :param generations: The generations in rows for the evaluation.
-    :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+    :param result_rows: The result data containing inputs, generations and grades in each row.
+    :param grades: Map of grader id to aggregated value.
    """

-    generations: List[Dict[str, Any]]
-    scores: Dict[str, ScoringResult]
+    result_rows: List[Dict[str, Any]]
+    grades: Dict[str, Any]


 class Evaluation(Protocol):