From a92756a4b7b53a9975f085ccdf3dd1a4db04a676 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Tue, 18 Mar 2025 22:09:35 -0700
Subject: [PATCH 1/3] Replace generations/scores with result_data/metrics in EvaluationResponse

---
 docs/_static/llama-stack-spec.html        | 33 ++++++++++++++++++-----
 docs/_static/llama-stack-spec.yaml        | 21 +++++++++------
 llama_stack/apis/evaluation/evaluation.py | 21 +++------------
 3 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 0f223b51b..2a294ea11 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -8653,7 +8653,7 @@
             "EvaluationResponse": {
                 "type": "object",
                 "properties": {
-                    "generations": {
+                    "result_data": {
                         "type": "array",
                         "items": {
                             "type": "object",
@@ -8680,20 +8680,39 @@
                                 ]
                             }
                         },
-                        "description": "The generations in rows for the evaluation."
+                        "description": "The result data containing generations and grades in each row."
                     },
-                    "scores": {
+                    "metrics": {
                         "type": "object",
                         "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
                         },
-                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+                        "description": "Map of metric name to aggregated value."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "generations",
-                    "scores"
+                    "result_data",
+                    "metrics"
                 ],
                 "title": "EvaluationResponse",
                 "description": "A response to an inline evaluation."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 7c4ea81b8..7508acd66 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -6018,7 +6018,7 @@ components:
     EvaluationResponse:
       type: object
       properties:
-        generations:
+        result_data:
           type: array
           items:
             type: object
@@ -6031,17 +6031,22 @@ components:
                 - type: array
                 - type: object
           description: >-
-            The generations in rows for the evaluation.
-        scores:
+            The result data containing generations and grades in each row.
+        metrics:
           type: object
           additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            The scores for the evaluation. Map of grader id to ScoringResult.
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Map of metric name to aggregated value.
       additionalProperties: false
       required:
-        - generations
-        - scores
+        - result_data
+        - metrics
       title: EvaluationResponse
       description: A response to an inline evaluation.
     HealthInfo:
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 269004b26..8d6fdd201 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
     candidate: EvaluationCandidate
 
 
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param scores: The scoring result for each row. Each row is a map of grader column name to value.
-    :param metrics: Map of metric name to aggregated value.
-    """
-
-    scores: List[Dict[str, Any]]
-    metrics: Dict[str, Any]
-
-
 @json_schema_type
 class EvaluationResponse(BaseModel):
     """
     A response to an inline evaluation.
 
-    :param generations: The generations in rows for the evaluation.
-    :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+    :param result_data: The result data containing generations and grades in each row.
+    :param metrics: Map of metric name to aggregated value.
     """
 
-    generations: List[Dict[str, Any]]
-    scores: Dict[str, ScoringResult]
+    result_data: List[Dict[str, Any]]
+    metrics: Dict[str, Any]
 
 
 class Evaluation(Protocol):

From 42447729e4479d7b30e9653a378dd363f54d0f39 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Wed, 19 Mar 2025 09:48:30 -0700
Subject: [PATCH 2/3] Rename EvaluationResponse fields to result_rows and grades

---
 docs/_static/llama-stack-spec.html        | 12 ++++++------
 docs/_static/llama-stack-spec.yaml        | 12 ++++++------
 llama_stack/apis/evaluation/evaluation.py |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 2a294ea11..15b06257f 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -8653,7 +8653,7 @@
             "EvaluationResponse": {
                 "type": "object",
                 "properties": {
-                    "result_data": {
+                    "result_rows": {
                         "type": "array",
                         "items": {
                             "type": "object",
@@ -8680,9 +8680,9 @@
                                 ]
                             }
                         },
-                        "description": "The result data containing generations and grades in each row."
+                        "description": "The result data containing inputs, generations and grades in each row."
                     },
-                    "metrics": {
+                    "grades": {
                         "type": "object",
                         "additionalProperties": {
                             "oneOf": [
@@ -8706,13 +8706,13 @@
                                 }
                             ]
                         },
-                        "description": "Map of metric name to aggregated value."
+                        "description": "Map of grader id to aggregated value."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "result_data",
-                    "metrics"
+                    "result_rows",
+                    "grades"
                 ],
                 "title": "EvaluationResponse",
                 "description": "A response to an inline evaluation."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 7508acd66..0f83dd3d7 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -6018,7 +6018,7 @@ components:
     EvaluationResponse:
       type: object
       properties:
-        result_data:
+        result_rows:
           type: array
           items:
             type: object
@@ -6031,8 +6031,8 @@ components:
                 - type: array
                 - type: object
           description: >-
-            The result data containing generations and grades in each row.
-        metrics:
+            The result data containing inputs, generations and grades in each row.
+        grades:
           type: object
           additionalProperties:
             oneOf:
@@ -6042,11 +6042,11 @@ components:
               - type: string
               - type: array
               - type: object
-          description: Map of metric name to aggregated value.
+          description: Map of grader id to aggregated value.
       additionalProperties: false
       required:
-        - result_data
-        - metrics
+        - result_rows
+        - grades
       title: EvaluationResponse
       description: A response to an inline evaluation.
     HealthInfo:
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 8d6fdd201..bde27e0be 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -81,12 +81,12 @@ class EvaluationResponse(BaseModel):
     """
     A response to an inline evaluation.
 
-    :param result_data: The result data containing generations and grades in each row.
-    :param metrics: Map of metric name to aggregated value.
+    :param result_rows: The result data containing inputs, generations and grades in each row.
+    :param grades: Map of grader id to aggregated value.
     """
 
-    result_data: List[Dict[str, Any]]
-    metrics: Dict[str, Any]
+    result_rows: List[Dict[str, Any]]
+    grades: Dict[str, Any]
 
 
 class Evaluation(Protocol):

From 0048274ec0bbdde9c9bfd478b06fd88e0c06de3a Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Wed, 19 Mar 2025 09:49:53 -0700
Subject: [PATCH 3/3] Document where benchmark dataset_id and grader_ids are obtained

---
 docs/_static/llama-stack-spec.html        | 4 ++--
 docs/_static/llama-stack-spec.yaml        | 5 +++--
 llama_stack/apis/benchmarks/benchmarks.py | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 15b06257f..b1fe8e832 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -10328,14 +10328,14 @@
                 "properties": {
                     "dataset_id": {
                         "type": "string",
-                        "description": "The ID of the dataset to be used to run the benchmark."
+                        "description": "The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`"
                     },
                     "grader_ids": {
                         "type": "array",
                         "items": {
                             "type": "string"
                         },
-                        "description": "List of grader ids to use for this benchmark."
+                        "description": "List of grader ids to use for this benchmark. ID obtained through `graders.register()`"
                     },
                     "benchmark_id": {
                         "type": "string",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 0f83dd3d7..f91744f98 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -7084,13 +7084,14 @@ components:
         dataset_id:
           type: string
           description: >-
-            The ID of the dataset to be used to run the benchmark.
+            The ID of the dataset to be used to run the benchmark. ID obtained through
+            `datasets.register()`
         grader_ids:
           type: array
           items:
             type: string
           description: >-
-            List of grader ids to use for this benchmark.
+            List of grader ids to use for this benchmark. ID obtained through `graders.register()`
         benchmark_id:
           type: string
           description: >-
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 3c5624e62..534aa6884 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,8 +62,8 @@ class Benchmarks(Protocol):
         """
         Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
 
-        :param dataset_id: The ID of the dataset to be used to run the benchmark.
-        :param grader_ids: List of grader ids to use for this benchmark.
+        :param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`
+        :param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()`
         :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
         :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
         """