diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index fd7d767ae..af1f97ca0 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -7617,7 +7617,7 @@
"EvaluationResponse": {
"type": "object",
"properties": {
- "generations": {
+ "result_rows": {
"type": "array",
"items": {
"type": "object",
@@ -7644,20 +7644,39 @@
]
}
},
- "description": "The generations in rows for the evaluation."
+ "description": "The result data containing inputs, generations, and grades in each row."
},
- "scores": {
+ "grades": {
"type": "object",
"additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
},
- "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+ "description": "Map of grader id to aggregated value."
}
},
"additionalProperties": false,
"required": [
- "generations",
- "scores"
+ "result_rows",
+ "grades"
],
"title": "EvaluationResponse",
"description": "A response to an inline evaluation."
@@ -9313,14 +9332,14 @@
"properties": {
"dataset_id": {
"type": "string",
- "description": "The ID of the dataset to be used to run the benchmark."
+ "description": "The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`."
},
"grader_ids": {
"type": "array",
"items": {
"type": "string"
},
- "description": "List of grader ids to use for this benchmark."
+ "description": "List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`."
},
"benchmark_id": {
"type": "string",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 402106208..5d5b323be 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -5328,7 +5328,7 @@ components:
EvaluationResponse:
type: object
properties:
- generations:
+ result_rows:
type: array
items:
type: object
@@ -5341,17 +5341,22 @@ components:
- type: array
- type: object
description: >-
- The generations in rows for the evaluation.
- scores:
+ The result data containing inputs, generations, and grades in each row.
+ grades:
type: object
additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: >-
- The scores for the evaluation. Map of grader id to ScoringResult.
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: Map of grader id to aggregated value.
additionalProperties: false
required:
- - generations
- - scores
+ - result_rows
+ - grades
title: EvaluationResponse
description: A response to an inline evaluation.
ScoringResult:
@@ -6404,13 +6409,14 @@ components:
dataset_id:
type: string
description: >-
- The ID of the dataset to be used to run the benchmark.
+ The ID of the dataset to be used to run the benchmark. The ID is obtained
+ through `datasets.register()`.
grader_ids:
type: array
items:
type: string
description: >-
- List of grader ids to use for this benchmark.
+ List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`.
benchmark_id:
type: string
description: >-
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 3c5624e62..534aa6884 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,8 +62,8 @@ class Benchmarks(Protocol):
"""
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
- :param dataset_id: The ID of the dataset to be used to run the benchmark.
- :param grader_ids: List of grader ids to use for this benchmark.
+ :param dataset_id: The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`.
+ :param grader_ids: List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`.
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
"""
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 269004b26..bde27e0be 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
candidate: EvaluationCandidate
-@json_schema_type
-class ScoringResult(BaseModel):
- """
- A scoring result for a single row.
-
- :param scores: The scoring result for each row. Each row is a map of grader column name to value.
- :param metrics: Map of metric name to aggregated value.
- """
-
- scores: List[Dict[str, Any]]
- metrics: Dict[str, Any]
-
-
@json_schema_type
class EvaluationResponse(BaseModel):
"""
A response to an inline evaluation.
- :param generations: The generations in rows for the evaluation.
- :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+ :param result_rows: The result data containing inputs, generations, and grades in each row.
+ :param grades: Map of grader id to aggregated value.
"""
- generations: List[Dict[str, Any]]
- scores: Dict[str, ScoringResult]
+ result_rows: List[Dict[str, Any]]
+ grades: Dict[str, Any]
class Evaluation(Protocol):