diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index fd7d767ae..af1f97ca0 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -7617,7 +7617,7 @@
"EvaluationResponse": {
"type": "object",
"properties": {
- "generations": {
+ "result_rows": {
"type": "array",
"items": {
"type": "object",
@@ -7644,20 +7644,39 @@
]
}
},
- "description": "The generations in rows for the evaluation."
+ "description": "The result data containing inputs, generations, and grades in each row."
},
- "scores": {
+ "grades": {
"type": "object",
"additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
},
- "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+ "description": "Map of grader id to aggregated value."
}
},
"additionalProperties": false,
"required": [
- "generations",
- "scores"
+ "result_rows",
+ "grades"
],
"title": "EvaluationResponse",
"description": "A response to an inline evaluation."
@@ -9313,14 +9332,14 @@
"properties": {
"dataset_id": {
"type": "string",
- "description": "The ID of the dataset to be used to run the benchmark."
+ "description": "The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`."
},
"grader_ids": {
"type": "array",
"items": {
"type": "string"
},
- "description": "List of grader ids to use for this benchmark."
+ "description": "List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`."
},
"benchmark_id": {
"type": "string",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 402106208..5d5b323be 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -5328,7 +5328,7 @@ components:
EvaluationResponse:
type: object
properties:
- generations:
+ result_rows:
type: array
items:
type: object
@@ -5341,17 +5341,22 @@ components:
- type: array
- type: object
description: >-
- The generations in rows for the evaluation.
- scores:
+ The result data containing inputs, generations, and grades in each row.
+ grades:
type: object
additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: >-
- The scores for the evaluation. Map of grader id to ScoringResult.
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: Map of grader id to aggregated value.
additionalProperties: false
required:
- - generations
- - scores
+ - result_rows
+ - grades
title: EvaluationResponse
description: A response to an inline evaluation.
ScoringResult:
@@ -6404,13 +6409,14 @@ components:
dataset_id:
type: string
description: >-
- The ID of the dataset to be used to run the benchmark.
+ The ID of the dataset to be used to run the benchmark. The ID is obtained
+ through `datasets.register()`.
grader_ids:
type: array
items:
type: string
description: >-
- List of grader ids to use for this benchmark.
+ List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`.
benchmark_id:
type: string
description: >-
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 3c5624e62..534aa6884 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,8 +62,8 @@ class Benchmarks(Protocol):
"""
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
- :param dataset_id: The ID of the dataset to be used to run the benchmark.
- :param grader_ids: List of grader ids to use for this benchmark.
+ :param dataset_id: The ID of the dataset to be used to run the benchmark. The ID is obtained through `datasets.register()`.
+ :param grader_ids: List of grader ids to use for this benchmark. IDs are obtained through `graders.register()`.
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
"""
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 269004b26..bde27e0be 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
candidate: EvaluationCandidate
-@json_schema_type
-class ScoringResult(BaseModel):
- """
- A scoring result for a single row.
-
- :param scores: The scoring result for each row. Each row is a map of grader column name to value.
- :param metrics: Map of metric name to aggregated value.
- """
-
- scores: List[Dict[str, Any]]
- metrics: Dict[str, Any]
-
-
@json_schema_type
class EvaluationResponse(BaseModel):
"""
A response to an inline evaluation.
- :param generations: The generations in rows for the evaluation.
- :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+ :param result_rows: The result data containing inputs, generations, and grades in each row.
+ :param grades: Map of grader id to aggregated value.
"""
- generations: List[Dict[str, Any]]
- scores: Dict[str, ScoringResult]
+ result_rows: List[Dict[str, Any]]
+ grades: Dict[str, Any]
class Evaluation(Protocol):