From a92756a4b7b53a9975f085ccdf3dd1a4db04a676 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 18 Mar 2025 22:09:35 -0700 Subject: [PATCH 1/3] result_data in evaluation response --- docs/_static/llama-stack-spec.html | 33 ++++++++++++++++++----- docs/_static/llama-stack-spec.yaml | 21 +++++++++------ llama_stack/apis/evaluation/evaluation.py | 21 +++------------ 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 0f223b51b..2a294ea11 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -8653,7 +8653,7 @@ "EvaluationResponse": { "type": "object", "properties": { - "generations": { + "result_data": { "type": "array", "items": { "type": "object", @@ -8680,20 +8680,39 @@ ] } }, - "description": "The generations in rows for the evaluation." + "description": "The result data containing generations and grades in each row." }, - "scores": { + "metrics": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] }, - "description": "The scores for the evaluation. Map of grader id to ScoringResult." + "description": "Map of metric name to aggregated value." } }, "additionalProperties": false, "required": [ - "generations", - "scores" + "result_data", + "metrics" ], "title": "EvaluationResponse", "description": "A response to an inline evaluation." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 7c4ea81b8..7508acd66 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6018,7 +6018,7 @@ components: EvaluationResponse: type: object properties: - generations: + result_data: type: array items: type: object @@ -6031,17 +6031,22 @@ components: - type: array - type: object description: >- - The generations in rows for the evaluation. - scores: + The result data containing generations and grades in each row. + metrics: type: object additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - The scores for the evaluation. Map of grader id to ScoringResult. + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: Map of metric name to aggregated value. additionalProperties: false required: - - generations - - scores + - result_data + - metrics title: EvaluationResponse description: A response to an inline evaluation. HealthInfo: diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 269004b26..8d6fdd201 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields): candidate: EvaluationCandidate -@json_schema_type -class ScoringResult(BaseModel): - """ - A scoring result for a single row. - - :param scores: The scoring result for each row. Each row is a map of grader column name to value. - :param metrics: Map of metric name to aggregated value. - """ - - scores: List[Dict[str, Any]] - metrics: Dict[str, Any] - - @json_schema_type class EvaluationResponse(BaseModel): """ A response to an inline evaluation. - :param generations: The generations in rows for the evaluation. - :param scores: The scores for the evaluation. Map of grader id to ScoringResult. + :param result_data: The result data containing generations and grades in each row. + :param metrics: Map of metric name to aggregated value. """ - generations: List[Dict[str, Any]] - scores: Dict[str, ScoringResult] + result_data: List[Dict[str, Any]] + metrics: Dict[str, Any] class Evaluation(Protocol): From 42447729e4479d7b30e9653a378dd363f54d0f39 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 19 Mar 2025 09:48:30 -0700 Subject: [PATCH 2/3] update --- docs/_static/llama-stack-spec.html | 12 ++++++------ docs/_static/llama-stack-spec.yaml | 12 ++++++------ llama_stack/apis/evaluation/evaluation.py | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 2a294ea11..15b06257f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -8653,7 +8653,7 @@ "EvaluationResponse": { "type": "object", "properties": { - "result_data": { + "result_rows": { "type": "array", "items": { "type": "object", @@ -8680,9 +8680,9 @@ ] } }, - "description": "The result data containing generations and grades in each row." + "description": "The result data containing inputs, generations and grades in each row." }, - "metrics": { + "grades": { "type": "object", "additionalProperties": { "oneOf": [ @@ -8706,13 +8706,13 @@ } ] }, - "description": "Map of metric name to aggregated value." + "description": "Map of grader id to aggregated value." } }, "additionalProperties": false, "required": [ - "result_data", - "metrics" + "result_rows", + "grades" ], "title": "EvaluationResponse", "description": "A response to an inline evaluation." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 7508acd66..0f83dd3d7 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6018,7 +6018,7 @@ components: EvaluationResponse: type: object properties: - result_data: + result_rows: type: array items: type: object @@ -6031,8 +6031,8 @@ components: - type: array - type: object description: >- - The result data containing generations and grades in each row. - metrics: + The result data containing inputs, generations and grades in each row. + grades: type: object additionalProperties: oneOf: @@ -6042,11 +6042,11 @@ components: - type: string - type: array - type: object - description: Map of metric name to aggregated value. + description: Map of grader id to aggregated value. additionalProperties: false required: - - result_data - - metrics + - result_rows + - grades title: EvaluationResponse description: A response to an inline evaluation. HealthInfo: diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 8d6fdd201..bde27e0be 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -81,12 +81,12 @@ class EvaluationResponse(BaseModel): """ A response to an inline evaluation. - :param result_data: The result data containing generations and grades in each row. - :param metrics: Map of metric name to aggregated value. + :param result_rows: The result data containing inputs, generations and grades in each row. + :param grades: Map of grader id to aggregated value. """ - result_data: List[Dict[str, Any]] - metrics: Dict[str, Any] + result_rows: List[Dict[str, Any]] + grades: Dict[str, Any] class Evaluation(Protocol): From 0048274ec0bbdde9c9bfd478b06fd88e0c06de3a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 19 Mar 2025 09:49:53 -0700 Subject: [PATCH 3/3] update --- docs/_static/llama-stack-spec.html | 4 ++-- docs/_static/llama-stack-spec.yaml | 5 +++-- llama_stack/apis/benchmarks/benchmarks.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 15b06257f..b1fe8e832 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -10328,14 +10328,14 @@ "properties": { "dataset_id": { "type": "string", - "description": "The ID of the dataset to be used to run the benchmark." + "description": "The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`" }, "grader_ids": { "type": "array", "items": { "type": "string" }, - "description": "List of grader ids to use for this benchmark." + "description": "List of grader ids to use for this benchmark. ID obtained through `graders.register()`" }, "benchmark_id": { "type": "string", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 0f83dd3d7..f91744f98 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -7084,13 +7084,14 @@ components: dataset_id: type: string description: >- - The ID of the dataset to be used to run the benchmark. + The ID of the dataset to be used to run the benchmark. ID obtained through + `datasets.register()` grader_ids: type: array items: type: string description: >- - List of grader ids to use for this benchmark. + List of grader ids to use for this benchmark. ID obtained through `graders.register()` benchmark_id: type: string description: >- diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 3c5624e62..534aa6884 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -62,8 +62,8 @@ class Benchmarks(Protocol): """ Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids. - :param dataset_id: The ID of the dataset to be used to run the benchmark. - :param grader_ids: List of grader ids to use for this benchmark. + :param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()` + :param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()` :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated. :param metadata: (Optional) Metadata for this benchmark for additional descriptions. """