From a92756a4b7b53a9975f085ccdf3dd1a4db04a676 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 18 Mar 2025 22:09:35 -0700 Subject: [PATCH] result_data in evaluation response --- docs/_static/llama-stack-spec.html | 33 ++++++++++++++++++----- docs/_static/llama-stack-spec.yaml | 21 +++++++++------ llama_stack/apis/evaluation/evaluation.py | 21 +++------------ 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 0f223b51b..2a294ea11 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -8653,7 +8653,7 @@ "EvaluationResponse": { "type": "object", "properties": { - "generations": { + "result_data": { "type": "array", "items": { "type": "object", @@ -8680,20 +8680,39 @@ ] } }, - "description": "The generations in rows for the evaluation." + "description": "The result data containing generations and grades in each row." }, - "scores": { + "metrics": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] }, - "description": "The scores for the evaluation. Map of grader id to ScoringResult." + "description": "Map of metric name to aggregated value." } }, "additionalProperties": false, "required": [ - "generations", - "scores" + "result_data", + "metrics" ], "title": "EvaluationResponse", "description": "A response to an inline evaluation." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 7c4ea81b8..7508acd66 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6018,7 +6018,7 @@ components: EvaluationResponse: type: object properties: - generations: + result_data: type: array items: type: object @@ -6031,17 +6031,22 @@ components: - type: array - type: object description: >- - The generations in rows for the evaluation. - scores: + The result data containing generations and grades in each row. + metrics: type: object additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - The scores for the evaluation. Map of grader id to ScoringResult. + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: Map of metric name to aggregated value. additionalProperties: false required: - - generations - - scores + - result_data + - metrics title: EvaluationResponse description: A response to an inline evaluation. HealthInfo: diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 269004b26..8d6fdd201 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields): candidate: EvaluationCandidate -@json_schema_type -class ScoringResult(BaseModel): - """ - A scoring result for a single row. - - :param scores: The scoring result for each row. Each row is a map of grader column name to value. - :param metrics: Map of metric name to aggregated value. - """ - - scores: List[Dict[str, Any]] - metrics: Dict[str, Any] - - @json_schema_type class EvaluationResponse(BaseModel): """ A response to an inline evaluation. - :param generations: The generations in rows for the evaluation. - :param scores: The scores for the evaluation. Map of grader id to ScoringResult. + :param result_data: The result data containing generations and grades in each row. + :param metrics: Map of metric name to aggregated value. """ - generations: List[Dict[str, Any]] - scores: Dict[str, ScoringResult] + result_data: List[Dict[str, Any]] + metrics: Dict[str, Any] class Evaluation(Protocol):