diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index fd7d767ae..af1f97ca0 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -7617,7 +7617,7 @@ "EvaluationResponse": { "type": "object", "properties": { - "generations": { + "result_rows": { "type": "array", "items": { "type": "object", @@ -7644,20 +7644,39 @@ ] } }, - "description": "The generations in rows for the evaluation." + "description": "The result data containing inputs, generations and grades in each row." }, - "scores": { + "grades": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] }, - "description": "The scores for the evaluation. Map of grader id to ScoringResult." + "description": "Map of grader id to aggregated value." } }, "additionalProperties": false, "required": [ - "generations", - "scores" + "result_rows", + "grades" ], "title": "EvaluationResponse", "description": "A response to an inline evaluation." @@ -9313,14 +9332,14 @@ "properties": { "dataset_id": { "type": "string", - "description": "The ID of the dataset to be used to run the benchmark." + "description": "The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`" }, "grader_ids": { "type": "array", "items": { "type": "string" }, - "description": "List of grader ids to use for this benchmark." + "description": "List of grader ids to use for this benchmark. ID obtained through `graders.register()`" }, "benchmark_id": { "type": "string", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 402106208..5d5b323be 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -5328,7 +5328,7 @@ components: EvaluationResponse: type: object properties: - generations: + result_rows: type: array items: type: object @@ -5341,17 +5341,22 @@ components: - type: array - type: object description: >- - The generations in rows for the evaluation. - scores: + The result data containing inputs, generations and grades in each row. + grades: type: object additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - The scores for the evaluation. Map of grader id to ScoringResult. + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: Map of grader id to aggregated value. additionalProperties: false required: - - generations - - scores + - result_rows + - grades title: EvaluationResponse description: A response to an inline evaluation. ScoringResult: @@ -6404,13 +6409,14 @@ components: dataset_id: type: string description: >- - The ID of the dataset to be used to run the benchmark. + The ID of the dataset to be used to run the benchmark. ID obtained through + `datasets.register()` grader_ids: type: array items: type: string description: >- - List of grader ids to use for this benchmark. + List of grader ids to use for this benchmark. ID obtained through `graders.register()` benchmark_id: type: string description: >- diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 3c5624e62..534aa6884 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -62,8 +62,8 @@ class Benchmarks(Protocol): """ Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids. - :param dataset_id: The ID of the dataset to be used to run the benchmark. - :param grader_ids: List of grader ids to use for this benchmark. + :param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()` + :param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()` :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated. :param metadata: (Optional) Metadata for this benchmark for additional descriptions. """ diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 269004b26..bde27e0be 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields): candidate: EvaluationCandidate -@json_schema_type -class ScoringResult(BaseModel): - """ - A scoring result for a single row. - - :param scores: The scoring result for each row. Each row is a map of grader column name to value. - :param metrics: Map of metric name to aggregated value. - """ - - scores: List[Dict[str, Any]] - metrics: Dict[str, Any] - - @json_schema_type class EvaluationResponse(BaseModel): """ A response to an inline evaluation. - :param generations: The generations in rows for the evaluation. - :param scores: The scores for the evaluation. Map of grader id to ScoringResult. + :param result_rows: The result data containing inputs, generations and grades in each row. + :param grades: Map of grader id to aggregated value. """ - generations: List[Dict[str, Any]] - scores: Dict[str, ScoringResult] + result_rows: List[Dict[str, Any]] + grades: Dict[str, Any] class Evaluation(Protocol):