diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 57f37255b..16c21cbb1 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2035,6 +2035,49 @@
]
}
},
+ "/v1/evaluation/grading": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The evaluation job containing grader scores.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluationJob"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Evaluation"
+ ],
+ "description": "Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/GradeRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/evaluation/grade_sync": {
"post": {
"responses": {
@@ -2078,49 +2121,6 @@
}
}
},
- "/v1/evaluation/grading": {
- "post": {
- "responses": {
- "200": {
- "description": "The evaluation job containing grader scores.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluationJob"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Evaluation"
- ],
- "description": "Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/GradingRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/health": {
"get": {
"responses": {
@@ -8615,7 +8615,7 @@
}
}
},
- "GradeSyncRequest": {
+ "GradeRequest": {
"type": "object",
"properties": {
"task": {
@@ -8627,69 +8627,7 @@
"required": [
"task"
],
- "title": "GradeSyncRequest"
- },
- "EvaluationResponse": {
- "type": "object",
- "properties": {
- "generations": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The generations in rows for the evaluation."
- },
- "scores": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- },
- "description": "The scores for the evaluation. Map of grader id to ScoringResult."
- }
- },
- "additionalProperties": false,
- "required": [
- "generations",
- "scores"
- ],
- "title": "EvaluationResponse",
- "description": "A response to an inline evaluation."
- },
- "GradingRequest": {
- "type": "object",
- "properties": {
- "task": {
- "$ref": "#/components/schemas/EvaluationTask",
- "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
- }
- },
- "additionalProperties": false,
- "required": [
- "task"
- ],
- "title": "GradingRequest"
+ "title": "GradeRequest"
},
"EvaluationCandidate": {
"oneOf": [
@@ -8763,6 +8701,68 @@
],
"title": "EvaluationJob"
},
+ "GradeSyncRequest": {
+ "type": "object",
+ "properties": {
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask",
+ "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task"
+ ],
+ "title": "GradeSyncRequest"
+ },
+ "EvaluationResponse": {
+ "type": "object",
+ "properties": {
+ "generations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The generations in rows for the evaluation."
+ },
+ "scores": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ },
+ "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "generations",
+ "scores"
+ ],
+ "title": "EvaluationResponse",
+ "description": "A response to an inline evaluation."
+ },
"HealthInfo": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 60a8700f7..1711c93ec 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1385,6 +1385,38 @@ paths:
required: true
schema:
type: string
+ /v1/evaluation/grading:
+ post:
+ responses:
+ '200':
+ description: >-
+ The evaluation job containing grader scores.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluationJob'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Evaluation
+ description: >-
+ Schedule a grading job, by grading generated (model or agent) results. The
+ generated results are expected to be in the dataset.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/GradeRequest'
+ required: true
/v1/evaluation/grade_sync:
post:
responses:
@@ -1420,38 +1452,6 @@ paths:
schema:
$ref: '#/components/schemas/GradeSyncRequest'
required: true
- /v1/evaluation/grading:
- post:
- responses:
- '200':
- description: >-
- The evaluation job containing grader scores.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluationJob'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Evaluation
- description: >-
- Schedule a grading job, by grading generated results. The generated results
- are expected to be in the dataset.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/GradingRequest'
- required: true
/v1/health:
get:
responses:
@@ -5966,7 +5966,7 @@ components:
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
dataset: '#/components/schemas/DatasetEvaluationTask'
data: '#/components/schemas/DataEvaluationTask'
- GradeSyncRequest:
+ GradeRequest:
type: object
properties:
task:
@@ -5980,51 +5980,7 @@ components:
additionalProperties: false
required:
- task
- title: GradeSyncRequest
- EvaluationResponse:
- type: object
- properties:
- generations:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The generations in rows for the evaluation.
- scores:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: >-
- The scores for the evaluation. Map of grader id to ScoringResult.
- additionalProperties: false
- required:
- - generations
- - scores
- title: EvaluationResponse
- description: A response to an inline evaluation.
- GradingRequest:
- type: object
- properties:
- task:
- $ref: '#/components/schemas/EvaluationTask'
- description: >-
- The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
- task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
- against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
- evaluation task against a data source (e.g. rows, uri, etc.) and a list
- of grader_ids
- additionalProperties: false
- required:
- - task
- title: GradingRequest
+ title: GradeRequest
EvaluationCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
@@ -6078,6 +6034,50 @@ components:
- task
- candidate
title: EvaluationJob
+ GradeSyncRequest:
+ type: object
+ properties:
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ description: >-
+ The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
+ task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
+ against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
+ evaluation task against a data source (e.g. rows, uri, etc.) and a list
+ of grader_ids
+ additionalProperties: false
+ required:
+ - task
+ title: GradeSyncRequest
+ EvaluationResponse:
+ type: object
+ properties:
+ generations:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The generations in rows for the evaluation.
+ scores:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ description: >-
+ The scores for the evaluation. Map of grader id to ScoringResult.
+ additionalProperties: false
+ required:
+ - generations
+ - scores
+ title: EvaluationResponse
+ description: A response to an inline evaluation.
HealthInfo:
type: object
properties:
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 8425000da..6f6a27041 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -148,9 +148,9 @@ class Evaluation(Protocol):
...
@webmethod(route="/evaluation/grading", method="POST")
- async def grading(self, task: EvaluationTask) -> EvaluationJob:
+ async def grade(self, task: EvaluationTask) -> EvaluationJob:
"""
- Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.
+ Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
:param task: The task to evaluate. One of:
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id