diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 16c21cbb1..e9429a0c0 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2035,7 +2035,7 @@
]
}
},
- "/v1/evaluation/grading": {
+ "/v1/evaluation/grade": {
"post": {
"responses": {
"200": {
@@ -8523,32 +8523,14 @@
],
"title": "VectorDB"
},
- "BenchmarkEvaluationTask": {
+ "EvaluationTask": {
"type": "object",
"properties": {
- "type": {
- "type": "string",
- "const": "benchmark",
- "default": "benchmark"
- },
"benchmark_id": {
"type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "benchmark_id"
- ],
- "title": "BenchmarkEvaluationTask"
- },
- "DataEvaluationTask": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "data",
- "default": "data"
+ },
+ "dataset_id": {
+ "type": "string"
},
"data_source": {
"$ref": "#/components/schemas/DataSource"
@@ -8561,66 +8543,14 @@
}
},
"additionalProperties": false,
- "required": [
- "type",
- "data_source",
- "grader_ids"
- ],
- "title": "DataEvaluationTask"
- },
- "DatasetEvaluationTask": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "dataset",
- "default": "dataset"
- },
- "dataset_id": {
- "type": "string"
- },
- "grader_ids": {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "dataset_id",
- "grader_ids"
- ],
- "title": "DatasetEvaluationTask"
- },
- "EvaluationTask": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/BenchmarkEvaluationTask"
- },
- {
- "$ref": "#/components/schemas/DatasetEvaluationTask"
- },
- {
- "$ref": "#/components/schemas/DataEvaluationTask"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "benchmark": "#/components/schemas/BenchmarkEvaluationTask",
- "dataset": "#/components/schemas/DatasetEvaluationTask",
- "data": "#/components/schemas/DataEvaluationTask"
- }
- }
+ "title": "EvaluationTask"
},
"GradeRequest": {
"type": "object",
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
- "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}
},
"additionalProperties": false,
@@ -8706,7 +8636,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
- "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
}
},
"additionalProperties": false,
@@ -10737,7 +10667,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
- "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
},
"candidate": {
"$ref": "#/components/schemas/EvaluationCandidate",
@@ -10839,7 +10769,7 @@
"properties": {
"task": {
"$ref": "#/components/schemas/EvaluationTask",
- "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
},
"candidate": {
"$ref": "#/components/schemas/EvaluationCandidate",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 1711c93ec..8de434ba7 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1385,7 +1385,7 @@ paths:
required: true
schema:
type: string
- /v1/evaluation/grading:
+ /v1/evaluation/grade:
post:
responses:
'200':
@@ -5903,27 +5903,13 @@ components:
- embedding_model
- embedding_dimension
title: VectorDB
- BenchmarkEvaluationTask:
+ EvaluationTask:
type: object
properties:
- type:
- type: string
- const: benchmark
- default: benchmark
benchmark_id:
type: string
- additionalProperties: false
- required:
- - type
- - benchmark_id
- title: BenchmarkEvaluationTask
- DataEvaluationTask:
- type: object
- properties:
- type:
+ dataset_id:
type: string
- const: data
- default: data
data_source:
$ref: '#/components/schemas/DataSource'
grader_ids:
@@ -5931,52 +5917,18 @@ components:
items:
type: string
additionalProperties: false
- required:
- - type
- - data_source
- - grader_ids
- title: DataEvaluationTask
- DatasetEvaluationTask:
- type: object
- properties:
- type:
- type: string
- const: dataset
- default: dataset
- dataset_id:
- type: string
- grader_ids:
- type: array
- items:
- type: string
- additionalProperties: false
- required:
- - type
- - dataset_id
- - grader_ids
- title: DatasetEvaluationTask
- EvaluationTask:
- oneOf:
- - $ref: '#/components/schemas/BenchmarkEvaluationTask'
- - $ref: '#/components/schemas/DatasetEvaluationTask'
- - $ref: '#/components/schemas/DataEvaluationTask'
- discriminator:
- propertyName: type
- mapping:
- benchmark: '#/components/schemas/BenchmarkEvaluationTask'
- dataset: '#/components/schemas/DatasetEvaluationTask'
- data: '#/components/schemas/DataEvaluationTask'
+ title: EvaluationTask
GradeRequest:
type: object
properties:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
- The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
- task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
- against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
- evaluation task against a data source (e.g. rows, uri, etc.) and a list
- of grader_ids
+ The task to evaluate. To specify a task, one of the following must be
+ provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+ and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+ task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false
required:
- task
@@ -6040,11 +5992,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
- The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
- task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
- against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
- evaluation task against a data source (e.g. rows, uri, etc.) and a list
- of grader_ids
+ The task to evaluate. To specify a task, one of the following must be
+ provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+ and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+ task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
additionalProperties: false
required:
- task
@@ -7359,11 +7311,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
- The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
- task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
- against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
- evaluation task against a data source (e.g. rows, uri, etc.) and a list
- of grader_ids
+ The task to evaluate. To specify a task, one of the following must be
+ provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+ and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+ task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate:
$ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate.
@@ -7429,11 +7381,11 @@ components:
task:
$ref: '#/components/schemas/EvaluationTask'
description: >-
- The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
- task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
- against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
- evaluation task against a data source (e.g. rows, uri, etc.) and a list
- of grader_ids
+ The task to evaluate. To specify a task, one of the following must be
+ provided: - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
+ and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
+ task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
candidate:
$ref: '#/components/schemas/EvaluationCandidate'
description: The candidate to evaluate.
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index 6f6a27041..1911b567b 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(
@json_schema_type
-class BenchmarkEvaluationTask(BaseModel):
- type: Literal["benchmark"] = "benchmark"
- benchmark_id: str
-
-
-@json_schema_type
-class DatasetEvaluationTask(BaseModel):
- type: Literal["dataset"] = "dataset"
- dataset_id: str
- grader_ids: List[str]
-
-
-@json_schema_type
-class DataEvaluationTask(BaseModel):
- type: Literal["data"] = "data"
- data_source: DataSource
- grader_ids: List[str]
-
-
-EvaluationTask = register_schema(
- Annotated[
- Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
- Field(discriminator="type"),
- ],
- name="EvaluationTask",
-)
+class EvaluationTask(BaseModel):
+ benchmark_id: Optional[str] = None
+ dataset_id: Optional[str] = None
+ data_source: Optional[DataSource] = None
+ grader_ids: Optional[List[str]] = None
@json_schema_type
@@ -121,10 +100,10 @@ class Evaluation(Protocol):
"""
Schedule a full evaluation job, by generating results using candidate and grading them.
- :param task: The task to evaluate. One of:
- - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param task: The task to evaluate. To specify a task, one of the following must be provided:
+ - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+ - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate.
"""
...
@@ -139,23 +118,23 @@ class Evaluation(Protocol):
- Run an evaluation synchronously, i.e., without scheduling a job".
+ Run an evaluation synchronously, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
- :param task: The task to evaluate. One of:
- - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param task: The task to evaluate. To specify a task, one of the following must be provided:
+ - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+ - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:param candidate: The candidate to evaluate.
"""
...
- @webmethod(route="/evaluation/grading", method="POST")
+ @webmethod(route="/evaluation/grade", method="POST")
async def grade(self, task: EvaluationTask) -> EvaluationJob:
"""
Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
- :param task: The task to evaluate. One of:
- - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param task: The task to evaluate. To specify a task, one of the following must be provided:
+ - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+ - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores.
"""
@@ -167,10 +146,10 @@ class Evaluation(Protocol):
Run grading synchronously on generated results, i.e., without scheduling a job.
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
- :param task: The task to evaluate. One of:
- - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
- - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
- - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param task: The task to evaluate. To specify a task, one of the following must be provided:
+ - `benchmark_id`: Run evaluation task against a benchmark_id
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
+ - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
:return: The evaluation job containing grader scores. "generations" is not populated in the response.
"""