diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 16c21cbb1..e9429a0c0 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2035,7 +2035,7 @@ ] } }, - "/v1/evaluation/grading": { + "/v1/evaluation/grade": { "post": { "responses": { "200": { @@ -8523,32 +8523,14 @@ ], "title": "VectorDB" }, - "BenchmarkEvaluationTask": { + "EvaluationTask": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, "benchmark_id": { "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "type", - "benchmark_id" - ], - "title": "BenchmarkEvaluationTask" - }, - "DataEvaluationTask": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "data", - "default": "data" + }, + "dataset_id": { + "type": "string" }, "data_source": { "$ref": "#/components/schemas/DataSource" @@ -8561,66 +8543,14 @@ } }, "additionalProperties": false, - "required": [ - "type", - "data_source", - "grader_ids" - ], - "title": "DataEvaluationTask" - }, - "DatasetEvaluationTask": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "dataset", - "default": "dataset" - }, - "dataset_id": { - "type": "string" - }, - "grader_ids": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "dataset_id", - "grader_ids" - ], - "title": "DatasetEvaluationTask" - }, - "EvaluationTask": { - "oneOf": [ - { - "$ref": "#/components/schemas/BenchmarkEvaluationTask" - }, - { - "$ref": "#/components/schemas/DatasetEvaluationTask" - }, - { - "$ref": "#/components/schemas/DataEvaluationTask" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvaluationTask", - "dataset": "#/components/schemas/DatasetEvaluationTask", - "data": "#/components/schemas/DataEvaluationTask" - } - } + "title": "EvaluationTask" }, "GradeRequest": { "type": "object", "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" } }, "additionalProperties": false, @@ -8706,7 +8636,7 @@ "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" } }, "additionalProperties": false, @@ -10737,7 +10667,7 @@ "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" }, "candidate": { "$ref": "#/components/schemas/EvaluationCandidate", @@ -10839,7 +10769,7 @@ "properties": { "task": { "$ref": "#/components/schemas/EvaluationTask", - "description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" }, "candidate": { "$ref": "#/components/schemas/EvaluationCandidate", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1711c93ec..8de434ba7 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1385,7 +1385,7 @@ paths: required: true schema: type: string - /v1/evaluation/grading: + /v1/evaluation/grade: post: responses: '200': @@ -5903,27 +5903,13 @@ components: - embedding_model - embedding_dimension title: VectorDB - BenchmarkEvaluationTask: + EvaluationTask: type: object properties: - type: - type: string - const: benchmark - default: benchmark benchmark_id: type: string - additionalProperties: false - required: - - type - - benchmark_id - title: BenchmarkEvaluationTask - DataEvaluationTask: - type: object - properties: - type: + dataset_id: type: string - const: data - default: data data_source: $ref: '#/components/schemas/DataSource' grader_ids: @@ -5931,52 +5917,18 @@ components: items: type: string additionalProperties: false - required: - - type - - data_source - - grader_ids - title: DataEvaluationTask - DatasetEvaluationTask: - type: object - properties: - type: - type: string - const: dataset - default: dataset - dataset_id: - type: string - grader_ids: - type: array - items: - type: string - additionalProperties: false - required: - - type - - dataset_id - - grader_ids - title: DatasetEvaluationTask - EvaluationTask: - oneOf: - - $ref: '#/components/schemas/BenchmarkEvaluationTask' - - $ref: '#/components/schemas/DatasetEvaluationTask' - - $ref: '#/components/schemas/DataEvaluationTask' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkEvaluationTask' - dataset: '#/components/schemas/DatasetEvaluationTask' - data: '#/components/schemas/DataEvaluationTask' + title: EvaluationTask GradeRequest: type: object properties: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation - task against a benchmark_id - DatasetEvaluationTask: Run evaluation task - against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - evaluation task against a data source (e.g. rows, uri, etc.) and a list - of grader_ids + The task to evaluate. To specify a task, one of the following must be + provided: - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id + and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation + task against a data source (e.g. rows, uri, etc.) and a list of grader_ids additionalProperties: false required: - task @@ -6040,11 +5992,11 @@ components: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation - task against a benchmark_id - DatasetEvaluationTask: Run evaluation task - against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - evaluation task against a data source (e.g. rows, uri, etc.) and a list - of grader_ids + The task to evaluate. To specify a task, one of the following must be + provided: - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id + and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation + task against a data source (e.g. rows, uri, etc.) and a list of grader_ids additionalProperties: false required: - task @@ -7359,11 +7311,11 @@ components: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation - task against a benchmark_id - DatasetEvaluationTask: Run evaluation task - against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - evaluation task against a data source (e.g. rows, uri, etc.) and a list - of grader_ids + The task to evaluate. To specify a task, one of the following must be + provided: - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id + and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation + task against a data source (e.g. rows, uri, etc.) and a list of grader_ids candidate: $ref: '#/components/schemas/EvaluationCandidate' description: The candidate to evaluate. @@ -7429,11 +7381,11 @@ components: task: $ref: '#/components/schemas/EvaluationTask' description: >- - The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation - task against a benchmark_id - DatasetEvaluationTask: Run evaluation task - against a dataset_id and a list of grader_ids - DataEvaluationTask: Run - evaluation task against a data source (e.g. rows, uri, etc.) and a list - of grader_ids + The task to evaluate. To specify a task, one of the following must be + provided: - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id + and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation + task against a data source (e.g. rows, uri, etc.) and a list of grader_ids candidate: $ref: '#/components/schemas/EvaluationCandidate' description: The candidate to evaluate. diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index 6f6a27041..1911b567b 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -48,32 +48,11 @@ EvaluationCandidate = register_schema( @json_schema_type -class BenchmarkEvaluationTask(BaseModel): - type: Literal["benchmark"] = "benchmark" - benchmark_id: str - - -@json_schema_type -class DatasetEvaluationTask(BaseModel): - type: Literal["dataset"] = "dataset" - dataset_id: str - grader_ids: List[str] - - -@json_schema_type -class DataEvaluationTask(BaseModel): - type: Literal["data"] = "data" - data_source: DataSource - grader_ids: List[str] - - -EvaluationTask = register_schema( - Annotated[ - Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask], - Field(discriminator="type"), - ], - name="EvaluationTask", -) +class EvaluationTask(BaseModel): + benchmark_id: Optional[str] = None + dataset_id: Optional[str] = None + data_source: Optional[DataSource] = None + grader_ids: Optional[List[str]] = None @json_schema_type @@ -121,10 +100,10 @@ class Evaluation(Protocol): """ Schedule a full evaluation job, by generating results using candidate and grading them. - :param task: The task to evaluate. One of: - - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param task: The task to evaluate. To specify a task, one of the following must be provided: + - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids + - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids :param candidate: The candidate to evaluate. """ ... @@ -139,23 +118,23 @@ class Evaluation(Protocol): Run an evaluation synchronously, i.e., without scheduling a job". You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted. - :param task: The task to evaluate. One of: - - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param task: The task to evaluate. To specify a task, one of the following must be provided: + - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids + - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids :param candidate: The candidate to evaluate. """ ... - @webmethod(route="/evaluation/grading", method="POST") + @webmethod(route="/evaluation/grade", method="POST") async def grade(self, task: EvaluationTask) -> EvaluationJob: """ Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset. - :param task: The task to evaluate. One of: - - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param task: The task to evaluate. To specify a task, one of the following must be provided: + - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids + - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids :return: The evaluation job containing grader scores. """ @@ -167,10 +146,10 @@ class Evaluation(Protocol): Run grading synchronously on generated results, i.e., without scheduling a job. You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted. - :param task: The task to evaluate. One of: - - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param task: The task to evaluate. To specify a task, one of the following must be provided: + - `benchmark_id`: Run evaluation task against a benchmark_id + - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids + - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids :return: The evaluation job containing grader scores. "generations" is not populated in the response. """