forked from phoenix-oss/llama-stack-mirror
update EvaluationTask
This commit is contained in:
parent
5e817cd56a
commit
f107e3229b
3 changed files with 56 additions and 195 deletions
90
docs/_static/llama-stack-spec.html
vendored
90
docs/_static/llama-stack-spec.html
vendored
|
@ -2035,7 +2035,7 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/evaluation/grading": {
|
"/v1/evaluation/grade": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
"200": {
|
"200": {
|
||||||
|
@ -8523,32 +8523,14 @@
|
||||||
],
|
],
|
||||||
"title": "VectorDB"
|
"title": "VectorDB"
|
||||||
},
|
},
|
||||||
"BenchmarkEvaluationTask": {
|
"EvaluationTask": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "benchmark",
|
|
||||||
"default": "benchmark"
|
|
||||||
},
|
|
||||||
"benchmark_id": {
|
"benchmark_id": {
|
||||||
"type": "string"
|
"type": "string"
|
||||||
}
|
},
|
||||||
},
|
"dataset_id": {
|
||||||
"additionalProperties": false,
|
"type": "string"
|
||||||
"required": [
|
|
||||||
"type",
|
|
||||||
"benchmark_id"
|
|
||||||
],
|
|
||||||
"title": "BenchmarkEvaluationTask"
|
|
||||||
},
|
|
||||||
"DataEvaluationTask": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "data",
|
|
||||||
"default": "data"
|
|
||||||
},
|
},
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"$ref": "#/components/schemas/DataSource"
|
"$ref": "#/components/schemas/DataSource"
|
||||||
|
@ -8561,66 +8543,14 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"required": [
|
"title": "EvaluationTask"
|
||||||
"type",
|
|
||||||
"data_source",
|
|
||||||
"grader_ids"
|
|
||||||
],
|
|
||||||
"title": "DataEvaluationTask"
|
|
||||||
},
|
|
||||||
"DatasetEvaluationTask": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "dataset",
|
|
||||||
"default": "dataset"
|
|
||||||
},
|
|
||||||
"dataset_id": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"grader_ids": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"type",
|
|
||||||
"dataset_id",
|
|
||||||
"grader_ids"
|
|
||||||
],
|
|
||||||
"title": "DatasetEvaluationTask"
|
|
||||||
},
|
|
||||||
"EvaluationTask": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/BenchmarkEvaluationTask"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/DatasetEvaluationTask"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/DataEvaluationTask"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"discriminator": {
|
|
||||||
"propertyName": "type",
|
|
||||||
"mapping": {
|
|
||||||
"benchmark": "#/components/schemas/BenchmarkEvaluationTask",
|
|
||||||
"dataset": "#/components/schemas/DatasetEvaluationTask",
|
|
||||||
"data": "#/components/schemas/DataEvaluationTask"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"GradeRequest": {
|
"GradeRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
|
@ -8706,7 +8636,7 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
|
@ -10737,7 +10667,7 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
},
|
},
|
||||||
"candidate": {
|
"candidate": {
|
||||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||||
|
@ -10839,7 +10769,7 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"task": {
|
"task": {
|
||||||
"$ref": "#/components/schemas/EvaluationTask",
|
"$ref": "#/components/schemas/EvaluationTask",
|
||||||
"description": "The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
|
||||||
},
|
},
|
||||||
"candidate": {
|
"candidate": {
|
||||||
"$ref": "#/components/schemas/EvaluationCandidate",
|
"$ref": "#/components/schemas/EvaluationCandidate",
|
||||||
|
|
96
docs/_static/llama-stack-spec.yaml
vendored
96
docs/_static/llama-stack-spec.yaml
vendored
|
@ -1385,7 +1385,7 @@ paths:
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
/v1/evaluation/grading:
|
/v1/evaluation/grade:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
'200':
|
'200':
|
||||||
|
@ -5903,27 +5903,13 @@ components:
|
||||||
- embedding_model
|
- embedding_model
|
||||||
- embedding_dimension
|
- embedding_dimension
|
||||||
title: VectorDB
|
title: VectorDB
|
||||||
BenchmarkEvaluationTask:
|
EvaluationTask:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: benchmark
|
|
||||||
default: benchmark
|
|
||||||
benchmark_id:
|
benchmark_id:
|
||||||
type: string
|
type: string
|
||||||
additionalProperties: false
|
dataset_id:
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- benchmark_id
|
|
||||||
title: BenchmarkEvaluationTask
|
|
||||||
DataEvaluationTask:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
type: string
|
||||||
const: data
|
|
||||||
default: data
|
|
||||||
data_source:
|
data_source:
|
||||||
$ref: '#/components/schemas/DataSource'
|
$ref: '#/components/schemas/DataSource'
|
||||||
grader_ids:
|
grader_ids:
|
||||||
|
@ -5931,52 +5917,18 @@ components:
|
||||||
items:
|
items:
|
||||||
type: string
|
type: string
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
title: EvaluationTask
|
||||||
- type
|
|
||||||
- data_source
|
|
||||||
- grader_ids
|
|
||||||
title: DataEvaluationTask
|
|
||||||
DatasetEvaluationTask:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: dataset
|
|
||||||
default: dataset
|
|
||||||
dataset_id:
|
|
||||||
type: string
|
|
||||||
grader_ids:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- dataset_id
|
|
||||||
- grader_ids
|
|
||||||
title: DatasetEvaluationTask
|
|
||||||
EvaluationTask:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/BenchmarkEvaluationTask'
|
|
||||||
- $ref: '#/components/schemas/DatasetEvaluationTask'
|
|
||||||
- $ref: '#/components/schemas/DataEvaluationTask'
|
|
||||||
discriminator:
|
|
||||||
propertyName: type
|
|
||||||
mapping:
|
|
||||||
benchmark: '#/components/schemas/BenchmarkEvaluationTask'
|
|
||||||
dataset: '#/components/schemas/DatasetEvaluationTask'
|
|
||||||
data: '#/components/schemas/DataEvaluationTask'
|
|
||||||
GradeRequest:
|
GradeRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
The task to evaluate. To specify a task, one of the following must be
|
||||||
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
|
||||||
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
|
||||||
of grader_ids
|
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- task
|
- task
|
||||||
|
@ -6040,11 +5992,11 @@ components:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
The task to evaluate. To specify a task, one of the following must be
|
||||||
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
|
||||||
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
|
||||||
of grader_ids
|
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- task
|
- task
|
||||||
|
@ -7359,11 +7311,11 @@ components:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
The task to evaluate. To specify a task, one of the following must be
|
||||||
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
|
||||||
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
|
||||||
of grader_ids
|
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
candidate:
|
candidate:
|
||||||
$ref: '#/components/schemas/EvaluationCandidate'
|
$ref: '#/components/schemas/EvaluationCandidate'
|
||||||
description: The candidate to evaluate.
|
description: The candidate to evaluate.
|
||||||
|
@ -7429,11 +7381,11 @@ components:
|
||||||
task:
|
task:
|
||||||
$ref: '#/components/schemas/EvaluationTask'
|
$ref: '#/components/schemas/EvaluationTask'
|
||||||
description: >-
|
description: >-
|
||||||
The task to evaluate. One of: - BenchmarkEvaluationTask: Run evaluation
|
The task to evaluate. To specify a task, one of the following must be
|
||||||
task against a benchmark_id - DatasetEvaluationTask: Run evaluation task
|
provided: - `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
against a dataset_id and a list of grader_ids - DataEvaluationTask: Run
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id
|
||||||
evaluation task against a data source (e.g. rows, uri, etc.) and a list
|
and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation
|
||||||
of grader_ids
|
task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
candidate:
|
candidate:
|
||||||
$ref: '#/components/schemas/EvaluationCandidate'
|
$ref: '#/components/schemas/EvaluationCandidate'
|
||||||
description: The candidate to evaluate.
|
description: The candidate to evaluate.
|
||||||
|
|
|
@ -48,32 +48,11 @@ EvaluationCandidate = register_schema(
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class BenchmarkEvaluationTask(BaseModel):
|
class EvaluationTask(BaseModel):
|
||||||
type: Literal["benchmark"] = "benchmark"
|
benchmark_id: Optional[str] = None
|
||||||
benchmark_id: str
|
dataset_id: Optional[str] = None
|
||||||
|
data_source: Optional[DataSource] = None
|
||||||
|
grader_ids: Optional[List[str]] = None
|
||||||
@json_schema_type
|
|
||||||
class DatasetEvaluationTask(BaseModel):
|
|
||||||
type: Literal["dataset"] = "dataset"
|
|
||||||
dataset_id: str
|
|
||||||
grader_ids: List[str]
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
|
||||||
class DataEvaluationTask(BaseModel):
|
|
||||||
type: Literal["data"] = "data"
|
|
||||||
data_source: DataSource
|
|
||||||
grader_ids: List[str]
|
|
||||||
|
|
||||||
|
|
||||||
EvaluationTask = register_schema(
|
|
||||||
Annotated[
|
|
||||||
Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
|
|
||||||
Field(discriminator="type"),
|
|
||||||
],
|
|
||||||
name="EvaluationTask",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
|
@ -121,10 +100,10 @@ class Evaluation(Protocol):
|
||||||
"""
|
"""
|
||||||
Schedule a full evaluation job, by generating results using candidate and grading them.
|
Schedule a full evaluation job, by generating results using candidate and grading them.
|
||||||
|
|
||||||
:param task: The task to evaluate. One of:
|
:param task: The task to evaluate. To specify a task, one of the following must be provided:
|
||||||
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
|
- `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
|
||||||
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
:param candidate: The candidate to evaluate.
|
:param candidate: The candidate to evaluate.
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
@ -139,23 +118,23 @@ class Evaluation(Protocol):
|
||||||
Run an evaluation synchronously, i.e., without scheduling a job.
|
Run an evaluation synchronously, i.e., without scheduling a job.
|
||||||
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
|
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
|
||||||
|
|
||||||
:param task: The task to evaluate. One of:
|
:param task: The task to evaluate. To specify a task, one of the following must be provided:
|
||||||
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
|
- `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
|
||||||
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
:param candidate: The candidate to evaluate.
|
:param candidate: The candidate to evaluate.
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
@webmethod(route="/evaluation/grading", method="POST")
|
@webmethod(route="/evaluation/grade", method="POST")
|
||||||
async def grade(self, task: EvaluationTask) -> EvaluationJob:
|
async def grade(self, task: EvaluationTask) -> EvaluationJob:
|
||||||
"""
|
"""
|
||||||
Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
|
Schedule a grading job, by grading generated (model or agent) results. The generated results are expected to be in the dataset.
|
||||||
|
|
||||||
:param task: The task to evaluate. One of:
|
:param task: The task to evaluate. To specify a task, one of the following must be provided:
|
||||||
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
|
- `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
|
||||||
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
|
|
||||||
:return: The evaluation job containing grader scores.
|
:return: The evaluation job containing grader scores.
|
||||||
"""
|
"""
|
||||||
|
@ -167,10 +146,10 @@ class Evaluation(Protocol):
|
||||||
Run grading synchronously on generated results, i.e., without scheduling a job.
|
Run grading synchronously on generated results, i.e., without scheduling a job.
|
||||||
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
|
You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
|
||||||
|
|
||||||
:param task: The task to evaluate. One of:
|
:param task: The task to evaluate. To specify a task, one of the following must be provided:
|
||||||
- BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
|
- `benchmark_id`: Run evaluation task against a benchmark_id
|
||||||
- DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
|
- `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
|
||||||
- DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||||
|
|
||||||
:return: The evaluation job containing grader scores. "generations" is not populated in the response.
|
:return: The evaluation job containing grader scores. "generations" is not populated in the response.
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue