mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-06 18:50:44 +00:00
Merge branch 'eval_api_final' into delete_eval_scoring_scoring_fn
This commit is contained in:
commit
e23531c9d0
4 changed files with 50 additions and 38 deletions
37
docs/_static/llama-stack-spec.html
vendored
37
docs/_static/llama-stack-spec.html
vendored
|
@ -7617,7 +7617,7 @@
|
||||||
"EvaluationResponse": {
|
"EvaluationResponse": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"generations": {
|
"result_rows": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
|
@ -7644,20 +7644,39 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"description": "The generations in rows for the evaluation."
|
"description": "The result data containing inputs, generations and grades in each row."
|
||||||
},
|
},
|
||||||
"scores": {
|
"grades": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"additionalProperties": {
|
"additionalProperties": {
|
||||||
"$ref": "#/components/schemas/ScoringResult"
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
},
|
},
|
||||||
"description": "The scores for the evaluation. Map of grader id to ScoringResult."
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "Map of grader id to aggregated value."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"required": [
|
"required": [
|
||||||
"generations",
|
"result_rows",
|
||||||
"scores"
|
"grades"
|
||||||
],
|
],
|
||||||
"title": "EvaluationResponse",
|
"title": "EvaluationResponse",
|
||||||
"description": "A response to an inline evaluation."
|
"description": "A response to an inline evaluation."
|
||||||
|
@ -9313,14 +9332,14 @@
|
||||||
"properties": {
|
"properties": {
|
||||||
"dataset_id": {
|
"dataset_id": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "The ID of the dataset to be used to run the benchmark."
|
"description": "The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`"
|
||||||
},
|
},
|
||||||
"grader_ids": {
|
"grader_ids": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
},
|
||||||
"description": "List of grader ids to use for this benchmark."
|
"description": "List of grader ids to use for this benchmark. ID obtained through `graders.register()`"
|
||||||
},
|
},
|
||||||
"benchmark_id": {
|
"benchmark_id": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
|
|
26
docs/_static/llama-stack-spec.yaml
vendored
26
docs/_static/llama-stack-spec.yaml
vendored
|
@ -5328,7 +5328,7 @@ components:
|
||||||
EvaluationResponse:
|
EvaluationResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
generations:
|
result_rows:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: object
|
type: object
|
||||||
|
@ -5341,17 +5341,22 @@ components:
|
||||||
- type: array
|
- type: array
|
||||||
- type: object
|
- type: object
|
||||||
description: >-
|
description: >-
|
||||||
The generations in rows for the evaluation.
|
The result data containing inputs, generations and grades in each row.
|
||||||
scores:
|
grades:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties:
|
additionalProperties:
|
||||||
$ref: '#/components/schemas/ScoringResult'
|
oneOf:
|
||||||
description: >-
|
- type: 'null'
|
||||||
The scores for the evaluation. Map of grader id to ScoringResult.
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: Map of grader id to aggregated value.
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- generations
|
- result_rows
|
||||||
- scores
|
- grades
|
||||||
title: EvaluationResponse
|
title: EvaluationResponse
|
||||||
description: A response to an inline evaluation.
|
description: A response to an inline evaluation.
|
||||||
ScoringResult:
|
ScoringResult:
|
||||||
|
@ -6404,13 +6409,14 @@ components:
|
||||||
dataset_id:
|
dataset_id:
|
||||||
type: string
|
type: string
|
||||||
description: >-
|
description: >-
|
||||||
The ID of the dataset to be used to run the benchmark.
|
The ID of the dataset to be used to run the benchmark. ID obtained through
|
||||||
|
`datasets.register()`
|
||||||
grader_ids:
|
grader_ids:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: string
|
type: string
|
||||||
description: >-
|
description: >-
|
||||||
List of grader ids to use for this benchmark.
|
List of grader ids to use for this benchmark. ID obtained through `graders.register()`
|
||||||
benchmark_id:
|
benchmark_id:
|
||||||
type: string
|
type: string
|
||||||
description: >-
|
description: >-
|
||||||
|
|
|
@ -62,8 +62,8 @@ class Benchmarks(Protocol):
|
||||||
"""
|
"""
|
||||||
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
|
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
|
||||||
|
|
||||||
:param dataset_id: The ID of the dataset to be used to run the benchmark.
|
:param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`
|
||||||
:param grader_ids: List of grader ids to use for this benchmark.
|
:param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()`
|
||||||
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
|
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
|
||||||
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
|
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
|
||||||
candidate: EvaluationCandidate
|
candidate: EvaluationCandidate
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
|
||||||
class ScoringResult(BaseModel):
|
|
||||||
"""
|
|
||||||
A scoring result for a single row.
|
|
||||||
|
|
||||||
:param scores: The scoring result for each row. Each row is a map of grader column name to value.
|
|
||||||
:param metrics: Map of metric name to aggregated value.
|
|
||||||
"""
|
|
||||||
|
|
||||||
scores: List[Dict[str, Any]]
|
|
||||||
metrics: Dict[str, Any]
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
@json_schema_type
|
||||||
class EvaluationResponse(BaseModel):
|
class EvaluationResponse(BaseModel):
|
||||||
"""
|
"""
|
||||||
A response to an inline evaluation.
|
A response to an inline evaluation.
|
||||||
|
|
||||||
:param generations: The generations in rows for the evaluation.
|
:param result_rows: The result data containing inputs, generations and grades in each row.
|
||||||
:param scores: The scores for the evaluation. Map of grader id to ScoringResult.
|
:param grades: Map of grader id to aggregated value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
generations: List[Dict[str, Any]]
|
result_rows: List[Dict[str, Any]]
|
||||||
scores: Dict[str, ScoringResult]
|
grades: Dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
class Evaluation(Protocol):
|
class Evaluation(Protocol):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue