mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-12 04:50:39 +00:00
eval/scoring/datasetio doc
This commit is contained in:
parent
c30cba9db2
commit
83d78cca9c
5 changed files with 262 additions and 60 deletions
123
docs/_static/llama-stack-spec.html
vendored
123
docs/_static/llama-stack-spec.html
vendored
|
@ -69,11 +69,12 @@
|
|||
"tags": [
|
||||
"DatasetIO"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Get a paginated list of rows from a dataset.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "dataset_id",
|
||||
"in": "query",
|
||||
"description": "The ID of the dataset to get the rows from.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -82,6 +83,7 @@
|
|||
{
|
||||
"name": "rows_in_page",
|
||||
"in": "query",
|
||||
"description": "The number of rows to get per page.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "integer"
|
||||
|
@ -90,6 +92,7 @@
|
|||
{
|
||||
"name": "page_token",
|
||||
"in": "query",
|
||||
"description": "The token to get the next page of rows.",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -98,6 +101,7 @@
|
|||
{
|
||||
"name": "filter_condition",
|
||||
"in": "query",
|
||||
"description": "(Optional) A condition to filter the rows by.",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -896,7 +900,7 @@
|
|||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "EvaluateResponse object containing generations and scores",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -921,11 +925,12 @@
|
|||
"tags": [
|
||||
"Eval"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Evaluate a list of rows on a benchmark.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "benchmark_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the benchmark to run the evaluation on.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2121,7 +2126,7 @@
|
|||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "The status of the evaluation job.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -2153,11 +2158,12 @@
|
|||
"tags": [
|
||||
"Eval"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Get the status of a job.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "benchmark_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the benchmark to run the evaluation on.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2166,6 +2172,7 @@
|
|||
{
|
||||
"name": "job_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the job to get the status of.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2194,11 +2201,12 @@
|
|||
"tags": [
|
||||
"Eval"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Cancel a job.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "benchmark_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the benchmark to run the evaluation on.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2207,6 +2215,7 @@
|
|||
{
|
||||
"name": "job_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the job to cancel.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2219,7 +2228,7 @@
|
|||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "The result of the job.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -2244,11 +2253,12 @@
|
|||
"tags": [
|
||||
"Eval"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Get the result of a job.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "benchmark_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the benchmark to run the evaluation on.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -2257,6 +2267,7 @@
|
|||
{
|
||||
"name": "job_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the job to get the result of.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -3287,7 +3298,7 @@
|
|||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "The job that was created to run the evaluation.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -3312,11 +3323,12 @@
|
|||
"tags": [
|
||||
"Eval"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Run an evaluation on a benchmark.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "benchmark_id",
|
||||
"in": "path",
|
||||
"description": "The ID of the benchmark to run the evaluation on.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
|
@ -3418,7 +3430,7 @@
|
|||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "ScoreResponse object containing rows and aggregated results",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -3443,7 +3455,7 @@
|
|||
"tags": [
|
||||
"Scoring"
|
||||
],
|
||||
"description": "",
|
||||
"description": "Score a list of rows.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
|
@ -6187,7 +6199,8 @@
|
|||
"default": "agent"
|
||||
},
|
||||
"config": {
|
||||
"$ref": "#/components/schemas/AgentConfig"
|
||||
"$ref": "#/components/schemas/AgentConfig",
|
||||
"description": "The configuration for the agent candidate."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6195,7 +6208,8 @@
|
|||
"type",
|
||||
"config"
|
||||
],
|
||||
"title": "AgentCandidate"
|
||||
"title": "AgentCandidate",
|
||||
"description": "An agent candidate for evaluation."
|
||||
},
|
||||
"AggregationFunctionType": {
|
||||
"type": "string",
|
||||
|
@ -6232,16 +6246,19 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"eval_candidate": {
|
||||
"$ref": "#/components/schemas/EvalCandidate"
|
||||
"$ref": "#/components/schemas/EvalCandidate",
|
||||
"description": "The candidate to evaluate."
|
||||
},
|
||||
"scoring_params": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringFnParams"
|
||||
}
|
||||
},
|
||||
"description": "Map between scoring function id and parameters for each scoring function you want to run"
|
||||
},
|
||||
"num_examples": {
|
||||
"type": "integer"
|
||||
"type": "integer",
|
||||
"description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6249,7 +6266,8 @@
|
|||
"eval_candidate",
|
||||
"scoring_params"
|
||||
],
|
||||
"title": "BenchmarkConfig"
|
||||
"title": "BenchmarkConfig",
|
||||
"description": "A benchmark configuration for evaluation."
|
||||
},
|
||||
"EvalCandidate": {
|
||||
"oneOf": [
|
||||
|
@ -6311,13 +6329,16 @@
|
|||
"default": "model"
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The model ID to evaluate."
|
||||
},
|
||||
"sampling_params": {
|
||||
"$ref": "#/components/schemas/SamplingParams"
|
||||
"$ref": "#/components/schemas/SamplingParams",
|
||||
"description": "The sampling parameters for the model."
|
||||
},
|
||||
"system_message": {
|
||||
"$ref": "#/components/schemas/SystemMessage"
|
||||
"$ref": "#/components/schemas/SystemMessage",
|
||||
"description": "(Optional) The system message providing instructions or context to the model."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6326,7 +6347,8 @@
|
|||
"model",
|
||||
"sampling_params"
|
||||
],
|
||||
"title": "ModelCandidate"
|
||||
"title": "ModelCandidate",
|
||||
"description": "A model candidate for evaluation."
|
||||
},
|
||||
"RegexParserScoringFnParams": {
|
||||
"type": "object",
|
||||
|
@ -6405,16 +6427,19 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "The rows to evaluate."
|
||||
},
|
||||
"scoring_functions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"description": "The scoring functions to use for the evaluation."
|
||||
},
|
||||
"benchmark_config": {
|
||||
"$ref": "#/components/schemas/BenchmarkConfig"
|
||||
"$ref": "#/components/schemas/BenchmarkConfig",
|
||||
"description": "The configuration for the benchmark."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6454,13 +6479,15 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "The generations from the evaluation."
|
||||
},
|
||||
"scores": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
}
|
||||
},
|
||||
"description": "The scores from the evaluation."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6468,7 +6495,8 @@
|
|||
"generations",
|
||||
"scores"
|
||||
],
|
||||
"title": "EvaluateResponse"
|
||||
"title": "EvaluateResponse",
|
||||
"description": "The response from an evaluation."
|
||||
},
|
||||
"ScoringResult": {
|
||||
"type": "object",
|
||||
|
@ -6499,7 +6527,8 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "The scoring result for each row. Each row is a map of column name to value."
|
||||
},
|
||||
"aggregated_results": {
|
||||
"type": "object",
|
||||
|
@ -6524,7 +6553,8 @@
|
|||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"description": "Map of metric name to aggregated value"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -6532,7 +6562,8 @@
|
|||
"score_rows",
|
||||
"aggregated_results"
|
||||
],
|
||||
"title": "ScoringResult"
|
||||
"title": "ScoringResult",
|
||||
"description": "The result of a single scoring function: a score for each row plus aggregated metrics."
|
||||
},
|
||||
"Session": {
|
||||
"type": "object",
|
||||
|
@ -7021,13 +7052,16 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "The rows in the current page."
|
||||
},
|
||||
"total_count": {
|
||||
"type": "integer"
|
||||
"type": "integer",
|
||||
"description": "The total number of rows in the dataset."
|
||||
},
|
||||
"next_page_token": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The token to get the next page of rows."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -7035,7 +7069,8 @@
|
|||
"rows",
|
||||
"total_count"
|
||||
],
|
||||
"title": "PaginatedRowsResult"
|
||||
"title": "PaginatedRowsResult",
|
||||
"description": "A paginated list of rows from a dataset."
|
||||
},
|
||||
"ScoringFn": {
|
||||
"type": "object",
|
||||
|
@ -9307,7 +9342,8 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"benchmark_config": {
|
||||
"$ref": "#/components/schemas/BenchmarkConfig"
|
||||
"$ref": "#/components/schemas/BenchmarkConfig",
|
||||
"description": "The configuration for the benchmark."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -9444,7 +9480,8 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "The rows to score."
|
||||
},
|
||||
"scoring_functions": {
|
||||
"type": "object",
|
||||
|
@ -9457,7 +9494,8 @@
|
|||
"type": "null"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"description": "The scoring functions to use for the scoring."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -9474,14 +9512,16 @@
|
|||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
}
|
||||
},
|
||||
"description": "A map of scoring function name to ScoringResult."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"results"
|
||||
],
|
||||
"title": "ScoreResponse"
|
||||
"title": "ScoreResponse",
|
||||
"description": "The response from scoring."
|
||||
},
|
||||
"ScoreBatchRequest": {
|
||||
"type": "object",
|
||||
|
@ -9896,7 +9936,8 @@
|
|||
"name": "Datasets"
|
||||
},
|
||||
{
|
||||
"name": "Eval"
|
||||
"name": "Eval",
|
||||
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
|
||||
},
|
||||
{
|
||||
"name": "Files (Coming Soon)"
|
||||
|
|
89
docs/_static/llama-stack-spec.yaml
vendored
89
docs/_static/llama-stack-spec.yaml
vendored
|
@ -31,25 +31,32 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- DatasetIO
|
||||
description: ''
|
||||
description: >-
|
||||
Get a paginated list of rows from a dataset.
|
||||
parameters:
|
||||
- name: dataset_id
|
||||
in: query
|
||||
description: >-
|
||||
The ID of the dataset to get the rows from.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: rows_in_page
|
||||
in: query
|
||||
description: The number of rows to get per page.
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
- name: page_token
|
||||
in: query
|
||||
description: The token to get the next page of rows.
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
- name: filter_condition
|
||||
in: query
|
||||
description: >-
|
||||
(Optional) A condition to filter the rows by.
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
|
@ -613,7 +620,8 @@ paths:
|
|||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
description: >-
|
||||
EvaluateResponse object containing generations and scores
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
@ -630,10 +638,12 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Eval
|
||||
description: ''
|
||||
description: Evaluate a list of rows on a benchmark.
|
||||
parameters:
|
||||
- name: benchmark_id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the benchmark to run the evaluation on.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
@ -1417,7 +1427,7 @@ paths:
|
|||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
description: The status of the evaluation job.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
@ -1436,15 +1446,18 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Eval
|
||||
description: ''
|
||||
description: Get the status of a job.
|
||||
parameters:
|
||||
- name: benchmark_id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the benchmark to run the evaluation on.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: job_id
|
||||
in: path
|
||||
description: The ID of the job to get the status of.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
@ -1464,15 +1477,18 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Eval
|
||||
description: ''
|
||||
description: Cancel a job.
|
||||
parameters:
|
||||
- name: benchmark_id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the benchmark to run the evaluation on.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: job_id
|
||||
in: path
|
||||
description: The ID of the job to cancel.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
@ -1480,7 +1496,7 @@ paths:
|
|||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
description: The result of the job.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
@ -1497,15 +1513,18 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Eval
|
||||
description: ''
|
||||
description: Get the result of a job.
|
||||
parameters:
|
||||
- name: benchmark_id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the benchmark to run the evaluation on.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: job_id
|
||||
in: path
|
||||
description: The ID of the job to get the result of.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
@ -2218,7 +2237,8 @@ paths:
|
|||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
description: >-
|
||||
The job that was created to run the evaluation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
@ -2235,10 +2255,12 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Eval
|
||||
description: ''
|
||||
description: Run an evaluation on a benchmark.
|
||||
parameters:
|
||||
- name: benchmark_id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the benchmark to run the evaluation on.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
@ -2306,7 +2328,8 @@ paths:
|
|||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
description: >-
|
||||
ScoreResponse object containing rows and aggregated results
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
@ -2323,7 +2346,7 @@ paths:
|
|||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Scoring
|
||||
description: ''
|
||||
description: Score a list of rows.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
|
@ -4290,11 +4313,14 @@ components:
|
|||
default: agent
|
||||
config:
|
||||
$ref: '#/components/schemas/AgentConfig'
|
||||
description: >-
|
||||
The configuration for the agent candidate.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- type
|
||||
- config
|
||||
title: AgentCandidate
|
||||
description: An agent candidate for evaluation.
|
||||
AggregationFunctionType:
|
||||
type: string
|
||||
enum:
|
||||
|
@ -4323,17 +4349,26 @@ components:
|
|||
properties:
|
||||
eval_candidate:
|
||||
$ref: '#/components/schemas/EvalCandidate'
|
||||
description: The candidate to evaluate.
|
||||
scoring_params:
|
||||
type: object
|
||||
additionalProperties:
|
||||
$ref: '#/components/schemas/ScoringFnParams'
|
||||
description: >-
|
||||
Map between scoring function id and parameters for each scoring function
|
||||
you want to run
|
||||
num_examples:
|
||||
type: integer
|
||||
description: >-
|
||||
(Optional) The number of examples to evaluate. If not provided, all examples
|
||||
in the dataset will be evaluated
|
||||
additionalProperties: false
|
||||
required:
|
||||
- eval_candidate
|
||||
- scoring_params
|
||||
title: BenchmarkConfig
|
||||
description: >-
|
||||
A benchmark configuration for evaluation.
|
||||
EvalCandidate:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/ModelCandidate'
|
||||
|
@ -4376,16 +4411,22 @@ components:
|
|||
default: model
|
||||
model:
|
||||
type: string
|
||||
description: The model ID to evaluate.
|
||||
sampling_params:
|
||||
$ref: '#/components/schemas/SamplingParams'
|
||||
description: The sampling parameters for the model.
|
||||
system_message:
|
||||
$ref: '#/components/schemas/SystemMessage'
|
||||
description: >-
|
||||
(Optional) The system message providing instructions or context to the
|
||||
model.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- type
|
||||
- model
|
||||
- sampling_params
|
||||
title: ModelCandidate
|
||||
description: A model candidate for evaluation.
|
||||
RegexParserScoringFnParams:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -4431,12 +4472,16 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The rows to evaluate.
|
||||
scoring_functions:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
The scoring functions to use for the evaluation.
|
||||
benchmark_config:
|
||||
$ref: '#/components/schemas/BenchmarkConfig'
|
||||
description: The configuration for the benchmark.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- input_rows
|
||||
|
@ -4458,15 +4503,18 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The generations from the evaluation.
|
||||
scores:
|
||||
type: object
|
||||
additionalProperties:
|
||||
$ref: '#/components/schemas/ScoringResult'
|
||||
description: The scores from the evaluation.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- generations
|
||||
- scores
|
||||
title: EvaluateResponse
|
||||
description: The response from an evaluation.
|
||||
ScoringResult:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -4482,6 +4530,8 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: >-
|
||||
The scoring result for each row. Each row is a map of column name to value.
|
||||
aggregated_results:
|
||||
type: object
|
||||
additionalProperties:
|
||||
|
@ -4492,11 +4542,13 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: Map of metric name to aggregated value
|
||||
additionalProperties: false
|
||||
required:
|
||||
- score_rows
|
||||
- aggregated_results
|
||||
title: ScoringResult
|
||||
description: 'The result of a single scoring function: a score for each row plus aggregated metrics.'
|
||||
Session:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -4809,15 +4861,19 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The rows in the current page.
|
||||
total_count:
|
||||
type: integer
|
||||
description: The total number of rows in the dataset.
|
||||
next_page_token:
|
||||
type: string
|
||||
description: The token to get the next page of rows.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- rows
|
||||
- total_count
|
||||
title: PaginatedRowsResult
|
||||
description: A paginated list of rows from a dataset.
|
||||
ScoringFn:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -6248,6 +6304,7 @@ components:
|
|||
properties:
|
||||
benchmark_config:
|
||||
$ref: '#/components/schemas/BenchmarkConfig'
|
||||
description: The configuration for the benchmark.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- benchmark_config
|
||||
|
@ -6329,12 +6386,15 @@ components:
|
|||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The rows to score.
|
||||
scoring_functions:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/ScoringFnParams'
|
||||
- type: 'null'
|
||||
description: >-
|
||||
The scoring functions to use for the scoring.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- input_rows
|
||||
|
@ -6347,10 +6407,13 @@ components:
|
|||
type: object
|
||||
additionalProperties:
|
||||
$ref: '#/components/schemas/ScoringResult'
|
||||
description: >-
|
||||
A map of scoring function name to ScoringResult.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- results
|
||||
title: ScoreResponse
|
||||
description: The response from scoring.
|
||||
ScoreBatchRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -6621,6 +6684,8 @@ tags:
|
|||
- name: DatasetIO
|
||||
- name: Datasets
|
||||
- name: Eval
|
||||
x-displayName: >-
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates.
|
||||
- name: Files (Coming Soon)
|
||||
- name: Inference
|
||||
description: >-
|
||||
|
|
|
@ -14,6 +14,14 @@ from llama_stack.schema_utils import json_schema_type, webmethod
|
|||
|
||||
@json_schema_type
|
||||
class PaginatedRowsResult(BaseModel):
|
||||
"""
|
||||
A paginated list of rows from a dataset.
|
||||
|
||||
:param rows: The rows in the current page.
|
||||
:param total_count: The total number of rows in the dataset.
|
||||
:param next_page_token: The token to get the next page of rows.
|
||||
"""
|
||||
|
||||
# the rows obey the DatasetSchema for the given dataset
|
||||
rows: List[Dict[str, Any]]
|
||||
total_count: int
|
||||
|
@ -36,7 +44,15 @@ class DatasetIO(Protocol):
|
|||
rows_in_page: int,
|
||||
page_token: Optional[str] = None,
|
||||
filter_condition: Optional[str] = None,
|
||||
) -> PaginatedRowsResult: ...
|
||||
) -> PaginatedRowsResult:
|
||||
"""Get a paginated list of rows from a dataset.
|
||||
|
||||
:param dataset_id: The ID of the dataset to get the rows from.
|
||||
:param rows_in_page: The number of rows to get per page.
|
||||
:param page_token: The token to get the next page of rows.
|
||||
:param filter_condition: (Optional) A condition to filter the rows by.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasetio/rows", method="POST")
|
||||
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
|
||||
|
|
|
@ -19,6 +19,13 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
|
|||
|
||||
@json_schema_type
|
||||
class ModelCandidate(BaseModel):
|
||||
"""A model candidate for evaluation.
|
||||
|
||||
:param model: The model ID to evaluate.
|
||||
:param sampling_params: The sampling parameters for the model.
|
||||
:param system_message: (Optional) The system message providing instructions or context to the model.
|
||||
"""
|
||||
|
||||
type: Literal["model"] = "model"
|
||||
model: str
|
||||
sampling_params: SamplingParams
|
||||
|
@ -27,6 +34,11 @@ class ModelCandidate(BaseModel):
|
|||
|
||||
@json_schema_type
|
||||
class AgentCandidate(BaseModel):
|
||||
"""An agent candidate for evaluation.
|
||||
|
||||
:param config: The configuration for the agent candidate.
|
||||
"""
|
||||
|
||||
type: Literal["agent"] = "agent"
|
||||
config: AgentConfig
|
||||
|
||||
|
@ -39,6 +51,13 @@ EvalCandidate = register_schema(
|
|||
|
||||
@json_schema_type
|
||||
class BenchmarkConfig(BaseModel):
|
||||
"""A benchmark configuration for evaluation.
|
||||
|
||||
:param eval_candidate: The candidate to evaluate.
|
||||
:param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
|
||||
:param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
|
||||
"""
|
||||
|
||||
eval_candidate: EvalCandidate
|
||||
scoring_params: Dict[str, ScoringFnParams] = Field(
|
||||
description="Map between scoring function id and parameters for each scoring function you want to run",
|
||||
|
@ -53,18 +72,32 @@ class BenchmarkConfig(BaseModel):
|
|||
|
||||
@json_schema_type
|
||||
class EvaluateResponse(BaseModel):
|
||||
"""The response from an evaluation.
|
||||
|
||||
:param generations: The generations from the evaluation.
|
||||
:param scores: The scores from the evaluation.
|
||||
"""
|
||||
|
||||
generations: List[Dict[str, Any]]
|
||||
# each key in the dict is a scoring function name
|
||||
scores: Dict[str, ScoringResult]
|
||||
|
||||
|
||||
class Eval(Protocol):
|
||||
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
|
||||
async def run_eval(
|
||||
self,
|
||||
benchmark_id: str,
|
||||
benchmark_config: BenchmarkConfig,
|
||||
) -> Job: ...
|
||||
) -> Job:
|
||||
"""Run an evaluation on a benchmark.
|
||||
|
||||
:param benchmark_id: The ID of the benchmark to run the evaluation on.
|
||||
:param benchmark_config: The configuration for the benchmark.
|
||||
:return: The job that was created to run the evaluation.
|
||||
"""
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
|
||||
async def evaluate_rows(
|
||||
|
@ -73,13 +106,40 @@ class Eval(Protocol):
|
|||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
benchmark_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse: ...
|
||||
) -> EvaluateResponse:
|
||||
"""Evaluate a list of rows on a benchmark.
|
||||
|
||||
:param benchmark_id: The ID of the benchmark to run the evaluation on.
|
||||
:param input_rows: The rows to evaluate.
|
||||
:param scoring_functions: The scoring functions to use for the evaluation.
|
||||
:param benchmark_config: The configuration for the benchmark.
|
||||
:return: EvaluateResponse object containing generations and scores
|
||||
"""
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
|
||||
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
|
||||
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
|
||||
"""Get the status of a job.
|
||||
|
||||
:param benchmark_id: The ID of the benchmark to run the evaluation on.
|
||||
:param job_id: The ID of the job to get the status of.
|
||||
:return: The status of the evaluation job.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
|
||||
async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
|
||||
async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
|
||||
"""Cancel a job.
|
||||
|
||||
:param benchmark_id: The ID of the benchmark to run the evaluation on.
|
||||
:param job_id: The ID of the job to cancel.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
|
||||
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
|
||||
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
|
||||
"""Get the result of a job.
|
||||
|
||||
:param benchmark_id: The ID of the benchmark to run the evaluation on.
|
||||
:param job_id: The ID of the job to get the result of.
|
||||
:return: The result of the job.
|
||||
"""
|
||||
|
|
|
@ -17,6 +17,13 @@ ScoringResultRow = Dict[str, Any]
|
|||
|
||||
@json_schema_type
|
||||
class ScoringResult(BaseModel):
|
||||
"""
|
||||
The result of a single scoring function: a score for each row plus aggregated metrics.
|
||||
|
||||
:param score_rows: The scoring result for each row. Each row is a map of column name to value.
|
||||
:param aggregated_results: Map of metric name to aggregated value
|
||||
"""
|
||||
|
||||
score_rows: List[ScoringResultRow]
|
||||
# aggregated metrics to value
|
||||
aggregated_results: Dict[str, Any]
|
||||
|
@ -30,6 +37,12 @@ class ScoreBatchResponse(BaseModel):
|
|||
|
||||
@json_schema_type
|
||||
class ScoreResponse(BaseModel):
|
||||
"""
|
||||
The response from scoring.
|
||||
|
||||
:param results: A map of scoring function name to ScoringResult.
|
||||
"""
|
||||
|
||||
# each key in the dict is a scoring function name
|
||||
results: Dict[str, ScoringResult]
|
||||
|
||||
|
@ -55,4 +68,11 @@ class Scoring(Protocol):
|
|||
self,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: Dict[str, Optional[ScoringFnParams]],
|
||||
) -> ScoreResponse: ...
|
||||
) -> ScoreResponse:
|
||||
"""Score a list of rows.
|
||||
|
||||
:param input_rows: The rows to score.
|
||||
:param scoring_functions: The scoring functions to use for the scoring.
|
||||
:return: ScoreResponse object containing rows and aggregated results
|
||||
"""
|
||||
...
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue