eval/scoring/datasetio doc

This commit is contained in:
Xi Yan 2025-03-04 14:54:43 -08:00
parent c30cba9db2
commit 83d78cca9c
5 changed files with 262 additions and 60 deletions

View file

@ -69,11 +69,12 @@
"tags": [ "tags": [
"DatasetIO" "DatasetIO"
], ],
"description": "", "description": "Get a paginated list of rows from a dataset.",
"parameters": [ "parameters": [
{ {
"name": "dataset_id", "name": "dataset_id",
"in": "query", "in": "query",
"description": "The ID of the dataset to get the rows from.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -82,6 +83,7 @@
{ {
"name": "rows_in_page", "name": "rows_in_page",
"in": "query", "in": "query",
"description": "The number of rows to get per page.",
"required": true, "required": true,
"schema": { "schema": {
"type": "integer" "type": "integer"
@ -90,6 +92,7 @@
{ {
"name": "page_token", "name": "page_token",
"in": "query", "in": "query",
"description": "The token to get the next page of rows.",
"required": false, "required": false,
"schema": { "schema": {
"type": "string" "type": "string"
@ -98,6 +101,7 @@
{ {
"name": "filter_condition", "name": "filter_condition",
"in": "query", "in": "query",
"description": "(Optional) A condition to filter the rows by.",
"required": false, "required": false,
"schema": { "schema": {
"type": "string" "type": "string"
@ -896,7 +900,7 @@
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "EvaluateResponse object containing generations and scores",
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
@ -921,11 +925,12 @@
"tags": [ "tags": [
"Eval" "Eval"
], ],
"description": "", "description": "Evaluate a list of rows on a benchmark.",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "benchmark_id",
"in": "path", "in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2121,7 +2126,7 @@
"get": { "get": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "The status of the evaluation job.",
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
@ -2153,11 +2158,12 @@
"tags": [ "tags": [
"Eval" "Eval"
], ],
"description": "", "description": "Get the status of a job.",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "benchmark_id",
"in": "path", "in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2166,6 +2172,7 @@
{ {
"name": "job_id", "name": "job_id",
"in": "path", "in": "path",
"description": "The ID of the job to get the status of.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2194,11 +2201,12 @@
"tags": [ "tags": [
"Eval" "Eval"
], ],
"description": "", "description": "Cancel a job.",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "benchmark_id",
"in": "path", "in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2207,6 +2215,7 @@
{ {
"name": "job_id", "name": "job_id",
"in": "path", "in": "path",
"description": "The ID of the job to cancel.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2219,7 +2228,7 @@
"get": { "get": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "The result of the job.",
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
@ -2244,11 +2253,12 @@
"tags": [ "tags": [
"Eval" "Eval"
], ],
"description": "", "description": "Get the result of a job.",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "benchmark_id",
"in": "path", "in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -2257,6 +2267,7 @@
{ {
"name": "job_id", "name": "job_id",
"in": "path", "in": "path",
"description": "The ID of the job to get the result of.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -3287,7 +3298,7 @@
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "The job that was created to run the evaluation.",
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
@ -3312,11 +3323,12 @@
"tags": [ "tags": [
"Eval" "Eval"
], ],
"description": "", "description": "Run an evaluation on a benchmark.",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "benchmark_id",
"in": "path", "in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -3418,7 +3430,7 @@
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "ScoreResponse object containing rows and aggregated results",
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
@ -3443,7 +3455,7 @@
"tags": [ "tags": [
"Scoring" "Scoring"
], ],
"description": "", "description": "Score a list of rows.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -6187,7 +6199,8 @@
"default": "agent" "default": "agent"
}, },
"config": { "config": {
"$ref": "#/components/schemas/AgentConfig" "$ref": "#/components/schemas/AgentConfig",
"description": "The configuration for the agent candidate."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6195,7 +6208,8 @@
"type", "type",
"config" "config"
], ],
"title": "AgentCandidate" "title": "AgentCandidate",
"description": "An agent candidate for evaluation."
}, },
"AggregationFunctionType": { "AggregationFunctionType": {
"type": "string", "type": "string",
@ -6232,16 +6246,19 @@
"type": "object", "type": "object",
"properties": { "properties": {
"eval_candidate": { "eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate" "$ref": "#/components/schemas/EvalCandidate",
"description": "The candidate to evaluate."
}, },
"scoring_params": { "scoring_params": {
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
"$ref": "#/components/schemas/ScoringFnParams" "$ref": "#/components/schemas/ScoringFnParams"
} },
"description": "Map between scoring function id and parameters for each scoring function you want to run"
}, },
"num_examples": { "num_examples": {
"type": "integer" "type": "integer",
"description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6249,7 +6266,8 @@
"eval_candidate", "eval_candidate",
"scoring_params" "scoring_params"
], ],
"title": "BenchmarkConfig" "title": "BenchmarkConfig",
"description": "A benchmark configuration for evaluation."
}, },
"EvalCandidate": { "EvalCandidate": {
"oneOf": [ "oneOf": [
@ -6311,13 +6329,16 @@
"default": "model" "default": "model"
}, },
"model": { "model": {
"type": "string" "type": "string",
"description": "The model ID to evaluate."
}, },
"sampling_params": { "sampling_params": {
"$ref": "#/components/schemas/SamplingParams" "$ref": "#/components/schemas/SamplingParams",
"description": "The sampling parameters for the model."
}, },
"system_message": { "system_message": {
"$ref": "#/components/schemas/SystemMessage" "$ref": "#/components/schemas/SystemMessage",
"description": "(Optional) The system message providing instructions or context to the model."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6326,7 +6347,8 @@
"model", "model",
"sampling_params" "sampling_params"
], ],
"title": "ModelCandidate" "title": "ModelCandidate",
"description": "A model candidate for evaluation."
}, },
"RegexParserScoringFnParams": { "RegexParserScoringFnParams": {
"type": "object", "type": "object",
@ -6405,16 +6427,19 @@
} }
] ]
} }
} },
"description": "The rows to evaluate."
}, },
"scoring_functions": { "scoring_functions": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
} },
"description": "The scoring functions to use for the evaluation."
}, },
"benchmark_config": { "benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig" "$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6454,13 +6479,15 @@
} }
] ]
} }
} },
"description": "The generations from the evaluation."
}, },
"scores": { "scores": {
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
"$ref": "#/components/schemas/ScoringResult" "$ref": "#/components/schemas/ScoringResult"
} },
"description": "The scores from the evaluation."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6468,7 +6495,8 @@
"generations", "generations",
"scores" "scores"
], ],
"title": "EvaluateResponse" "title": "EvaluateResponse",
"description": "The response from an evaluation."
}, },
"ScoringResult": { "ScoringResult": {
"type": "object", "type": "object",
@ -6499,7 +6527,8 @@
} }
] ]
} }
} },
"description": "The scoring result for each row. Each row is a map of column name to value."
}, },
"aggregated_results": { "aggregated_results": {
"type": "object", "type": "object",
@ -6524,7 +6553,8 @@
"type": "object" "type": "object"
} }
] ]
} },
"description": "Map of metric name to aggregated value"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6532,7 +6562,8 @@
"score_rows", "score_rows",
"aggregated_results" "aggregated_results"
], ],
"title": "ScoringResult" "title": "ScoringResult",
"description": "A scoring result for a single scoring function."
}, },
"Session": { "Session": {
"type": "object", "type": "object",
@ -7021,13 +7052,16 @@
} }
] ]
} }
} },
"description": "The rows in the current page."
}, },
"total_count": { "total_count": {
"type": "integer" "type": "integer",
"description": "The total number of rows in the dataset."
}, },
"next_page_token": { "next_page_token": {
"type": "string" "type": "string",
"description": "The token to get the next page of rows."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -7035,7 +7069,8 @@
"rows", "rows",
"total_count" "total_count"
], ],
"title": "PaginatedRowsResult" "title": "PaginatedRowsResult",
"description": "A paginated list of rows from a dataset."
}, },
"ScoringFn": { "ScoringFn": {
"type": "object", "type": "object",
@ -9307,7 +9342,8 @@
"type": "object", "type": "object",
"properties": { "properties": {
"benchmark_config": { "benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig" "$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -9444,7 +9480,8 @@
} }
] ]
} }
} },
"description": "The rows to score."
}, },
"scoring_functions": { "scoring_functions": {
"type": "object", "type": "object",
@ -9457,7 +9494,8 @@
"type": "null" "type": "null"
} }
] ]
} },
"description": "The scoring functions to use for the scoring."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -9474,14 +9512,16 @@
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
"$ref": "#/components/schemas/ScoringResult" "$ref": "#/components/schemas/ScoringResult"
} },
"description": "A map of scoring function name to ScoringResult."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"results" "results"
], ],
"title": "ScoreResponse" "title": "ScoreResponse",
"description": "The response from scoring."
}, },
"ScoreBatchRequest": { "ScoreBatchRequest": {
"type": "object", "type": "object",
@ -9896,7 +9936,8 @@
"name": "Datasets" "name": "Datasets"
}, },
{ {
"name": "Eval" "name": "Eval",
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
}, },
{ {
"name": "Files (Coming Soon)" "name": "Files (Coming Soon)"

View file

@ -31,25 +31,32 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- DatasetIO - DatasetIO
description: '' description: >-
Get a paginated list of rows from a dataset.
parameters: parameters:
- name: dataset_id - name: dataset_id
in: query in: query
description: >-
The ID of the dataset to get the rows from.
required: true required: true
schema: schema:
type: string type: string
- name: rows_in_page - name: rows_in_page
in: query in: query
description: The number of rows to get per page.
required: true required: true
schema: schema:
type: integer type: integer
- name: page_token - name: page_token
in: query in: query
description: The token to get the next page of rows.
required: false required: false
schema: schema:
type: string type: string
- name: filter_condition - name: filter_condition
in: query in: query
description: >-
(Optional) A condition to filter the rows by.
required: false required: false
schema: schema:
type: string type: string
@ -613,7 +620,8 @@ paths:
post: post:
responses: responses:
'200': '200':
description: OK description: >-
EvaluateResponse object containing generations and scores
content: content:
application/json: application/json:
schema: schema:
@ -630,10 +638,12 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Eval - Eval
description: '' description: Evaluate a list of rows on a benchmark.
parameters: parameters:
- name: benchmark_id - name: benchmark_id
in: path in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true required: true
schema: schema:
type: string type: string
@ -1417,7 +1427,7 @@ paths:
get: get:
responses: responses:
'200': '200':
description: OK description: The status of the evaluation job.
content: content:
application/json: application/json:
schema: schema:
@ -1436,15 +1446,18 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Eval - Eval
description: '' description: Get the status of a job.
parameters: parameters:
- name: benchmark_id - name: benchmark_id
in: path in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true required: true
schema: schema:
type: string type: string
- name: job_id - name: job_id
in: path in: path
description: The ID of the job to get the status of.
required: true required: true
schema: schema:
type: string type: string
@ -1464,15 +1477,18 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Eval - Eval
description: '' description: Cancel a job.
parameters: parameters:
- name: benchmark_id - name: benchmark_id
in: path in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true required: true
schema: schema:
type: string type: string
- name: job_id - name: job_id
in: path in: path
description: The ID of the job to cancel.
required: true required: true
schema: schema:
type: string type: string
@ -1480,7 +1496,7 @@ paths:
get: get:
responses: responses:
'200': '200':
description: OK description: The result of the job.
content: content:
application/json: application/json:
schema: schema:
@ -1497,15 +1513,18 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Eval - Eval
description: '' description: Get the result of a job.
parameters: parameters:
- name: benchmark_id - name: benchmark_id
in: path in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true required: true
schema: schema:
type: string type: string
- name: job_id - name: job_id
in: path in: path
description: The ID of the job to get the result of.
required: true required: true
schema: schema:
type: string type: string
@ -2218,7 +2237,8 @@ paths:
post: post:
responses: responses:
'200': '200':
description: OK description: >-
The job that was created to run the evaluation.
content: content:
application/json: application/json:
schema: schema:
@ -2235,10 +2255,12 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Eval - Eval
description: '' description: Run an evaluation on a benchmark.
parameters: parameters:
- name: benchmark_id - name: benchmark_id
in: path in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true required: true
schema: schema:
type: string type: string
@ -2306,7 +2328,8 @@ paths:
post: post:
responses: responses:
'200': '200':
description: OK description: >-
ScoreResponse object containing rows and aggregated results
content: content:
application/json: application/json:
schema: schema:
@ -2323,7 +2346,7 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Scoring - Scoring
description: '' description: Score a list of rows.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -4290,11 +4313,14 @@ components:
default: agent default: agent
config: config:
$ref: '#/components/schemas/AgentConfig' $ref: '#/components/schemas/AgentConfig'
description: >-
The configuration for the agent candidate.
additionalProperties: false additionalProperties: false
required: required:
- type - type
- config - config
title: AgentCandidate title: AgentCandidate
description: An agent candidate for evaluation.
AggregationFunctionType: AggregationFunctionType:
type: string type: string
enum: enum:
@ -4323,17 +4349,26 @@ components:
properties: properties:
eval_candidate: eval_candidate:
$ref: '#/components/schemas/EvalCandidate' $ref: '#/components/schemas/EvalCandidate'
description: The candidate to evaluate.
scoring_params: scoring_params:
type: object type: object
additionalProperties: additionalProperties:
$ref: '#/components/schemas/ScoringFnParams' $ref: '#/components/schemas/ScoringFnParams'
description: >-
Map between scoring function id and parameters for each scoring function
you want to run
num_examples: num_examples:
type: integer type: integer
description: >-
(Optional) The number of examples to evaluate. If not provided, all examples
in the dataset will be evaluated
additionalProperties: false additionalProperties: false
required: required:
- eval_candidate - eval_candidate
- scoring_params - scoring_params
title: BenchmarkConfig title: BenchmarkConfig
description: >-
A benchmark configuration for evaluation.
EvalCandidate: EvalCandidate:
oneOf: oneOf:
- $ref: '#/components/schemas/ModelCandidate' - $ref: '#/components/schemas/ModelCandidate'
@ -4376,16 +4411,22 @@ components:
default: model default: model
model: model:
type: string type: string
description: The model ID to evaluate.
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
description: The sampling parameters for the model.
system_message: system_message:
$ref: '#/components/schemas/SystemMessage' $ref: '#/components/schemas/SystemMessage'
description: >-
(Optional) The system message providing instructions or context to the
model.
additionalProperties: false additionalProperties: false
required: required:
- type - type
- model - model
- sampling_params - sampling_params
title: ModelCandidate title: ModelCandidate
description: A model candidate for evaluation.
RegexParserScoringFnParams: RegexParserScoringFnParams:
type: object type: object
properties: properties:
@ -4431,12 +4472,16 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: The rows to evaluate.
scoring_functions: scoring_functions:
type: array type: array
items: items:
type: string type: string
description: >-
The scoring functions to use for the evaluation.
benchmark_config: benchmark_config:
$ref: '#/components/schemas/BenchmarkConfig' $ref: '#/components/schemas/BenchmarkConfig'
description: The configuration for the benchmark.
additionalProperties: false additionalProperties: false
required: required:
- input_rows - input_rows
@ -4458,15 +4503,18 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: The generations from the evaluation.
scores: scores:
type: object type: object
additionalProperties: additionalProperties:
$ref: '#/components/schemas/ScoringResult' $ref: '#/components/schemas/ScoringResult'
description: The scores from the evaluation.
additionalProperties: false additionalProperties: false
required: required:
- generations - generations
- scores - scores
title: EvaluateResponse title: EvaluateResponse
description: The response from an evaluation.
ScoringResult: ScoringResult:
type: object type: object
properties: properties:
@ -4482,6 +4530,8 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: >-
The scoring result for each row. Each row is a map of column name to value.
aggregated_results: aggregated_results:
type: object type: object
additionalProperties: additionalProperties:
@ -4492,11 +4542,13 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: Map of metric name to aggregated value
additionalProperties: false additionalProperties: false
required: required:
- score_rows - score_rows
- aggregated_results - aggregated_results
title: ScoringResult title: ScoringResult
description: A scoring result for a single scoring function.
Session: Session:
type: object type: object
properties: properties:
@ -4809,15 +4861,19 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: The rows in the current page.
total_count: total_count:
type: integer type: integer
description: The total number of rows in the dataset.
next_page_token: next_page_token:
type: string type: string
description: The token to get the next page of rows.
additionalProperties: false additionalProperties: false
required: required:
- rows - rows
- total_count - total_count
title: PaginatedRowsResult title: PaginatedRowsResult
description: A paginated list of rows from a dataset.
ScoringFn: ScoringFn:
type: object type: object
properties: properties:
@ -6248,6 +6304,7 @@ components:
properties: properties:
benchmark_config: benchmark_config:
$ref: '#/components/schemas/BenchmarkConfig' $ref: '#/components/schemas/BenchmarkConfig'
description: The configuration for the benchmark.
additionalProperties: false additionalProperties: false
required: required:
- benchmark_config - benchmark_config
@ -6329,12 +6386,15 @@ components:
- type: string - type: string
- type: array - type: array
- type: object - type: object
description: The rows to score.
scoring_functions: scoring_functions:
type: object type: object
additionalProperties: additionalProperties:
oneOf: oneOf:
- $ref: '#/components/schemas/ScoringFnParams' - $ref: '#/components/schemas/ScoringFnParams'
- type: 'null' - type: 'null'
description: >-
The scoring functions to use for the scoring.
additionalProperties: false additionalProperties: false
required: required:
- input_rows - input_rows
@ -6347,10 +6407,13 @@ components:
type: object type: object
additionalProperties: additionalProperties:
$ref: '#/components/schemas/ScoringResult' $ref: '#/components/schemas/ScoringResult'
description: >-
A map of scoring function name to ScoringResult.
additionalProperties: false additionalProperties: false
required: required:
- results - results
title: ScoreResponse title: ScoreResponse
description: The response from scoring.
ScoreBatchRequest: ScoreBatchRequest:
type: object type: object
properties: properties:
@ -6621,6 +6684,8 @@ tags:
- name: DatasetIO - name: DatasetIO
- name: Datasets - name: Datasets
- name: Eval - name: Eval
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files (Coming Soon) - name: Files (Coming Soon)
- name: Inference - name: Inference
description: >- description: >-

View file

@ -14,6 +14,14 @@ from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type @json_schema_type
class PaginatedRowsResult(BaseModel): class PaginatedRowsResult(BaseModel):
"""
A paginated list of rows from a dataset.
:param rows: The rows in the current page.
:param total_count: The total number of rows in the dataset.
:param next_page_token: The token to get the next page of rows.
"""
# the rows obey the DatasetSchema for the given dataset # the rows obey the DatasetSchema for the given dataset
rows: List[Dict[str, Any]] rows: List[Dict[str, Any]]
total_count: int total_count: int
@ -36,7 +44,15 @@ class DatasetIO(Protocol):
rows_in_page: int, rows_in_page: int,
page_token: Optional[str] = None, page_token: Optional[str] = None,
filter_condition: Optional[str] = None, filter_condition: Optional[str] = None,
) -> PaginatedRowsResult: ... ) -> PaginatedRowsResult:
"""Get a paginated list of rows from a dataset.
:param dataset_id: The ID of the dataset to get the rows from.
:param rows_in_page: The number of rows to get per page.
:param page_token: The token to get the next page of rows.
:param filter_condition: (Optional) A condition to filter the rows by.
"""
...
@webmethod(route="/datasetio/rows", method="POST") @webmethod(route="/datasetio/rows", method="POST")
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ... async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...

View file

@ -19,6 +19,13 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
@json_schema_type @json_schema_type
class ModelCandidate(BaseModel): class ModelCandidate(BaseModel):
"""A model candidate for evaluation.
:param model: The model ID to evaluate.
:param sampling_params: The sampling parameters for the model.
:param system_message: (Optional) The system message providing instructions or context to the model.
"""
type: Literal["model"] = "model" type: Literal["model"] = "model"
model: str model: str
sampling_params: SamplingParams sampling_params: SamplingParams
@ -27,6 +34,11 @@ class ModelCandidate(BaseModel):
@json_schema_type @json_schema_type
class AgentCandidate(BaseModel): class AgentCandidate(BaseModel):
"""An agent candidate for evaluation.
:param config: The configuration for the agent candidate.
"""
type: Literal["agent"] = "agent" type: Literal["agent"] = "agent"
config: AgentConfig config: AgentConfig
@ -39,6 +51,13 @@ EvalCandidate = register_schema(
@json_schema_type @json_schema_type
class BenchmarkConfig(BaseModel): class BenchmarkConfig(BaseModel):
"""A benchmark configuration for evaluation.
:param eval_candidate: The candidate to evaluate.
:param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
:param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
"""
eval_candidate: EvalCandidate eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field( scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run", description="Map between scoring function id and parameters for each scoring function you want to run",
@ -53,18 +72,32 @@ class BenchmarkConfig(BaseModel):
@json_schema_type @json_schema_type
class EvaluateResponse(BaseModel): class EvaluateResponse(BaseModel):
"""The response from an evaluation.
:param generations: The generations from the evaluation.
:param scores: The scores from the evaluation.
"""
generations: List[Dict[str, Any]] generations: List[Dict[str, Any]]
# each key in the dict is a scoring function name # each key in the dict is a scoring function name
scores: Dict[str, ScoringResult] scores: Dict[str, ScoringResult]
class Eval(Protocol): class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
async def run_eval( async def run_eval(
self, self,
benchmark_id: str, benchmark_id: str,
benchmark_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> Job: ... ) -> Job:
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param benchmark_config: The configuration for the benchmark.
:return: The job that was created to run the evaluation.
"""
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
async def evaluate_rows( async def evaluate_rows(
@ -73,13 +106,40 @@ class Eval(Protocol):
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: List[str], scoring_functions: List[str],
benchmark_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> EvaluateResponse: ... ) -> EvaluateResponse:
"""Evaluate a list of rows on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param input_rows: The rows to evaluate.
:param scoring_functions: The scoring functions to use for the evaluation.
:param benchmark_config: The configuration for the benchmark.
:return: EvaluateResponse object containing generations and scores
"""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
"""Get the status of a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to get the status of.
:return: The status of the evaluation job.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
"""Cancel a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to cancel.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
"""Get the result of a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to get the result of.
:return: The result of the job.
"""

View file

@ -17,6 +17,13 @@ ScoringResultRow = Dict[str, Any]
@json_schema_type @json_schema_type
class ScoringResult(BaseModel): class ScoringResult(BaseModel):
"""
A scoring result for a single scoring function.
:param score_rows: The scoring result for each row. Each row is a map of column name to value.
:param aggregated_results: Map of metric name to aggregated value
"""
score_rows: List[ScoringResultRow] score_rows: List[ScoringResultRow]
# aggregated metrics to value # aggregated metrics to value
aggregated_results: Dict[str, Any] aggregated_results: Dict[str, Any]
@ -30,6 +37,12 @@ class ScoreBatchResponse(BaseModel):
@json_schema_type @json_schema_type
class ScoreResponse(BaseModel): class ScoreResponse(BaseModel):
"""
The response from scoring.
:param results: A map of scoring function name to ScoringResult.
"""
# each key in the dict is a scoring function name # each key in the dict is a scoring function name
results: Dict[str, ScoringResult] results: Dict[str, ScoringResult]
@ -55,4 +68,11 @@ class Scoring(Protocol):
self, self,
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: Dict[str, Optional[ScoringFnParams]], scoring_functions: Dict[str, Optional[ScoringFnParams]],
) -> ScoreResponse: ... ) -> ScoreResponse:
"""Score a list of rows.
:param input_rows: The rows to score.
:param scoring_functions: The scoring functions to use for the scoring.
:return: ScoreResponse object containing rows and aggregated results
"""
...