scoring job

This commit is contained in:
Xi Yan 2025-03-12 01:16:37 -07:00
parent f88755eb93
commit 83d8777f56
2 changed files with 729 additions and 762 deletions

View file

@ -230,6 +230,108 @@
}
}
},
"/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
"description": "EvalJob object indicating its status",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/EvalJob"
},
{
"type": "null"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Scoring"
],
"description": "Get the EvalJob object for a given job id and benchmark id.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"description": "The ID of the job to get the status of.",
"required": true,
"schema": {
"type": "string"
}
}
]
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Scoring"
],
"description": "Cancel a job.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"description": "The ID of the job to cancel.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/post-training/job/cancel": {
"post": {
"responses": {
@ -968,7 +1070,60 @@
}
}
},
"/v1/eval/benchmarks/{benchmark_id}/evaluations": {
"/v1/eval/benchmark/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
"description": "The job that was created to run the evaluation.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvalJob"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Eval"
],
"description": "Run an evaluation on a benchmark.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvaluateBenchmarkRequest"
}
}
},
"required": true
}
}
},
"/v1/eval/rows": {
"post": {
"responses": {
"200": {
@ -997,18 +1152,8 @@
"tags": [
"Eval"
],
"description": "Evaluate a list of rows on a benchmark.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
}
],
"description": "Evaluate a list of rows on a candidate.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
@ -2194,160 +2339,6 @@
}
}
},
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
"description": "The status of the evaluation job.",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/JobStatus"
},
{
"type": "null"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Eval"
],
"description": "Get the status of a job.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"description": "The ID of the job to get the status of.",
"required": true,
"schema": {
"type": "string"
}
}
]
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Eval"
],
"description": "Cancel a job.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"description": "The ID of the job to cancel.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
"description": "The result of the job.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvaluateResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Eval"
],
"description": "Get the result of a job.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"description": "The ID of the job to get the result of.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/agents/{agent_id}/sessions": {
"get": {
"responses": {
@ -3430,59 +3421,6 @@
}
}
},
"/v1/eval/benchmarks/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
"description": "The job that was created to run the evaluation.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Job"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Eval"
],
"description": "Run an evaluation on a benchmark.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to run the evaluation on.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RunEvalRequest"
}
}
},
"required": true
}
}
},
"/v1/safety/run-shield": {
"post": {
"responses": {
@ -3562,7 +3500,50 @@
}
}
},
"/v1/scoring/score": {
"/v1/scoring/jobs": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScoringJob"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Scoring"
],
"description": "",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScoreDatasetRequest"
}
}
},
"required": true
}
}
},
"/v1/scoring/rows": {
"post": {
"responses": {
"200": {
@ -3597,50 +3578,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScoreRequest"
}
}
},
"required": true
}
}
},
"/v1/scoring/score-batch": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScoreBatchResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Scoring"
],
"description": "",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScoreBatchRequest"
"$ref": "#/components/schemas/ScoreRowsRequest"
}
}
},
@ -6347,6 +6285,122 @@
"title": "AgentCandidate",
"description": "An agent candidate for evaluation."
},
"EvalCandidate": {
"oneOf": [
{
"$ref": "#/components/schemas/ModelCandidate"
},
{
"$ref": "#/components/schemas/AgentCandidate"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"model": "#/components/schemas/ModelCandidate",
"agent": "#/components/schemas/AgentCandidate"
}
}
},
"ModelCandidate": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "model",
"default": "model"
},
"model": {
"type": "string",
"description": "The model ID to evaluate."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "The sampling parameters for the model."
},
"system_message": {
"$ref": "#/components/schemas/SystemMessage",
"description": "(Optional) The system message providing instructions or context to the model."
}
},
"additionalProperties": false,
"required": [
"type",
"model",
"sampling_params"
],
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
},
"EvaluateBenchmarkRequest": {
"type": "object",
"properties": {
"candidate": {
"$ref": "#/components/schemas/EvalCandidate",
"description": "Candidate to evaluate on. Examples: { \"type\": \"model\", \"model\": \"Llama-3.1-8B-Instruct\", \"sampling_params\": {...}, \"system_message\": \"You are a helpful assistant.\" } or { \"type\": \"agent\", \"config\": {...} }"
}
},
"additionalProperties": false,
"required": [
"candidate"
],
"title": "EvaluateBenchmarkRequest"
},
"EvalJob": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The ID of the job."
},
"status": {
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled",
"cancelled"
],
"description": "The status of the job."
},
"created_at": {
"type": "string",
"format": "date-time",
"description": "The time the job was created."
},
"finished_at": {
"type": "string",
"format": "date-time",
"description": "The time the job finished."
},
"error": {
"type": "string",
"description": "If status of the job is failed, this will contain the error message."
},
"type": {
"type": "string",
"const": "eval",
"default": "eval"
},
"result_files": {
"type": "array",
"items": {
"type": "string"
}
}
},
"additionalProperties": false,
"required": [
"id",
"status",
"created_at",
"type",
"result_files"
],
"title": "EvalJob",
"description": "The EvalJob object representing an evaluation job that was created through the API."
},
"AggregationFunctionType": {
"type": "string",
"enum": [
@ -6424,33 +6478,6 @@
],
"title": "AnswerSimilarityScoringFnParams"
},
"BenchmarkConfig": {
"type": "object",
"properties": {
"eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate",
"description": "The candidate to evaluate."
},
"scoring_params": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringFnParams"
},
"description": "Map between scoring function id and parameters for each scoring function you want to run"
},
"num_examples": {
"type": "integer",
"description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
}
},
"additionalProperties": false,
"required": [
"eval_candidate",
"scoring_params"
],
"title": "BenchmarkConfig",
"description": "A benchmark configuration for evaluation."
},
"ContextEntityRecallScoringFnParams": {
"type": "object",
"properties": {
@ -6561,23 +6588,6 @@
],
"title": "EqualityScoringFnParams"
},
"EvalCandidate": {
"oneOf": [
{
"$ref": "#/components/schemas/ModelCandidate"
},
{
"$ref": "#/components/schemas/AgentCandidate"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"model": "#/components/schemas/ModelCandidate",
"agent": "#/components/schemas/AgentCandidate"
}
}
},
"FactualityScoringFnParams": {
"type": "object",
"properties": {
@ -6656,36 +6666,6 @@
],
"title": "LLMAsJudgeScoringFnParams"
},
"ModelCandidate": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "model",
"default": "model"
},
"model": {
"type": "string",
"description": "The model ID to evaluate."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "The sampling parameters for the model."
},
"system_message": {
"$ref": "#/components/schemas/SystemMessage",
"description": "(Optional) The system message providing instructions or context to the model."
}
},
"additionalProperties": false,
"required": [
"type",
"model",
"sampling_params"
],
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
},
"RegexParserMathScoringFnParams": {
"type": "object",
"properties": {
@ -6836,7 +6816,7 @@
"EvaluateRowsRequest": {
"type": "object",
"properties": {
"input_rows": {
"dataset_rows": {
"type": "array",
"items": {
"type": "object",
@ -6868,20 +6848,20 @@
"scoring_functions": {
"type": "array",
"items": {
"type": "string"
"$ref": "#/components/schemas/ScoringFnParams"
},
"description": "The scoring functions to use for the evaluation."
},
"benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
"candidate": {
"$ref": "#/components/schemas/EvalCandidate",
"description": "The candidate to evaluate on."
}
},
"additionalProperties": false,
"required": [
"input_rows",
"dataset_rows",
"scoring_functions",
"benchmark_config"
"candidate"
],
"title": "EvaluateRowsRequest"
},
@ -7941,16 +7921,6 @@
"title": "PostTrainingJobArtifactsResponse",
"description": "Artifacts of a finetuning job."
},
"JobStatus": {
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled"
],
"title": "JobStatus"
},
"PostTrainingJobStatusResponse": {
"type": "object",
"properties": {
@ -7958,7 +7928,15 @@
"type": "string"
},
"status": {
"$ref": "#/components/schemas/JobStatus"
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled",
"cancelled"
],
"title": "JobStatus"
},
"scheduled_at": {
"type": "string",
@ -9796,33 +9774,6 @@
],
"title": "ResumeAgentTurnRequest"
},
"RunEvalRequest": {
"type": "object",
"properties": {
"benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
}
},
"additionalProperties": false,
"required": [
"benchmark_config"
],
"title": "RunEvalRequest"
},
"Job": {
"type": "object",
"properties": {
"job_id": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"job_id"
],
"title": "Job"
},
"RunShieldRequest": {
"type": "object",
"properties": {
@ -9909,7 +9860,82 @@
],
"title": "SaveSpansToDatasetRequest"
},
"ScoreRequest": {
"ScoreDatasetRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"scoring_functions": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ScoringFnParams"
}
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"scoring_functions"
],
"title": "ScoreDatasetRequest"
},
"ScoringJob": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The ID of the job."
},
"status": {
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled",
"cancelled"
],
"description": "The status of the job."
},
"created_at": {
"type": "string",
"format": "date-time",
"description": "The time the job was created."
},
"finished_at": {
"type": "string",
"format": "date-time",
"description": "The time the job finished."
},
"error": {
"type": "string",
"description": "If status of the job is failed, this will contain the error message."
},
"type": {
"type": "string",
"const": "scoring",
"default": "scoring"
},
"result_files": {
"type": "array",
"items": {
"type": "string"
}
}
},
"additionalProperties": false,
"required": [
"id",
"status",
"created_at",
"type",
"result_files"
],
"title": "ScoringJob",
"description": "The ScoringJob object representing a scoring job that was created through the API."
},
"ScoreRowsRequest": {
"type": "object",
"properties": {
"input_rows": {
@ -9942,16 +9968,9 @@
"description": "The rows to score."
},
"scoring_functions": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"$ref": "#/components/schemas/ScoringFnParams"
},
{
"type": "null"
}
]
"type": "array",
"items": {
"$ref": "#/components/schemas/ScoringFnParams"
},
"description": "The scoring functions to use for the scoring."
}
@ -9961,7 +9980,7 @@
"input_rows",
"scoring_functions"
],
"title": "ScoreRequest"
"title": "ScoreRowsRequest"
},
"ScoreResponse": {
"type": "object",
@ -9981,56 +10000,6 @@
"title": "ScoreResponse",
"description": "The response from scoring."
},
"ScoreBatchRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"scoring_functions": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"$ref": "#/components/schemas/ScoringFnParams"
},
{
"type": "null"
}
]
}
},
"save_results_dataset": {
"type": "boolean"
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"scoring_functions",
"save_results_dataset"
],
"title": "ScoreBatchRequest"
},
"ScoreBatchResponse": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"results": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringResult"
}
}
},
"additionalProperties": false,
"required": [
"results"
],
"title": "ScoreBatchResponse"
},
"AlgorithmConfig": {
"oneOf": [
{

View file

@ -142,6 +142,76 @@ paths:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
required: true
/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
description: EvalJob object indicating its status
content:
application/json:
schema:
oneOf:
- $ref: '#/components/schemas/EvalJob'
- type: 'null'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Scoring
description: >-
Get the EvalJob object for a given job id and benchmark id.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to get the status of.
required: true
schema:
type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Scoring
description: Cancel a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to cancel.
required: true
schema:
type: string
/v1/post-training/job/cancel:
post:
responses:
@ -666,7 +736,44 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
/v1/eval/benchmarks/{benchmark_id}/evaluations:
/v1/eval/benchmark/{benchmark_id}/jobs:
post:
responses:
'200':
description: >-
The job that was created to run the evaluation.
content:
application/json:
schema:
$ref: '#/components/schemas/EvalJob'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Run an evaluation on a benchmark.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateBenchmarkRequest'
required: true
/v1/eval/rows:
post:
responses:
'200':
@ -688,15 +795,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Evaluate a list of rows on a benchmark.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
description: Evaluate a list of rows on a candidate.
parameters: []
requestBody:
content:
application/json:
@ -1473,111 +1573,6 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
description: The status of the evaluation job.
content:
application/json:
schema:
oneOf:
- $ref: '#/components/schemas/JobStatus'
- type: 'null'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Get the status of a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to get the status of.
required: true
schema:
type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Cancel a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to cancel.
required: true
schema:
type: string
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
description: The result of the job.
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Get the result of a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to get the result of.
required: true
schema:
type: string
/v1/agents/{agent_id}/sessions:
get:
responses:
@ -2327,43 +2322,6 @@ paths:
schema:
$ref: '#/components/schemas/ResumeAgentTurnRequest'
required: true
/v1/eval/benchmarks/{benchmark_id}/jobs:
post:
responses:
'200':
description: >-
The job that was created to run the evaluation.
content:
application/json:
schema:
$ref: '#/components/schemas/Job'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Run an evaluation on a benchmark.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RunEvalRequest'
required: true
/v1/safety/run-shield:
post:
responses:
@ -2418,7 +2376,36 @@ paths:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
/v1/scoring/score:
/v1/scoring/jobs:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ScoringJob'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Scoring
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/ScoreDatasetRequest'
required: true
/v1/scoring/rows:
post:
responses:
'200':
@ -2446,36 +2433,7 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/ScoreRequest'
required: true
/v1/scoring/score-batch:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ScoreBatchResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Scoring
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/ScoreBatchRequest'
$ref: '#/components/schemas/ScoreRowsRequest'
required: true
/v1/post-training/supervised-fine-tune:
post:
@ -4415,6 +4373,99 @@ components:
- config
title: AgentCandidate
description: An agent candidate for evaluation.
EvalCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
- $ref: '#/components/schemas/AgentCandidate'
discriminator:
propertyName: type
mapping:
model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate'
ModelCandidate:
type: object
properties:
type:
type: string
const: model
default: model
model:
type: string
description: The model ID to evaluate.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: The sampling parameters for the model.
system_message:
$ref: '#/components/schemas/SystemMessage'
description: >-
(Optional) The system message providing instructions or context to the
model.
additionalProperties: false
required:
- type
- model
- sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
EvaluateBenchmarkRequest:
type: object
properties:
candidate:
$ref: '#/components/schemas/EvalCandidate'
description: >-
Candidate to evaluate on. Examples: { "type": "model", "model": "Llama-3.1-8B-Instruct",
"sampling_params": {...}, "system_message": "You are a helpful assistant."
} or { "type": "agent", "config": {...} }
additionalProperties: false
required:
- candidate
title: EvaluateBenchmarkRequest
EvalJob:
type: object
properties:
id:
type: string
description: The ID of the job.
status:
type: string
enum:
- completed
- in_progress
- failed
- scheduled
- cancelled
description: The status of the job.
created_at:
type: string
format: date-time
description: The time the job was created.
finished_at:
type: string
format: date-time
description: The time the job finished.
error:
type: string
description: >-
If status of the job is failed, this will contain the error message.
type:
type: string
const: eval
default: eval
result_files:
type: array
items:
type: string
additionalProperties: false
required:
- id
- status
- created_at
- type
- result_files
title: EvalJob
description: >-
The EvalJob object representing an evaluation job that was created through
the API.
AggregationFunctionType:
type: string
enum:
@ -4478,31 +4529,6 @@ components:
required:
- type
title: AnswerSimilarityScoringFnParams
BenchmarkConfig:
type: object
properties:
eval_candidate:
$ref: '#/components/schemas/EvalCandidate'
description: The candidate to evaluate.
scoring_params:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringFnParams'
description: >-
Map between scoring function id and parameters for each scoring function
you want to run
num_examples:
type: integer
description: >-
(Optional) The number of examples to evaluate. If not provided, all examples
in the dataset will be evaluated
additionalProperties: false
required:
- eval_candidate
- scoring_params
title: BenchmarkConfig
description: >-
A benchmark configuration for evaluation.
ContextEntityRecallScoringFnParams:
type: object
properties:
@ -4593,15 +4619,6 @@ components:
required:
- type
title: EqualityScoringFnParams
EvalCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
- $ref: '#/components/schemas/AgentCandidate'
discriminator:
propertyName: type
mapping:
model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate'
FactualityScoringFnParams:
type: object
properties:
@ -4662,31 +4679,6 @@ components:
- type
- judge_model
title: LLMAsJudgeScoringFnParams
ModelCandidate:
type: object
properties:
type:
type: string
const: model
default: model
model:
type: string
description: The model ID to evaluate.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: The sampling parameters for the model.
system_message:
$ref: '#/components/schemas/SystemMessage'
description: >-
(Optional) The system message providing instructions or context to the
model.
additionalProperties: false
required:
- type
- model
- sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
RegexParserMathScoringFnParams:
type: object
properties:
@ -4791,7 +4783,7 @@ components:
EvaluateRowsRequest:
type: object
properties:
input_rows:
dataset_rows:
type: array
items:
type: object
@ -4807,17 +4799,17 @@ components:
scoring_functions:
type: array
items:
type: string
$ref: '#/components/schemas/ScoringFnParams'
description: >-
The scoring functions to use for the evaluation.
benchmark_config:
$ref: '#/components/schemas/BenchmarkConfig'
description: The configuration for the benchmark.
candidate:
$ref: '#/components/schemas/EvalCandidate'
description: The candidate to evaluate on.
additionalProperties: false
required:
- input_rows
- dataset_rows
- scoring_functions
- benchmark_config
- candidate
title: EvaluateRowsRequest
EvaluateResponse:
type: object
@ -5475,21 +5467,20 @@ components:
- checkpoints
title: PostTrainingJobArtifactsResponse
description: Artifacts of a finetuning job.
JobStatus:
type: string
enum:
- completed
- in_progress
- failed
- scheduled
title: JobStatus
PostTrainingJobStatusResponse:
type: object
properties:
job_uuid:
type: string
status:
$ref: '#/components/schemas/JobStatus'
type: string
enum:
- completed
- in_progress
- failed
- scheduled
- cancelled
title: JobStatus
scheduled_at:
type: string
format: date-time
@ -6660,25 +6651,6 @@ components:
required:
- tool_responses
title: ResumeAgentTurnRequest
RunEvalRequest:
type: object
properties:
benchmark_config:
$ref: '#/components/schemas/BenchmarkConfig'
description: The configuration for the benchmark.
additionalProperties: false
required:
- benchmark_config
title: RunEvalRequest
Job:
type: object
properties:
job_id:
type: string
additionalProperties: false
required:
- job_id
title: Job
RunShieldRequest:
type: object
properties:
@ -6732,7 +6704,67 @@ components:
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
ScoreRequest:
ScoreDatasetRequest:
type: object
properties:
dataset_id:
type: string
scoring_functions:
type: array
items:
$ref: '#/components/schemas/ScoringFnParams'
additionalProperties: false
required:
- dataset_id
- scoring_functions
title: ScoreDatasetRequest
ScoringJob:
type: object
properties:
id:
type: string
description: The ID of the job.
status:
type: string
enum:
- completed
- in_progress
- failed
- scheduled
- cancelled
description: The status of the job.
created_at:
type: string
format: date-time
description: The time the job was created.
finished_at:
type: string
format: date-time
description: The time the job finished.
error:
type: string
description: >-
If status of the job is failed, this will contain the error message.
type:
type: string
const: scoring
default: scoring
result_files:
type: array
items:
type: string
additionalProperties: false
required:
- id
- status
- created_at
- type
- result_files
title: ScoringJob
description: >-
The ScoringJob object representing a scoring job that was created through
the API.
ScoreRowsRequest:
type: object
properties:
input_rows:
@ -6749,18 +6781,16 @@ components:
- type: object
description: The rows to score.
scoring_functions:
type: object
additionalProperties:
oneOf:
- $ref: '#/components/schemas/ScoringFnParams'
- type: 'null'
type: array
items:
$ref: '#/components/schemas/ScoringFnParams'
description: >-
The scoring functions to use for the scoring.
additionalProperties: false
required:
- input_rows
- scoring_functions
title: ScoreRequest
title: ScoreRowsRequest
ScoreResponse:
type: object
properties:
@ -6775,38 +6805,6 @@ components:
- results
title: ScoreResponse
description: The response from scoring.
ScoreBatchRequest:
type: object
properties:
dataset_id:
type: string
scoring_functions:
type: object
additionalProperties:
oneOf:
- $ref: '#/components/schemas/ScoringFnParams'
- type: 'null'
save_results_dataset:
type: boolean
additionalProperties: false
required:
- dataset_id
- scoring_functions
- save_results_dataset
title: ScoreBatchRequest
ScoreBatchResponse:
type: object
properties:
dataset_id:
type: string
results:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringResult'
additionalProperties: false
required:
- results
title: ScoreBatchResponse
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'