mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-13 05:17:26 +00:00
scoring job
This commit is contained in:
parent
f88755eb93
commit
83d8777f56
2 changed files with 729 additions and 762 deletions
859
docs/_static/llama-stack-spec.html
vendored
859
docs/_static/llama-stack-spec.html
vendored
|
@ -230,6 +230,108 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "EvalJob object indicating its status",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/EvalJob"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Scoring"
|
||||||
|
],
|
||||||
|
"description": "Get the EvalJob object for a given job id and benchmark id.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "benchmark_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The ID of the benchmark to run the evaluation on.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "job_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The ID of the job to get the status of.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"delete": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK"
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Scoring"
|
||||||
|
],
|
||||||
|
"description": "Cancel a job.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "benchmark_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The ID of the benchmark to run the evaluation on.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "job_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The ID of the job to cancel.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/post-training/job/cancel": {
|
"/v1/post-training/job/cancel": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -968,7 +1070,60 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/eval/benchmarks/{benchmark_id}/evaluations": {
|
"/v1/eval/benchmark/{benchmark_id}/jobs": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "The job that was created to run the evaluation.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/EvalJob"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Eval"
|
||||||
|
],
|
||||||
|
"description": "Run an evaluation on a benchmark.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "benchmark_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The ID of the benchmark to run the evaluation on.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/EvaluateBenchmarkRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/eval/rows": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
"200": {
|
"200": {
|
||||||
|
@ -997,18 +1152,8 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"Eval"
|
"Eval"
|
||||||
],
|
],
|
||||||
"description": "Evaluate a list of rows on a benchmark.",
|
"description": "Evaluate a list of rows on a candidate.",
|
||||||
"parameters": [
|
"parameters": [],
|
||||||
{
|
|
||||||
"name": "benchmark_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the benchmark to run the evaluation on.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"requestBody": {
|
"requestBody": {
|
||||||
"content": {
|
"content": {
|
||||||
"application/json": {
|
"application/json": {
|
||||||
|
@ -2194,160 +2339,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
|
|
||||||
"get": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "The status of the evaluationjob.",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/JobStatus"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Eval"
|
|
||||||
],
|
|
||||||
"description": "Get the status of a job.",
|
|
||||||
"parameters": [
|
|
||||||
{
|
|
||||||
"name": "benchmark_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the benchmark to run the evaluation on.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "job_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the job to get the status of.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"delete": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "OK"
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Eval"
|
|
||||||
],
|
|
||||||
"description": "Cancel a job.",
|
|
||||||
"parameters": [
|
|
||||||
{
|
|
||||||
"name": "benchmark_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the benchmark to run the evaluation on.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "job_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the job to cancel.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
|
|
||||||
"get": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "The result of the job.",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/EvaluateResponse"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Eval"
|
|
||||||
],
|
|
||||||
"description": "Get the result of a job.",
|
|
||||||
"parameters": [
|
|
||||||
{
|
|
||||||
"name": "benchmark_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the benchmark to run the evaluation on.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "job_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the job to get the result of.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/agents/{agent_id}/sessions": {
|
"/v1/agents/{agent_id}/sessions": {
|
||||||
"get": {
|
"get": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -3430,59 +3421,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/eval/benchmarks/{benchmark_id}/jobs": {
|
|
||||||
"post": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "The job that was created to run the evaluation.",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/Job"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Eval"
|
|
||||||
],
|
|
||||||
"description": "Run an evaluation on a benchmark.",
|
|
||||||
"parameters": [
|
|
||||||
{
|
|
||||||
"name": "benchmark_id",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The ID of the benchmark to run the evaluation on.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"requestBody": {
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/RunEvalRequest"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/safety/run-shield": {
|
"/v1/safety/run-shield": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -3562,7 +3500,50 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/scoring/score": {
|
"/v1/scoring/jobs": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ScoringJob"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Scoring"
|
||||||
|
],
|
||||||
|
"description": "",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ScoreDatasetRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/scoring/rows": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
"200": {
|
"200": {
|
||||||
|
@ -3597,50 +3578,7 @@
|
||||||
"content": {
|
"content": {
|
||||||
"application/json": {
|
"application/json": {
|
||||||
"schema": {
|
"schema": {
|
||||||
"$ref": "#/components/schemas/ScoreRequest"
|
"$ref": "#/components/schemas/ScoreRowsRequest"
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/scoring/score-batch": {
|
|
||||||
"post": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "OK",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/ScoreBatchResponse"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Scoring"
|
|
||||||
],
|
|
||||||
"description": "",
|
|
||||||
"parameters": [],
|
|
||||||
"requestBody": {
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/ScoreBatchRequest"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -6347,6 +6285,122 @@
|
||||||
"title": "AgentCandidate",
|
"title": "AgentCandidate",
|
||||||
"description": "An agent candidate for evaluation."
|
"description": "An agent candidate for evaluation."
|
||||||
},
|
},
|
||||||
|
"EvalCandidate": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/ModelCandidate"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/AgentCandidate"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"discriminator": {
|
||||||
|
"propertyName": "type",
|
||||||
|
"mapping": {
|
||||||
|
"model": "#/components/schemas/ModelCandidate",
|
||||||
|
"agent": "#/components/schemas/AgentCandidate"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ModelCandidate": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "model",
|
||||||
|
"default": "model"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The model ID to evaluate."
|
||||||
|
},
|
||||||
|
"sampling_params": {
|
||||||
|
"$ref": "#/components/schemas/SamplingParams",
|
||||||
|
"description": "The sampling parameters for the model."
|
||||||
|
},
|
||||||
|
"system_message": {
|
||||||
|
"$ref": "#/components/schemas/SystemMessage",
|
||||||
|
"description": "(Optional) The system message providing instructions or context to the model."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"type",
|
||||||
|
"model",
|
||||||
|
"sampling_params"
|
||||||
|
],
|
||||||
|
"title": "ModelCandidate",
|
||||||
|
"description": "A model candidate for evaluation."
|
||||||
|
},
|
||||||
|
"EvaluateBenchmarkRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"candidate": {
|
||||||
|
"$ref": "#/components/schemas/EvalCandidate",
|
||||||
|
"description": "Candidate to evaluate on. - { \"type\": \"model\", \"model\": \"Llama-3.1-8B-Instruct\", \"sampling_params\": {...}, \"system_message\": \"You are a helpful assistant.\", } - { \"type\": \"agent\", \"config\": {...}, }"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"candidate"
|
||||||
|
],
|
||||||
|
"title": "EvaluateBenchmarkRequest"
|
||||||
|
},
|
||||||
|
"EvalJob": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The ID of the job."
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"completed",
|
||||||
|
"in_progress",
|
||||||
|
"failed",
|
||||||
|
"scheduled",
|
||||||
|
"cancelled"
|
||||||
|
],
|
||||||
|
"description": "The status of the job."
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time",
|
||||||
|
"description": "The time the job was created."
|
||||||
|
},
|
||||||
|
"finished_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time",
|
||||||
|
"description": "The time the job finished."
|
||||||
|
},
|
||||||
|
"error": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "If status of the job is failed, this will contain the error message."
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "eval",
|
||||||
|
"default": "eval"
|
||||||
|
},
|
||||||
|
"result_files": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"status",
|
||||||
|
"created_at",
|
||||||
|
"type",
|
||||||
|
"result_files"
|
||||||
|
],
|
||||||
|
"title": "EvalJob",
|
||||||
|
"description": "The EvalJob object representing a evaluation job that was created through API."
|
||||||
|
},
|
||||||
"AggregationFunctionType": {
|
"AggregationFunctionType": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"enum": [
|
||||||
|
@ -6424,33 +6478,6 @@
|
||||||
],
|
],
|
||||||
"title": "AnswerSimilarityScoringFnParams"
|
"title": "AnswerSimilarityScoringFnParams"
|
||||||
},
|
},
|
||||||
"BenchmarkConfig": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"eval_candidate": {
|
|
||||||
"$ref": "#/components/schemas/EvalCandidate",
|
|
||||||
"description": "The candidate to evaluate."
|
|
||||||
},
|
|
||||||
"scoring_params": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"$ref": "#/components/schemas/ScoringFnParams"
|
|
||||||
},
|
|
||||||
"description": "Map between scoring function id and parameters for each scoring function you want to run"
|
|
||||||
},
|
|
||||||
"num_examples": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"eval_candidate",
|
|
||||||
"scoring_params"
|
|
||||||
],
|
|
||||||
"title": "BenchmarkConfig",
|
|
||||||
"description": "A benchmark configuration for evaluation."
|
|
||||||
},
|
|
||||||
"ContextEntityRecallScoringFnParams": {
|
"ContextEntityRecallScoringFnParams": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -6561,23 +6588,6 @@
|
||||||
],
|
],
|
||||||
"title": "EqualityScoringFnParams"
|
"title": "EqualityScoringFnParams"
|
||||||
},
|
},
|
||||||
"EvalCandidate": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/ModelCandidate"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/AgentCandidate"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"discriminator": {
|
|
||||||
"propertyName": "type",
|
|
||||||
"mapping": {
|
|
||||||
"model": "#/components/schemas/ModelCandidate",
|
|
||||||
"agent": "#/components/schemas/AgentCandidate"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"FactualityScoringFnParams": {
|
"FactualityScoringFnParams": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -6656,36 +6666,6 @@
|
||||||
],
|
],
|
||||||
"title": "LLMAsJudgeScoringFnParams"
|
"title": "LLMAsJudgeScoringFnParams"
|
||||||
},
|
},
|
||||||
"ModelCandidate": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "model",
|
|
||||||
"default": "model"
|
|
||||||
},
|
|
||||||
"model": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The model ID to evaluate."
|
|
||||||
},
|
|
||||||
"sampling_params": {
|
|
||||||
"$ref": "#/components/schemas/SamplingParams",
|
|
||||||
"description": "The sampling parameters for the model."
|
|
||||||
},
|
|
||||||
"system_message": {
|
|
||||||
"$ref": "#/components/schemas/SystemMessage",
|
|
||||||
"description": "(Optional) The system message providing instructions or context to the model."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"type",
|
|
||||||
"model",
|
|
||||||
"sampling_params"
|
|
||||||
],
|
|
||||||
"title": "ModelCandidate",
|
|
||||||
"description": "A model candidate for evaluation."
|
|
||||||
},
|
|
||||||
"RegexParserMathScoringFnParams": {
|
"RegexParserMathScoringFnParams": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -6836,7 +6816,7 @@
|
||||||
"EvaluateRowsRequest": {
|
"EvaluateRowsRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"input_rows": {
|
"dataset_rows": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
|
@ -6868,20 +6848,20 @@
|
||||||
"scoring_functions": {
|
"scoring_functions": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
"type": "string"
|
"$ref": "#/components/schemas/ScoringFnParams"
|
||||||
},
|
},
|
||||||
"description": "The scoring functions to use for the evaluation."
|
"description": "The scoring functions to use for the evaluation."
|
||||||
},
|
},
|
||||||
"benchmark_config": {
|
"candidate": {
|
||||||
"$ref": "#/components/schemas/BenchmarkConfig",
|
"$ref": "#/components/schemas/EvalCandidate",
|
||||||
"description": "The configuration for the benchmark."
|
"description": "The candidate to evaluate on."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"required": [
|
"required": [
|
||||||
"input_rows",
|
"dataset_rows",
|
||||||
"scoring_functions",
|
"scoring_functions",
|
||||||
"benchmark_config"
|
"candidate"
|
||||||
],
|
],
|
||||||
"title": "EvaluateRowsRequest"
|
"title": "EvaluateRowsRequest"
|
||||||
},
|
},
|
||||||
|
@ -7941,16 +7921,6 @@
|
||||||
"title": "PostTrainingJobArtifactsResponse",
|
"title": "PostTrainingJobArtifactsResponse",
|
||||||
"description": "Artifacts of a finetuning job."
|
"description": "Artifacts of a finetuning job."
|
||||||
},
|
},
|
||||||
"JobStatus": {
|
|
||||||
"type": "string",
|
|
||||||
"enum": [
|
|
||||||
"completed",
|
|
||||||
"in_progress",
|
|
||||||
"failed",
|
|
||||||
"scheduled"
|
|
||||||
],
|
|
||||||
"title": "JobStatus"
|
|
||||||
},
|
|
||||||
"PostTrainingJobStatusResponse": {
|
"PostTrainingJobStatusResponse": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -7958,7 +7928,15 @@
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
},
|
||||||
"status": {
|
"status": {
|
||||||
"$ref": "#/components/schemas/JobStatus"
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"completed",
|
||||||
|
"in_progress",
|
||||||
|
"failed",
|
||||||
|
"scheduled",
|
||||||
|
"cancelled"
|
||||||
|
],
|
||||||
|
"title": "JobStatus"
|
||||||
},
|
},
|
||||||
"scheduled_at": {
|
"scheduled_at": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
|
@ -9796,33 +9774,6 @@
|
||||||
],
|
],
|
||||||
"title": "ResumeAgentTurnRequest"
|
"title": "ResumeAgentTurnRequest"
|
||||||
},
|
},
|
||||||
"RunEvalRequest": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"benchmark_config": {
|
|
||||||
"$ref": "#/components/schemas/BenchmarkConfig",
|
|
||||||
"description": "The configuration for the benchmark."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"benchmark_config"
|
|
||||||
],
|
|
||||||
"title": "RunEvalRequest"
|
|
||||||
},
|
|
||||||
"Job": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"job_id": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"job_id"
|
|
||||||
],
|
|
||||||
"title": "Job"
|
|
||||||
},
|
|
||||||
"RunShieldRequest": {
|
"RunShieldRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -9909,7 +9860,82 @@
|
||||||
],
|
],
|
||||||
"title": "SaveSpansToDatasetRequest"
|
"title": "SaveSpansToDatasetRequest"
|
||||||
},
|
},
|
||||||
"ScoreRequest": {
|
"ScoreDatasetRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"dataset_id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"scoring_functions": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ScoringFnParams"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"dataset_id",
|
||||||
|
"scoring_functions"
|
||||||
|
],
|
||||||
|
"title": "ScoreDatasetRequest"
|
||||||
|
},
|
||||||
|
"ScoringJob": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The ID of the job."
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"completed",
|
||||||
|
"in_progress",
|
||||||
|
"failed",
|
||||||
|
"scheduled",
|
||||||
|
"cancelled"
|
||||||
|
],
|
||||||
|
"description": "The status of the job."
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time",
|
||||||
|
"description": "The time the job was created."
|
||||||
|
},
|
||||||
|
"finished_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time",
|
||||||
|
"description": "The time the job finished."
|
||||||
|
},
|
||||||
|
"error": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "If status of the job is failed, this will contain the error message."
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "scoring",
|
||||||
|
"default": "scoring"
|
||||||
|
},
|
||||||
|
"result_files": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"status",
|
||||||
|
"created_at",
|
||||||
|
"type",
|
||||||
|
"result_files"
|
||||||
|
],
|
||||||
|
"title": "ScoringJob",
|
||||||
|
"description": "The ScoringJob object representing a scoring job that was created through API."
|
||||||
|
},
|
||||||
|
"ScoreRowsRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"input_rows": {
|
"input_rows": {
|
||||||
|
@ -9942,17 +9968,10 @@
|
||||||
"description": "The rows to score."
|
"description": "The rows to score."
|
||||||
},
|
},
|
||||||
"scoring_functions": {
|
"scoring_functions": {
|
||||||
"type": "object",
|
"type": "array",
|
||||||
"additionalProperties": {
|
"items": {
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/ScoringFnParams"
|
"$ref": "#/components/schemas/ScoringFnParams"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"description": "The scoring functions to use for the scoring."
|
"description": "The scoring functions to use for the scoring."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -9961,7 +9980,7 @@
|
||||||
"input_rows",
|
"input_rows",
|
||||||
"scoring_functions"
|
"scoring_functions"
|
||||||
],
|
],
|
||||||
"title": "ScoreRequest"
|
"title": "ScoreRowsRequest"
|
||||||
},
|
},
|
||||||
"ScoreResponse": {
|
"ScoreResponse": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
|
@ -9981,56 +10000,6 @@
|
||||||
"title": "ScoreResponse",
|
"title": "ScoreResponse",
|
||||||
"description": "The response from scoring."
|
"description": "The response from scoring."
|
||||||
},
|
},
|
||||||
"ScoreBatchRequest": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"dataset_id": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"scoring_functions": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/ScoringFnParams"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"save_results_dataset": {
|
|
||||||
"type": "boolean"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"dataset_id",
|
|
||||||
"scoring_functions",
|
|
||||||
"save_results_dataset"
|
|
||||||
],
|
|
||||||
"title": "ScoreBatchRequest"
|
|
||||||
},
|
|
||||||
"ScoreBatchResponse": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"dataset_id": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"results": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"$ref": "#/components/schemas/ScoringResult"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"results"
|
|
||||||
],
|
|
||||||
"title": "ScoreBatchResponse"
|
|
||||||
},
|
|
||||||
"AlgorithmConfig": {
|
"AlgorithmConfig": {
|
||||||
"oneOf": [
|
"oneOf": [
|
||||||
{
|
{
|
||||||
|
|
630
docs/_static/llama-stack-spec.yaml
vendored
630
docs/_static/llama-stack-spec.yaml
vendored
|
@ -142,6 +142,76 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/BatchCompletionRequest'
|
$ref: '#/components/schemas/BatchCompletionRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: EvalJob object indicating its status
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/EvalJob'
|
||||||
|
- type: 'null'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Scoring
|
||||||
|
description: >-
|
||||||
|
Get the EvalJob object for a given job id and benchmark id.
|
||||||
|
parameters:
|
||||||
|
- name: benchmark_id
|
||||||
|
in: path
|
||||||
|
description: >-
|
||||||
|
The ID of the benchmark to run the evaluation on.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: job_id
|
||||||
|
in: path
|
||||||
|
description: The ID of the job to get the status of.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
delete:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Scoring
|
||||||
|
description: Cancel a job.
|
||||||
|
parameters:
|
||||||
|
- name: benchmark_id
|
||||||
|
in: path
|
||||||
|
description: >-
|
||||||
|
The ID of the benchmark to run the evaluation on.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: job_id
|
||||||
|
in: path
|
||||||
|
description: The ID of the job to cancel.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
/v1/post-training/job/cancel:
|
/v1/post-training/job/cancel:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -666,7 +736,44 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/EmbeddingsRequest'
|
$ref: '#/components/schemas/EmbeddingsRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/eval/benchmarks/{benchmark_id}/evaluations:
|
/v1/eval/benchmark/{benchmark_id}/jobs:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
The job that was created to run the evaluation.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/EvalJob'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Eval
|
||||||
|
description: Run an evaluation on a benchmark.
|
||||||
|
parameters:
|
||||||
|
- name: benchmark_id
|
||||||
|
in: path
|
||||||
|
description: >-
|
||||||
|
The ID of the benchmark to run the evaluation on.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/EvaluateBenchmarkRequest'
|
||||||
|
required: true
|
||||||
|
/v1/eval/rows:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
'200':
|
'200':
|
||||||
|
@ -688,15 +795,8 @@ paths:
|
||||||
$ref: '#/components/responses/DefaultError'
|
$ref: '#/components/responses/DefaultError'
|
||||||
tags:
|
tags:
|
||||||
- Eval
|
- Eval
|
||||||
description: Evaluate a list of rows on a benchmark.
|
description: Evaluate a list of rows on a candidate.
|
||||||
parameters:
|
parameters: []
|
||||||
- name: benchmark_id
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The ID of the benchmark to run the evaluation on.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
|
@ -1473,111 +1573,6 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/InvokeToolRequest'
|
$ref: '#/components/schemas/InvokeToolRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
|
|
||||||
get:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: The status of the evaluationjob.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/JobStatus'
|
|
||||||
- type: 'null'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Eval
|
|
||||||
description: Get the status of a job.
|
|
||||||
parameters:
|
|
||||||
- name: benchmark_id
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The ID of the benchmark to run the evaluation on.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
- name: job_id
|
|
||||||
in: path
|
|
||||||
description: The ID of the job to get the status of.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
delete:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: OK
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Eval
|
|
||||||
description: Cancel a job.
|
|
||||||
parameters:
|
|
||||||
- name: benchmark_id
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The ID of the benchmark to run the evaluation on.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
- name: job_id
|
|
||||||
in: path
|
|
||||||
description: The ID of the job to cancel.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
|
|
||||||
get:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: The result of the job.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/EvaluateResponse'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Eval
|
|
||||||
description: Get the result of a job.
|
|
||||||
parameters:
|
|
||||||
- name: benchmark_id
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The ID of the benchmark to run the evaluation on.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
- name: job_id
|
|
||||||
in: path
|
|
||||||
description: The ID of the job to get the result of.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
/v1/agents/{agent_id}/sessions:
|
/v1/agents/{agent_id}/sessions:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -2327,43 +2322,6 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/ResumeAgentTurnRequest'
|
$ref: '#/components/schemas/ResumeAgentTurnRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/eval/benchmarks/{benchmark_id}/jobs:
|
|
||||||
post:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: >-
|
|
||||||
The job that was created to run the evaluation.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/Job'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Eval
|
|
||||||
description: Run an evaluation on a benchmark.
|
|
||||||
parameters:
|
|
||||||
- name: benchmark_id
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The ID of the benchmark to run the evaluation on.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
requestBody:
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/RunEvalRequest'
|
|
||||||
required: true
|
|
||||||
/v1/safety/run-shield:
|
/v1/safety/run-shield:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -2418,7 +2376,36 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
|
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/scoring/score:
|
/v1/scoring/jobs:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/ScoringJob'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Scoring
|
||||||
|
description: ''
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/ScoreDatasetRequest'
|
||||||
|
required: true
|
||||||
|
/v1/scoring/rows:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
'200':
|
'200':
|
||||||
|
@ -2446,36 +2433,7 @@ paths:
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/ScoreRequest'
|
$ref: '#/components/schemas/ScoreRowsRequest'
|
||||||
required: true
|
|
||||||
/v1/scoring/score-batch:
|
|
||||||
post:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: OK
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/ScoreBatchResponse'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Scoring
|
|
||||||
description: ''
|
|
||||||
parameters: []
|
|
||||||
requestBody:
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/ScoreBatchRequest'
|
|
||||||
required: true
|
required: true
|
||||||
/v1/post-training/supervised-fine-tune:
|
/v1/post-training/supervised-fine-tune:
|
||||||
post:
|
post:
|
||||||
|
@ -4415,6 +4373,99 @@ components:
|
||||||
- config
|
- config
|
||||||
title: AgentCandidate
|
title: AgentCandidate
|
||||||
description: An agent candidate for evaluation.
|
description: An agent candidate for evaluation.
|
||||||
|
EvalCandidate:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/ModelCandidate'
|
||||||
|
- $ref: '#/components/schemas/AgentCandidate'
|
||||||
|
discriminator:
|
||||||
|
propertyName: type
|
||||||
|
mapping:
|
||||||
|
model: '#/components/schemas/ModelCandidate'
|
||||||
|
agent: '#/components/schemas/AgentCandidate'
|
||||||
|
ModelCandidate:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: model
|
||||||
|
default: model
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: The model ID to evaluate.
|
||||||
|
sampling_params:
|
||||||
|
$ref: '#/components/schemas/SamplingParams'
|
||||||
|
description: The sampling parameters for the model.
|
||||||
|
system_message:
|
||||||
|
$ref: '#/components/schemas/SystemMessage'
|
||||||
|
description: >-
|
||||||
|
(Optional) The system message providing instructions or context to the
|
||||||
|
model.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- model
|
||||||
|
- sampling_params
|
||||||
|
title: ModelCandidate
|
||||||
|
description: A model candidate for evaluation.
|
||||||
|
EvaluateBenchmarkRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
candidate:
|
||||||
|
$ref: '#/components/schemas/EvalCandidate'
|
||||||
|
description: >-
|
||||||
|
Candidate to evaluate on. - { "type": "model", "model": "Llama-3.1-8B-Instruct",
|
||||||
|
"sampling_params": {...}, "system_message": "You are a helpful assistant.",
|
||||||
|
} - { "type": "agent", "config": {...}, }
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- candidate
|
||||||
|
title: EvaluateBenchmarkRequest
|
||||||
|
EvalJob:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the job.
|
||||||
|
status:
|
||||||
|
type: string
|
||||||
|
enum:
|
||||||
|
- completed
|
||||||
|
- in_progress
|
||||||
|
- failed
|
||||||
|
- scheduled
|
||||||
|
- cancelled
|
||||||
|
description: The status of the job.
|
||||||
|
created_at:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
description: The time the job was created.
|
||||||
|
finished_at:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
description: The time the job finished.
|
||||||
|
error:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
If status of the job is failed, this will contain the error message.
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: eval
|
||||||
|
default: eval
|
||||||
|
result_files:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- status
|
||||||
|
- created_at
|
||||||
|
- type
|
||||||
|
- result_files
|
||||||
|
title: EvalJob
|
||||||
|
description: >-
|
||||||
|
The EvalJob object representing a evaluation job that was created through
|
||||||
|
API.
|
||||||
AggregationFunctionType:
|
AggregationFunctionType:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
|
@ -4478,31 +4529,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- type
|
- type
|
||||||
title: AnswerSimilarityScoringFnParams
|
title: AnswerSimilarityScoringFnParams
|
||||||
BenchmarkConfig:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
eval_candidate:
|
|
||||||
$ref: '#/components/schemas/EvalCandidate'
|
|
||||||
description: The candidate to evaluate.
|
|
||||||
scoring_params:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
$ref: '#/components/schemas/ScoringFnParams'
|
|
||||||
description: >-
|
|
||||||
Map between scoring function id and parameters for each scoring function
|
|
||||||
you want to run
|
|
||||||
num_examples:
|
|
||||||
type: integer
|
|
||||||
description: >-
|
|
||||||
(Optional) The number of examples to evaluate. If not provided, all examples
|
|
||||||
in the dataset will be evaluated
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- eval_candidate
|
|
||||||
- scoring_params
|
|
||||||
title: BenchmarkConfig
|
|
||||||
description: >-
|
|
||||||
A benchmark configuration for evaluation.
|
|
||||||
ContextEntityRecallScoringFnParams:
|
ContextEntityRecallScoringFnParams:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -4593,15 +4619,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- type
|
- type
|
||||||
title: EqualityScoringFnParams
|
title: EqualityScoringFnParams
|
||||||
EvalCandidate:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/ModelCandidate'
|
|
||||||
- $ref: '#/components/schemas/AgentCandidate'
|
|
||||||
discriminator:
|
|
||||||
propertyName: type
|
|
||||||
mapping:
|
|
||||||
model: '#/components/schemas/ModelCandidate'
|
|
||||||
agent: '#/components/schemas/AgentCandidate'
|
|
||||||
FactualityScoringFnParams:
|
FactualityScoringFnParams:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -4662,31 +4679,6 @@ components:
|
||||||
- type
|
- type
|
||||||
- judge_model
|
- judge_model
|
||||||
title: LLMAsJudgeScoringFnParams
|
title: LLMAsJudgeScoringFnParams
|
||||||
ModelCandidate:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: model
|
|
||||||
default: model
|
|
||||||
model:
|
|
||||||
type: string
|
|
||||||
description: The model ID to evaluate.
|
|
||||||
sampling_params:
|
|
||||||
$ref: '#/components/schemas/SamplingParams'
|
|
||||||
description: The sampling parameters for the model.
|
|
||||||
system_message:
|
|
||||||
$ref: '#/components/schemas/SystemMessage'
|
|
||||||
description: >-
|
|
||||||
(Optional) The system message providing instructions or context to the
|
|
||||||
model.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- model
|
|
||||||
- sampling_params
|
|
||||||
title: ModelCandidate
|
|
||||||
description: A model candidate for evaluation.
|
|
||||||
RegexParserMathScoringFnParams:
|
RegexParserMathScoringFnParams:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -4791,7 +4783,7 @@ components:
|
||||||
EvaluateRowsRequest:
|
EvaluateRowsRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
input_rows:
|
dataset_rows:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: object
|
type: object
|
||||||
|
@ -4807,17 +4799,17 @@ components:
|
||||||
scoring_functions:
|
scoring_functions:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: string
|
$ref: '#/components/schemas/ScoringFnParams'
|
||||||
description: >-
|
description: >-
|
||||||
The scoring functions to use for the evaluation.
|
The scoring functions to use for the evaluation.
|
||||||
benchmark_config:
|
candidate:
|
||||||
$ref: '#/components/schemas/BenchmarkConfig'
|
$ref: '#/components/schemas/EvalCandidate'
|
||||||
description: The configuration for the benchmark.
|
description: The candidate to evaluate on.
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- input_rows
|
- dataset_rows
|
||||||
- scoring_functions
|
- scoring_functions
|
||||||
- benchmark_config
|
- candidate
|
||||||
title: EvaluateRowsRequest
|
title: EvaluateRowsRequest
|
||||||
EvaluateResponse:
|
EvaluateResponse:
|
||||||
type: object
|
type: object
|
||||||
|
@ -5475,21 +5467,20 @@ components:
|
||||||
- checkpoints
|
- checkpoints
|
||||||
title: PostTrainingJobArtifactsResponse
|
title: PostTrainingJobArtifactsResponse
|
||||||
description: Artifacts of a finetuning job.
|
description: Artifacts of a finetuning job.
|
||||||
JobStatus:
|
|
||||||
type: string
|
|
||||||
enum:
|
|
||||||
- completed
|
|
||||||
- in_progress
|
|
||||||
- failed
|
|
||||||
- scheduled
|
|
||||||
title: JobStatus
|
|
||||||
PostTrainingJobStatusResponse:
|
PostTrainingJobStatusResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
job_uuid:
|
job_uuid:
|
||||||
type: string
|
type: string
|
||||||
status:
|
status:
|
||||||
$ref: '#/components/schemas/JobStatus'
|
type: string
|
||||||
|
enum:
|
||||||
|
- completed
|
||||||
|
- in_progress
|
||||||
|
- failed
|
||||||
|
- scheduled
|
||||||
|
- cancelled
|
||||||
|
title: JobStatus
|
||||||
scheduled_at:
|
scheduled_at:
|
||||||
type: string
|
type: string
|
||||||
format: date-time
|
format: date-time
|
||||||
|
@ -6660,25 +6651,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- tool_responses
|
- tool_responses
|
||||||
title: ResumeAgentTurnRequest
|
title: ResumeAgentTurnRequest
|
||||||
RunEvalRequest:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
benchmark_config:
|
|
||||||
$ref: '#/components/schemas/BenchmarkConfig'
|
|
||||||
description: The configuration for the benchmark.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- benchmark_config
|
|
||||||
title: RunEvalRequest
|
|
||||||
Job:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
job_id:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- job_id
|
|
||||||
title: Job
|
|
||||||
RunShieldRequest:
|
RunShieldRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -6732,7 +6704,67 @@ components:
|
||||||
- attributes_to_save
|
- attributes_to_save
|
||||||
- dataset_id
|
- dataset_id
|
||||||
title: SaveSpansToDatasetRequest
|
title: SaveSpansToDatasetRequest
|
||||||
ScoreRequest:
|
ScoreDatasetRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
dataset_id:
|
||||||
|
type: string
|
||||||
|
scoring_functions:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/ScoringFnParams'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- dataset_id
|
||||||
|
- scoring_functions
|
||||||
|
title: ScoreDatasetRequest
|
||||||
|
ScoringJob:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the job.
|
||||||
|
status:
|
||||||
|
type: string
|
||||||
|
enum:
|
||||||
|
- completed
|
||||||
|
- in_progress
|
||||||
|
- failed
|
||||||
|
- scheduled
|
||||||
|
- cancelled
|
||||||
|
description: The status of the job.
|
||||||
|
created_at:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
description: The time the job was created.
|
||||||
|
finished_at:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
description: The time the job finished.
|
||||||
|
error:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
If status of the job is failed, this will contain the error message.
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: scoring
|
||||||
|
default: scoring
|
||||||
|
result_files:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- status
|
||||||
|
- created_at
|
||||||
|
- type
|
||||||
|
- result_files
|
||||||
|
title: ScoringJob
|
||||||
|
description: >-
|
||||||
|
The ScoringJob object representing a scoring job that was created through
|
||||||
|
API.
|
||||||
|
ScoreRowsRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
input_rows:
|
input_rows:
|
||||||
|
@ -6749,18 +6781,16 @@ components:
|
||||||
- type: object
|
- type: object
|
||||||
description: The rows to score.
|
description: The rows to score.
|
||||||
scoring_functions:
|
scoring_functions:
|
||||||
type: object
|
type: array
|
||||||
additionalProperties:
|
items:
|
||||||
oneOf:
|
$ref: '#/components/schemas/ScoringFnParams'
|
||||||
- $ref: '#/components/schemas/ScoringFnParams'
|
|
||||||
- type: 'null'
|
|
||||||
description: >-
|
description: >-
|
||||||
The scoring functions to use for the scoring.
|
The scoring functions to use for the scoring.
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- input_rows
|
- input_rows
|
||||||
- scoring_functions
|
- scoring_functions
|
||||||
title: ScoreRequest
|
title: ScoreRowsRequest
|
||||||
ScoreResponse:
|
ScoreResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -6775,38 +6805,6 @@ components:
|
||||||
- results
|
- results
|
||||||
title: ScoreResponse
|
title: ScoreResponse
|
||||||
description: The response from scoring.
|
description: The response from scoring.
|
||||||
ScoreBatchRequest:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
dataset_id:
|
|
||||||
type: string
|
|
||||||
scoring_functions:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/ScoringFnParams'
|
|
||||||
- type: 'null'
|
|
||||||
save_results_dataset:
|
|
||||||
type: boolean
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- dataset_id
|
|
||||||
- scoring_functions
|
|
||||||
- save_results_dataset
|
|
||||||
title: ScoreBatchRequest
|
|
||||||
ScoreBatchResponse:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
dataset_id:
|
|
||||||
type: string
|
|
||||||
results:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
$ref: '#/components/schemas/ScoringResult'
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- results
|
|
||||||
title: ScoreBatchResponse
|
|
||||||
AlgorithmConfig:
|
AlgorithmConfig:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue