From 83d8777f569fc7c4d10b7508075f4574fb8a8811 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 01:16:37 -0700 Subject: [PATCH] scoring job --- docs/_static/llama-stack-spec.html | 861 ++++++++++++++--------------- docs/_static/llama-stack-spec.yaml | 630 +++++++++++---------- 2 files changed, 729 insertions(+), 762 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 817a65ca8..a472df96b 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -230,6 +230,108 @@ } } }, + "/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "EvalJob object indicating its status", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/EvalJob" + }, + { + "type": "null" + } + ] + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Scoring" + ], + "description": "Get the EvalJob object for a given job id and benchmark id.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "description": "The ID of the job to get the status of.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + 
"tags": [ + "Scoring" + ], + "description": "Cancel a job.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "description": "The ID of the job to cancel.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/post-training/job/cancel": { "post": { "responses": { @@ -968,7 +1070,60 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/evaluations": { + "/v1/eval/benchmark/{benchmark_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "The job that was created to run the evaluation.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvalJob" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Run an evaluation on a benchmark.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateBenchmarkRequest" + } + } + }, + "required": true + } + } + }, + "/v1/eval/rows": { "post": { "responses": { "200": { @@ -997,18 +1152,8 @@ "tags": [ "Eval" ], - "description": "Evaluate a list of rows on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], + "description": "Evaluate a list of rows on a candidate.", + "parameters": [], 
"requestBody": { "content": { "application/json": { @@ -2194,160 +2339,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { - "get": { - "responses": { - "200": { - "description": "The status of the evaluationjob.", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/JobStatus" - }, - { - "type": "null" - } - ] - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the status of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the status of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Cancel a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to cancel.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - 
"description": "The result of the job.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the result of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the result of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/agents/{agent_id}/sessions": { "get": { "responses": { @@ -3430,59 +3421,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs": { - "post": { - "responses": { - "200": { - "description": "The job that was created to run the evaluation.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Job" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Run an evaluation on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunEvalRequest" - } - } - }, - "required": true - } - } - }, "/v1/safety/run-shield": { "post": { 
"responses": { @@ -3562,7 +3500,50 @@ } } }, - "/v1/scoring/score": { + "/v1/scoring/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoringJob" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Scoring" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreDatasetRequest" + } + } + }, + "required": true + } + } + }, + "/v1/scoring/rows": { "post": { "responses": { "200": { @@ -3597,50 +3578,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ScoreRequest" - } - } - }, - "required": true - } - } - }, - "/v1/scoring/score-batch": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchRequest" + "$ref": "#/components/schemas/ScoreRowsRequest" } } }, @@ -6347,6 +6285,122 @@ "title": "AgentCandidate", "description": "An agent candidate for evaluation." 
}, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model": { + "type": "string", + "description": "The model ID to evaluate." + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "The sampling parameters for the model." + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage", + "description": "(Optional) The system message providing instructions or context to the model." + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ], + "title": "ModelCandidate", + "description": "A model candidate for evaluation." + }, + "EvaluateBenchmarkRequest": { + "type": "object", + "properties": { + "candidate": { + "$ref": "#/components/schemas/EvalCandidate", + "description": "Candidate to evaluate on. - { \"type\": \"model\", \"model\": \"Llama-3.1-8B-Instruct\", \"sampling_params\": {...}, \"system_message\": \"You are a helpful assistant.\", } - { \"type\": \"agent\", \"config\": {...}, }" + } + }, + "additionalProperties": false, + "required": [ + "candidate" + ], + "title": "EvaluateBenchmarkRequest" + }, + "EvalJob": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the job." + }, + "status": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "description": "The status of the job." + }, + "created_at": { + "type": "string", + "format": "date-time", + "description": "The time the job was created." 
+ }, + "finished_at": { + "type": "string", + "format": "date-time", + "description": "The time the job finished." + }, + "error": { + "type": "string", + "description": "If status of the job is failed, this will contain the error message." + }, + "type": { + "type": "string", + "const": "eval", + "default": "eval" + }, + "result_files": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "created_at", + "type", + "result_files" + ], + "title": "EvalJob", + "description": "The EvalJob object representing an evaluation job that was created through API." + }, "AggregationFunctionType": { "type": "string", "enum": [ @@ -6424,33 +6478,6 @@ ], "title": "AnswerSimilarityScoringFnParams" }, - "BenchmarkConfig": { - "type": "object", - "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate", - "description": "The candidate to evaluate." - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - }, - "description": "Map between scoring function id and parameters for each scoring function you want to run" - }, - "num_examples": { - "type": "integer", - "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ], - "title": "BenchmarkConfig", - "description": "A benchmark configuration for evaluation." 
- }, "ContextEntityRecallScoringFnParams": { "type": "object", "properties": { @@ -6561,23 +6588,6 @@ ], "title": "EqualityScoringFnParams" }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" - }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, "FactualityScoringFnParams": { "type": "object", "properties": { @@ -6656,36 +6666,6 @@ ], "title": "LLMAsJudgeScoringFnParams" }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { - "type": "string", - "description": "The model ID to evaluate." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "The sampling parameters for the model." - }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage", - "description": "(Optional) The system message providing instructions or context to the model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "model", - "sampling_params" - ], - "title": "ModelCandidate", - "description": "A model candidate for evaluation." - }, "RegexParserMathScoringFnParams": { "type": "object", "properties": { @@ -6836,7 +6816,7 @@ "EvaluateRowsRequest": { "type": "object", "properties": { - "input_rows": { + "dataset_rows": { "type": "array", "items": { "type": "object", @@ -6868,20 +6848,20 @@ "scoring_functions": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/ScoringFnParams" }, "description": "The scoring functions to use for the evaluation." }, - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." 
+ "candidate": { + "$ref": "#/components/schemas/EvalCandidate", + "description": "The candidate to evaluate on." } }, "additionalProperties": false, "required": [ - "input_rows", + "dataset_rows", "scoring_functions", - "benchmark_config" + "candidate" ], "title": "EvaluateRowsRequest" }, @@ -7941,16 +7921,6 @@ "title": "PostTrainingJobArtifactsResponse", "description": "Artifacts of a finetuning job." }, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ], - "title": "JobStatus" - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -7958,7 +7928,15 @@ "type": "string" }, "status": { - "$ref": "#/components/schemas/JobStatus" + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "title": "JobStatus" }, "scheduled_at": { "type": "string", @@ -9796,33 +9774,6 @@ ], "title": "ResumeAgentTurnRequest" }, - "RunEvalRequest": { - "type": "object", - "properties": { - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." 
- } - }, - "additionalProperties": false, - "required": [ - "benchmark_config" - ], - "title": "RunEvalRequest" - }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ], - "title": "Job" - }, "RunShieldRequest": { "type": "object", "properties": { @@ -9909,7 +9860,82 @@ ], "title": "SaveSpansToDatasetRequest" }, - "ScoreRequest": { + "ScoreDatasetRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoringFnParams" + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "scoring_functions" + ], + "title": "ScoreDatasetRequest" + }, + "ScoringJob": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the job." + }, + "status": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "description": "The status of the job." + }, + "created_at": { + "type": "string", + "format": "date-time", + "description": "The time the job was created." + }, + "finished_at": { + "type": "string", + "format": "date-time", + "description": "The time the job finished." + }, + "error": { + "type": "string", + "description": "If status of the job is failed, this will contain the error message." + }, + "type": { + "type": "string", + "const": "scoring", + "default": "scoring" + }, + "result_files": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "created_at", + "type", + "result_files" + ], + "title": "ScoringJob", + "description": "The ScoringJob object representing a scoring job that was created through API." + }, + "ScoreRowsRequest": { "type": "object", "properties": { "input_rows": { @@ -9942,16 +9968,9 @@ "description": "The rows to score." 
}, "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoringFnParams" }, "description": "The scoring functions to use for the scoring." } @@ -9961,7 +9980,7 @@ "input_rows", "scoring_functions" ], - "title": "ScoreRequest" + "title": "ScoreRowsRequest" }, "ScoreResponse": { "type": "object", @@ -9981,56 +10000,6 @@ "title": "ScoreResponse", "description": "The response from scoring." }, - "ScoreBatchRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - } - }, - "save_results_dataset": { - "type": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "scoring_functions", - "save_results_dataset" - ], - "title": "ScoreBatchRequest" - }, - "ScoreBatchResponse": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreBatchResponse" - }, "AlgorithmConfig": { "oneOf": [ { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 62fb02651..39336c4e4 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -142,6 +142,76 @@ paths: schema: $ref: '#/components/schemas/BatchCompletionRequest' required: true + /v1/eval/benchmark/{benchmark_id}/jobs/{job_id}: + get: + responses: + '200': + description: EvalJob object indicating its status + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/EvalJob' + - type: 'null' + '400': + $ref: 
'#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: >- + Get the EvalJob object for a given job id and benchmark id. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + - name: job_id + in: path + description: The ID of the job to get the status of. + required: true + schema: + type: string + delete: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: Cancel a job. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + - name: job_id + in: path + description: The ID of the job to cancel. + required: true + schema: + type: string /v1/post-training/job/cancel: post: responses: @@ -666,7 +736,44 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/evaluations: + /v1/eval/benchmark/{benchmark_id}/jobs: + post: + responses: + '200': + description: >- + The job that was created to run the evaluation. + content: + application/json: + schema: + $ref: '#/components/schemas/EvalJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Run an evaluation on a benchmark. 
+ parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateBenchmarkRequest' + required: true + /v1/eval/rows: post: responses: '200': @@ -688,15 +795,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Eval - description: Evaluate a list of rows on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string + description: Evaluate a list of rows on a candidate. + parameters: [] requestBody: content: application/json: @@ -1473,111 +1573,6 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: - get: - responses: - '200': - description: The status of the evaluationjob. - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/JobStatus' - - type: 'null' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the status of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the status of. 
- required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Cancel a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to cancel. - required: true - schema: - type: string - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: The result of the job. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the result of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the result of. - required: true - schema: - type: string /v1/agents/{agent_id}/sessions: get: responses: @@ -2327,43 +2322,6 @@ paths: schema: $ref: '#/components/schemas/ResumeAgentTurnRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/jobs: - post: - responses: - '200': - description: >- - The job that was created to run the evaluation. 
- content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Run an evaluation on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalRequest' - required: true /v1/safety/run-shield: post: responses: @@ -2418,7 +2376,36 @@ paths: schema: $ref: '#/components/schemas/SaveSpansToDatasetRequest' required: true - /v1/scoring/score: + /v1/scoring/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ScoringJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreDatasetRequest' + required: true + /v1/scoring/rows: post: responses: '200': @@ -2446,36 +2433,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ScoreRequest' - required: true - /v1/scoring/score-batch: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: 
- - Scoring - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchRequest' + $ref: '#/components/schemas/ScoreRowsRequest' required: true /v1/post-training/supervised-fine-tune: post: @@ -4415,6 +4373,99 @@ components: - config title: AgentCandidate description: An agent candidate for evaluation. + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + description: The model ID to evaluate. + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: The sampling parameters for the model. + system_message: + $ref: '#/components/schemas/SystemMessage' + description: >- + (Optional) The system message providing instructions or context to the + model. + additionalProperties: false + required: + - type + - model + - sampling_params + title: ModelCandidate + description: A model candidate for evaluation. + EvaluateBenchmarkRequest: + type: object + properties: + candidate: + $ref: '#/components/schemas/EvalCandidate' + description: >- + Candidate to evaluate on. - { "type": "model", "model": "Llama-3.1-8B-Instruct", + "sampling_params": {...}, "system_message": "You are a helpful assistant.", + } - { "type": "agent", "config": {...}, } + additionalProperties: false + required: + - candidate + title: EvaluateBenchmarkRequest + EvalJob: + type: object + properties: + id: + type: string + description: The ID of the job. + status: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + description: The status of the job. + created_at: + type: string + format: date-time + description: The time the job was created. 
+ finished_at: + type: string + format: date-time + description: The time the job finished. + error: + type: string + description: >- + If status of the job is failed, this will contain the error message. + type: + type: string + const: eval + default: eval + result_files: + type: array + items: + type: string + additionalProperties: false + required: + - id + - status + - created_at + - type + - result_files + title: EvalJob + description: >- + The EvalJob object representing an evaluation job that was created through + API. AggregationFunctionType: type: string enum: @@ -4478,31 +4529,6 @@ components: required: - type title: AnswerSimilarityScoringFnParams - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - description: The candidate to evaluate. - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - description: >- - Map between scoring function id and parameters for each scoring function - you want to run - num_examples: - type: integer - description: >- - (Optional) The number of examples to evaluate. If not provided, all examples - in the dataset will be evaluated - additionalProperties: false - required: - - eval_candidate - - scoring_params - title: BenchmarkConfig - description: >- - A benchmark configuration for evaluation. 
ContextEntityRecallScoringFnParams: type: object properties: @@ -4593,15 +4619,6 @@ components: required: - type title: EqualityScoringFnParams - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' FactualityScoringFnParams: type: object properties: @@ -4662,31 +4679,6 @@ components: - type - judge_model title: LLMAsJudgeScoringFnParams - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - description: The model ID to evaluate. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model. - system_message: - $ref: '#/components/schemas/SystemMessage' - description: >- - (Optional) The system message providing instructions or context to the - model. - additionalProperties: false - required: - - type - - model - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. RegexParserMathScoringFnParams: type: object properties: @@ -4791,7 +4783,7 @@ components: EvaluateRowsRequest: type: object properties: - input_rows: + dataset_rows: type: array items: type: object @@ -4807,17 +4799,17 @@ components: scoring_functions: type: array items: - type: string + $ref: '#/components/schemas/ScoringFnParams' description: >- The scoring functions to use for the evaluation. - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. + candidate: + $ref: '#/components/schemas/EvalCandidate' + description: The candidate to evaluate on. 
additionalProperties: false required: - - input_rows + - dataset_rows - scoring_functions - - benchmark_config + - candidate title: EvaluateRowsRequest EvaluateResponse: type: object @@ -5475,21 +5467,20 @@ components: - checkpoints title: PostTrainingJobArtifactsResponse description: Artifacts of a finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled - title: JobStatus PostTrainingJobStatusResponse: type: object properties: job_uuid: type: string status: - $ref: '#/components/schemas/JobStatus' + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + title: JobStatus scheduled_at: type: string format: date-time @@ -6660,25 +6651,6 @@ components: required: - tool_responses title: ResumeAgentTurnRequest - RunEvalRequest: - type: object - properties: - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - benchmark_config - title: RunEvalRequest - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id - title: Job RunShieldRequest: type: object properties: @@ -6732,7 +6704,67 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest - ScoreRequest: + ScoreDatasetRequest: + type: object + properties: + dataset_id: + type: string + scoring_functions: + type: array + items: + $ref: '#/components/schemas/ScoringFnParams' + additionalProperties: false + required: + - dataset_id + - scoring_functions + title: ScoreDatasetRequest + ScoringJob: + type: object + properties: + id: + type: string + description: The ID of the job. + status: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + description: The status of the job. + created_at: + type: string + format: date-time + description: The time the job was created. 
+ finished_at: + type: string + format: date-time + description: The time the job finished. + error: + type: string + description: >- + If status of the job is failed, this will contain the error message. + type: + type: string + const: scoring + default: scoring + result_files: + type: array + items: + type: string + additionalProperties: false + required: + - id + - status + - created_at + - type + - result_files + title: ScoringJob + description: >- + The ScoringJob object representing a scoring job that was created through + API. + ScoreRowsRequest: type: object properties: input_rows: @@ -6749,18 +6781,16 @@ components: - type: object description: The rows to score. scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' + type: array + items: + $ref: '#/components/schemas/ScoringFnParams' description: >- The scoring functions to use for the scoring. additionalProperties: false required: - input_rows - scoring_functions - title: ScoreRequest + title: ScoreRowsRequest ScoreResponse: type: object properties: @@ -6775,38 +6805,6 @@ components: - results title: ScoreResponse description: The response from scoring. - ScoreBatchRequest: - type: object - properties: - dataset_id: - type: string - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - save_results_dataset: - type: boolean - additionalProperties: false - required: - - dataset_id - - scoring_functions - - save_results_dataset - title: ScoreBatchRequest - ScoreBatchResponse: - type: object - properties: - dataset_id: - type: string - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - results - title: ScoreBatchResponse AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig'