scoring job

2026-01-05 05:22:16 +00:00 · 2025-03-12 01:16:37 -07:00 · 2025-03-12 01:16:37 -07:00 · 83d8777f56
commit 83d8777f56
parent f88755eb93
2 changed files with 729 additions and 762 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -230,6 +230,108 @@
                }
            }
        },
        "/v1/eval/benchmark/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "EvalJob object indicating its status",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "oneOf": [
                                        {
                                            "$ref": "#/components/schemas/EvalJob"
                                        },
                                        {
                                            "type": "null"
                                        }
                                    ]
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "Get the EvalJob object for a given job id and benchmark id.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "description": "The ID of the job to get the status of.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            },
            "delete": {
                "responses": {
                    "200": {
                        "description": "OK"
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "Cancel a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "description": "The ID of the job to cancel.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/post-training/job/cancel": {
            "post": {
                "responses": {
@ -968,7 +1070,60 @@
                }
            }
        },
-        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmark/{benchmark_id}/jobs": {
            "post": {
                "responses": {
                    "200": {
                        "description": "The job that was created to run the evaluation.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvalJob"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "Run an evaluation on a benchmark.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/EvaluateBenchmarkRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/eval/rows": {
            "post": {
                "responses": {
                    "200": {
@ -997,18 +1152,8 @@
                "tags": [
                    "Eval"
                ],
-                "description": "Evaluate a list of rows on a benchmark.",
+                "description": "Evaluate a list of rows on a candidate.",
-                "parameters": [
+                "parameters": [],
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "requestBody": {
                    "content": {
                        "application/json": {
@ -2194,160 +2339,6 @@
                }
            }
        },
        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "The status of the evaluationjob.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "oneOf": [
                                        {
                                            "$ref": "#/components/schemas/JobStatus"
                                        },
                                        {
                                            "type": "null"
                                        }
                                    ]
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "Get the status of a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "description": "The ID of the job to get the status of.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            },
            "delete": {
                "responses": {
                    "200": {
                        "description": "OK"
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "Cancel a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "description": "The ID of the job to cancel.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
            "get": {
                "responses": {
                    "200": {
                        "description": "The result of the job.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvaluateResponse"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "Get the result of a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "description": "The ID of the job to get the result of.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/agents/{agent_id}/sessions": {
            "get": {
                "responses": {
@ -3430,59 +3421,6 @@
                }
            }
        },
        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
            "post": {
                "responses": {
                    "200": {
                        "description": "The job that was created to run the evaluation.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/Job"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "Run an evaluation on a benchmark.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/RunEvalRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/safety/run-shield": {
            "post": {
                "responses": {
@ -3562,7 +3500,50 @@
                }
            }
        },
-        "/v1/scoring/score": {
+        "/v1/scoring/jobs": {
            "post": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ScoringJob"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/ScoreDatasetRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/scoring/rows": {
            "post": {
                "responses": {
                    "200": {
@ -3597,50 +3578,7 @@
                    "content": {
                        "application/json": {
                            "schema": {
-                                "$ref": "#/components/schemas/ScoreRequest"
+                                "$ref": "#/components/schemas/ScoreRowsRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/scoring/score-batch": {
            "post": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ScoreBatchResponse"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/ScoreBatchRequest"
                            }
                        }
                    },
@ -6347,6 +6285,122 @@
                "title": "AgentCandidate",
                "description": "An agent candidate for evaluation."
            },
            "EvalCandidate": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/ModelCandidate"
                    },
                    {
                        "$ref": "#/components/schemas/AgentCandidate"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "model": "#/components/schemas/ModelCandidate",
                        "agent": "#/components/schemas/AgentCandidate"
                    }
                }
            },
            "ModelCandidate": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "model",
                        "default": "model"
                    },
                    "model": {
                        "type": "string",
                        "description": "The model ID to evaluate."
                    },
                    "sampling_params": {
                        "$ref": "#/components/schemas/SamplingParams",
                        "description": "The sampling parameters for the model."
                    },
                    "system_message": {
                        "$ref": "#/components/schemas/SystemMessage",
                        "description": "(Optional) The system message providing instructions or context to the model."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "model",
                    "sampling_params"
                ],
                "title": "ModelCandidate",
                "description": "A model candidate for evaluation."
            },
            "EvaluateBenchmarkRequest": {
                "type": "object",
                "properties": {
                    "candidate": {
                        "$ref": "#/components/schemas/EvalCandidate",
                        "description": "Candidate to evaluate on. - { \"type\": \"model\", \"model\": \"Llama-3.1-8B-Instruct\", \"sampling_params\": {...}, \"system_message\": \"You are a helpful assistant.\", } - { \"type\": \"agent\", \"config\": {...}, }"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "candidate"
                ],
                "title": "EvaluateBenchmarkRequest"
            },
            "EvalJob": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string",
                        "description": "The ID of the job."
                    },
                    "status": {
                        "type": "string",
                        "enum": [
                            "completed",
                            "in_progress",
                            "failed",
                            "scheduled",
                            "cancelled"
                        ],
                        "description": "The status of the job."
                    },
                    "created_at": {
                        "type": "string",
                        "format": "date-time",
                        "description": "The time the job was created."
                    },
                    "finished_at": {
                        "type": "string",
                        "format": "date-time",
                        "description": "The time the job finished."
                    },
                    "error": {
                        "type": "string",
                        "description": "If status of the job is failed, this will contain the error message."
                    },
                    "type": {
                        "type": "string",
                        "const": "eval",
                        "default": "eval"
                    },
                    "result_files": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "id",
                    "status",
                    "created_at",
                    "type",
                    "result_files"
                ],
                "title": "EvalJob",
                "description": "The EvalJob object representing an evaluation job that was created through the API."
            },
            "AggregationFunctionType": {
                "type": "string",
                "enum": [
@ -6424,33 +6478,6 @@
                ],
                "title": "AnswerSimilarityScoringFnParams"
            },
            "BenchmarkConfig": {
                "type": "object",
                "properties": {
                    "eval_candidate": {
                        "$ref": "#/components/schemas/EvalCandidate",
                        "description": "The candidate to evaluate."
                    },
                    "scoring_params": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        },
                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
                    },
                    "num_examples": {
                        "type": "integer",
                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "eval_candidate",
                    "scoring_params"
                ],
                "title": "BenchmarkConfig",
                "description": "A benchmark configuration for evaluation."
            },
            "ContextEntityRecallScoringFnParams": {
                "type": "object",
                "properties": {
@ -6561,23 +6588,6 @@
                ],
                "title": "EqualityScoringFnParams"
            },
            "EvalCandidate": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/ModelCandidate"
                    },
                    {
                        "$ref": "#/components/schemas/AgentCandidate"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "model": "#/components/schemas/ModelCandidate",
                        "agent": "#/components/schemas/AgentCandidate"
                    }
                }
            },
            "FactualityScoringFnParams": {
                "type": "object",
                "properties": {
@ -6656,36 +6666,6 @@
                ],
                "title": "LLMAsJudgeScoringFnParams"
            },
            "ModelCandidate": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "model",
                        "default": "model"
                    },
                    "model": {
                        "type": "string",
                        "description": "The model ID to evaluate."
                    },
                    "sampling_params": {
                        "$ref": "#/components/schemas/SamplingParams",
                        "description": "The sampling parameters for the model."
                    },
                    "system_message": {
                        "$ref": "#/components/schemas/SystemMessage",
                        "description": "(Optional) The system message providing instructions or context to the model."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "model",
                    "sampling_params"
                ],
                "title": "ModelCandidate",
                "description": "A model candidate for evaluation."
            },
            "RegexParserMathScoringFnParams": {
                "type": "object",
                "properties": {
@ -6836,7 +6816,7 @@
            "EvaluateRowsRequest": {
                "type": "object",
                "properties": {
-                    "input_rows": {
+                    "dataset_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
@ -6868,20 +6848,20 @@
                    "scoring_functions": {
                        "type": "array",
                        "items": {
-                            "type": "string"
+                            "$ref": "#/components/schemas/ScoringFnParams"
                        },
                        "description": "The scoring functions to use for the evaluation."
                    },
-                    "benchmark_config": {
+                    "candidate": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "$ref": "#/components/schemas/EvalCandidate",
-                        "description": "The configuration for the benchmark."
+                        "description": "The candidate to evaluate on."
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "input_rows",
+                    "dataset_rows",
                    "scoring_functions",
-                    "benchmark_config"
+                    "candidate"
                ],
                "title": "EvaluateRowsRequest"
            },
@ -7941,16 +7921,6 @@
                "title": "PostTrainingJobArtifactsResponse",
                "description": "Artifacts of a finetuning job."
            },
            "JobStatus": {
                "type": "string",
                "enum": [
                    "completed",
                    "in_progress",
                    "failed",
                    "scheduled"
                ],
                "title": "JobStatus"
            },
            "PostTrainingJobStatusResponse": {
                "type": "object",
                "properties": {
@ -7958,7 +7928,15 @@
                        "type": "string"
                    },
                    "status": {
-                        "$ref": "#/components/schemas/JobStatus"
+                        "type": "string",
                        "enum": [
                            "completed",
                            "in_progress",
                            "failed",
                            "scheduled",
                            "cancelled"
                        ],
                        "title": "JobStatus"
                    },
                    "scheduled_at": {
                        "type": "string",
@ -9796,33 +9774,6 @@
                ],
                "title": "ResumeAgentTurnRequest"
            },
            "RunEvalRequest": {
                "type": "object",
                "properties": {
                    "benchmark_config": {
                        "$ref": "#/components/schemas/BenchmarkConfig",
                        "description": "The configuration for the benchmark."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "benchmark_config"
                ],
                "title": "RunEvalRequest"
            },
            "Job": {
                "type": "object",
                "properties": {
                    "job_id": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "job_id"
                ],
                "title": "Job"
            },
            "RunShieldRequest": {
                "type": "object",
                "properties": {
@ -9909,7 +9860,82 @@
                ],
                "title": "SaveSpansToDatasetRequest"
            },
-            "ScoreRequest": {
+            "ScoreDatasetRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "scoring_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "scoring_functions"
                ],
                "title": "ScoreDatasetRequest"
            },
            "ScoringJob": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string",
                        "description": "The ID of the job."
                    },
                    "status": {
                        "type": "string",
                        "enum": [
                            "completed",
                            "in_progress",
                            "failed",
                            "scheduled",
                            "cancelled"
                        ],
                        "description": "The status of the job."
                    },
                    "created_at": {
                        "type": "string",
                        "format": "date-time",
                        "description": "The time the job was created."
                    },
                    "finished_at": {
                        "type": "string",
                        "format": "date-time",
                        "description": "The time the job finished."
                    },
                    "error": {
                        "type": "string",
                        "description": "If status of the job is failed, this will contain the error message."
                    },
                    "type": {
                        "type": "string",
                        "const": "scoring",
                        "default": "scoring"
                    },
                    "result_files": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "id",
                    "status",
                    "created_at",
                    "type",
                    "result_files"
                ],
                "title": "ScoringJob",
                "description": "The ScoringJob object representing a scoring job that was created through API."
            },
            "ScoreRowsRequest": {
                "type": "object",
                "properties": {
                    "input_rows": {
@ -9942,17 +9968,10 @@
                        "description": "The rows to score."
                    },
                    "scoring_functions": {
-                        "type": "object",
+                        "type": "array",
-                        "additionalProperties": {
+                        "items": {
                            "oneOf": [
                                {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        },
                                {
                                    "type": "null"
                                }
                            ]
                        },
                        "description": "The scoring functions to use for the scoring."
                    }
                },
@ -9961,7 +9980,7 @@
                    "input_rows",
                    "scoring_functions"
                ],
-                "title": "ScoreRequest"
+                "title": "ScoreRowsRequest"
            },
            "ScoreResponse": {
                "type": "object",
@ -9981,56 +10000,6 @@
                "title": "ScoreResponse",
                "description": "The response from scoring."
            },
            "ScoreBatchRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "scoring_functions": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "$ref": "#/components/schemas/ScoringFnParams"
                                },
                                {
                                    "type": "null"
                                }
                            ]
                        }
                    },
                    "save_results_dataset": {
                        "type": "boolean"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "scoring_functions",
                    "save_results_dataset"
                ],
                "title": "ScoreBatchRequest"
            },
            "ScoreBatchResponse": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
                "title": "ScoreBatchResponse"
            },
            "AlgorithmConfig": {
                "oneOf": [
                    {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -142,6 +142,76 @@ paths:
            schema:
              $ref: '#/components/schemas/BatchCompletionRequest'
        required: true
  /v1/eval/benchmark/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
          description: EvalJob object indicating its status
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/EvalJob'
                  - type: 'null'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: >-
        Get the EvalJob object for a given job id and benchmark id.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to get the status of.
          required: true
          schema:
            type: string
    delete:
      responses:
        '200':
          description: OK
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: Cancel a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to cancel.
          required: true
          schema:
            type: string
  /v1/post-training/job/cancel:
    post:
      responses:
@ -666,7 +736,44 @@ paths:
            schema:
              $ref: '#/components/schemas/EmbeddingsRequest'
        required: true
-  /v1/eval/benchmarks/{benchmark_id}/evaluations:
+  /v1/eval/benchmark/{benchmark_id}/jobs:
    post:
      responses:
        '200':
          description: >-
            The job that was created to run the evaluation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvalJob'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Run an evaluation on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EvaluateBenchmarkRequest'
        required: true
  /v1/eval/rows:
    post:
      responses:
        '200':
@ -688,15 +795,8 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: Evaluate a list of rows on a benchmark.
+      description: Evaluate a list of rows on a candidate.
-      parameters:
+      parameters: []
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
      requestBody:
        content:
          application/json:
@ -1473,111 +1573,6 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
          description: The status of the evaluation job.
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/JobStatus'
                  - type: 'null'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Get the status of a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to get the status of.
          required: true
          schema:
            type: string
    delete:
      responses:
        '200':
          description: OK
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Cancel a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to cancel.
          required: true
          schema:
            type: string
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
    get:
      responses:
        '200':
          description: The result of the job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluateResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Get the result of a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to get the result of.
          required: true
          schema:
            type: string
  /v1/agents/{agent_id}/sessions:
    get:
      responses:
@ -2327,43 +2322,6 @@ paths:
            schema:
              $ref: '#/components/schemas/ResumeAgentTurnRequest'
        required: true
  /v1/eval/benchmarks/{benchmark_id}/jobs:
    post:
      responses:
        '200':
          description: >-
            The job that was created to run the evaluation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Job'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Run an evaluation on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RunEvalRequest'
        required: true
  /v1/safety/run-shield:
    post:
      responses:
@ -2418,7 +2376,36 @@ paths:
            schema:
              $ref: '#/components/schemas/SaveSpansToDatasetRequest'
        required: true
-  /v1/scoring/score:
+  /v1/scoring/jobs:
    post:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ScoringJob'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: ''
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ScoreDatasetRequest'
        required: true
  /v1/scoring/rows:
    post:
      responses:
        '200':
@ -2446,36 +2433,7 @@ paths:
        content:
          application/json:
            schema:
-              $ref: '#/components/schemas/ScoreRequest'
+              $ref: '#/components/schemas/ScoreRowsRequest'
        required: true
  /v1/scoring/score-batch:
    post:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ScoreBatchResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: ''
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ScoreBatchRequest'
        required: true
  /v1/post-training/supervised-fine-tune:
    post:
@ -4415,6 +4373,99 @@ components:
        - config
      title: AgentCandidate
      description: An agent candidate for evaluation.
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
        - $ref: '#/components/schemas/AgentCandidate'
      discriminator:
        propertyName: type
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
    ModelCandidate:
      type: object
      properties:
        type:
          type: string
          const: model
          default: model
        model:
          type: string
          description: The model ID to evaluate.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
          description: The sampling parameters for the model.
        system_message:
          $ref: '#/components/schemas/SystemMessage'
          description: >-
            (Optional) The system message providing instructions or context to the
            model.
      additionalProperties: false
      required:
        - type
        - model
        - sampling_params
      title: ModelCandidate
      description: A model candidate for evaluation.
    EvaluateBenchmarkRequest:
      type: object
      properties:
        candidate:
          $ref: '#/components/schemas/EvalCandidate'
          description: >-
            Candidate to evaluate on. - { "type": "model", "model": "Llama-3.1-8B-Instruct",
            "sampling_params": {...}, "system_message": "You are a helpful assistant.",
            } - { "type": "agent", "config": {...}, }
      additionalProperties: false
      required:
        - candidate
      title: EvaluateBenchmarkRequest
    EvalJob:
      type: object
      properties:
        id:
          type: string
          description: The ID of the job.
        status:
          type: string
          enum:
            - completed
            - in_progress
            - failed
            - scheduled
            - cancelled
          description: The status of the job.
        created_at:
          type: string
          format: date-time
          description: The time the job was created.
        finished_at:
          type: string
          format: date-time
          description: The time the job finished.
        error:
          type: string
          description: >-
            If status of the job is failed, this will contain the error message.
        type:
          type: string
          const: eval
          default: eval
        result_files:
          type: array
          items:
            type: string
      additionalProperties: false
      required:
        - id
        - status
        - created_at
        - type
        - result_files
      title: EvalJob
      description: >-
        The EvalJob object representing an evaluation job that was created through
        API.
    AggregationFunctionType:
      type: string
      enum:
@ -4478,31 +4529,6 @@ components:
      required:
        - type
      title: AnswerSimilarityScoringFnParams
    BenchmarkConfig:
      type: object
      properties:
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
          description: The candidate to evaluate.
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
          description: >-
            Map between scoring function id and parameters for each scoring function
            you want to run
        num_examples:
          type: integer
          description: >-
            (Optional) The number of examples to evaluate. If not provided, all examples
            in the dataset will be evaluated
      additionalProperties: false
      required:
        - eval_candidate
        - scoring_params
      title: BenchmarkConfig
      description: >-
        A benchmark configuration for evaluation.
    ContextEntityRecallScoringFnParams:
      type: object
      properties:
@ -4593,15 +4619,6 @@ components:
      required:
        - type
      title: EqualityScoringFnParams
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
        - $ref: '#/components/schemas/AgentCandidate'
      discriminator:
        propertyName: type
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
    FactualityScoringFnParams:
      type: object
      properties:
@ -4662,31 +4679,6 @@ components:
        - type
        - judge_model
      title: LLMAsJudgeScoringFnParams
    ModelCandidate:
      type: object
      properties:
        type:
          type: string
          const: model
          default: model
        model:
          type: string
          description: The model ID to evaluate.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
          description: The sampling parameters for the model.
        system_message:
          $ref: '#/components/schemas/SystemMessage'
          description: >-
            (Optional) The system message providing instructions or context to the
            model.
      additionalProperties: false
      required:
        - type
        - model
        - sampling_params
      title: ModelCandidate
      description: A model candidate for evaluation.
    RegexParserMathScoringFnParams:
      type: object
      properties:
@ -4791,7 +4783,7 @@ components:
    EvaluateRowsRequest:
      type: object
      properties:
-        input_rows:
+        dataset_rows:
          type: array
          items:
            type: object
@ -4807,17 +4799,17 @@ components:
        scoring_functions:
          type: array
          items:
-            type: string
+            $ref: '#/components/schemas/ScoringFnParams'
          description: >-
            The scoring functions to use for the evaluation.
-        benchmark_config:
+        candidate:
-          $ref: '#/components/schemas/BenchmarkConfig'
+          $ref: '#/components/schemas/EvalCandidate'
-          description: The configuration for the benchmark.
+          description: The candidate to evaluate on.
      additionalProperties: false
      required:
-        - input_rows
+        - dataset_rows
        - scoring_functions
-        - benchmark_config
+        - candidate
      title: EvaluateRowsRequest
    EvaluateResponse:
      type: object
@ -5475,21 +5467,20 @@ components:
        - checkpoints
      title: PostTrainingJobArtifactsResponse
      description: Artifacts of a finetuning job.
    JobStatus:
      type: string
      enum:
        - completed
        - in_progress
        - failed
        - scheduled
      title: JobStatus
    PostTrainingJobStatusResponse:
      type: object
      properties:
        job_uuid:
          type: string
        status:
-          $ref: '#/components/schemas/JobStatus'
+          type: string
          enum:
            - completed
            - in_progress
            - failed
            - scheduled
            - cancelled
          title: JobStatus
        scheduled_at:
          type: string
          format: date-time
@ -6660,25 +6651,6 @@ components:
      required:
        - tool_responses
      title: ResumeAgentTurnRequest
    RunEvalRequest:
      type: object
      properties:
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - benchmark_config
      title: RunEvalRequest
    Job:
      type: object
      properties:
        job_id:
          type: string
      additionalProperties: false
      required:
        - job_id
      title: Job
    RunShieldRequest:
      type: object
      properties:
@ -6732,7 +6704,67 @@ components:
        - attributes_to_save
        - dataset_id
      title: SaveSpansToDatasetRequest
-    ScoreRequest:
+    ScoreDatasetRequest:
      type: object
      properties:
        dataset_id:
          type: string
        scoring_functions:
          type: array
          items:
            $ref: '#/components/schemas/ScoringFnParams'
      additionalProperties: false
      required:
        - dataset_id
        - scoring_functions
      title: ScoreDatasetRequest
    ScoringJob:
      type: object
      properties:
        id:
          type: string
          description: The ID of the job.
        status:
          type: string
          enum:
            - completed
            - in_progress
            - failed
            - scheduled
            - cancelled
          description: The status of the job.
        created_at:
          type: string
          format: date-time
          description: The time the job was created.
        finished_at:
          type: string
          format: date-time
          description: The time the job finished.
        error:
          type: string
          description: >-
            If status of the job is failed, this will contain the error message.
        type:
          type: string
          const: scoring
          default: scoring
        result_files:
          type: array
          items:
            type: string
      additionalProperties: false
      required:
        - id
        - status
        - created_at
        - type
        - result_files
      title: ScoringJob
      description: >-
        The ScoringJob object representing a scoring job that was created through
        API.
    ScoreRowsRequest:
      type: object
      properties:
        input_rows:
@ -6749,18 +6781,16 @@ components:
                - type: object
          description: The rows to score.
        scoring_functions:
-          type: object
+          type: array
-          additionalProperties:
+          items:
-            oneOf:
+            $ref: '#/components/schemas/ScoringFnParams'
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
          description: >-
            The scoring functions to use for the scoring.
      additionalProperties: false
      required:
        - input_rows
        - scoring_functions
-      title: ScoreRequest
+      title: ScoreRowsRequest
    ScoreResponse:
      type: object
      properties:
@ -6775,38 +6805,6 @@ components:
        - results
      title: ScoreResponse
      description: The response from scoring.
    ScoreBatchRequest:
      type: object
      properties:
        dataset_id:
          type: string
        scoring_functions:
          type: object
          additionalProperties:
            oneOf:
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
        save_results_dataset:
          type: boolean
      additionalProperties: false
      required:
        - dataset_id
        - scoring_functions
        - save_results_dataset
      title: ScoreBatchRequest
    ScoreBatchResponse:
      type: object
      properties:
        dataset_id:
          type: string
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
      additionalProperties: false
      required:
        - results
      title: ScoreBatchResponse
    AlgorithmConfig:
      oneOf:
        - $ref: '#/components/schemas/LoraFinetuningConfig'